def diarization_experiment(model_args, training_args, inference_args, isLoaded=True):
    """Experiment pipeline.

    Load data --> train model --> test model --> output result

    Args:
        model_args: model configurations
        training_args: training configurations
        inference_args: inference configurations
        isLoaded: when True, skip training and restore the saved checkpoint
            before testing; when False, train and save (no testing is run).
    """
    # Load the GhostVLAD embedding archive.
    archive = np.load('./ghostvlad/training_data_100.npz', allow_pickle=True)
    raw_sequences = archive['train_sequence']
    raw_cluster_ids = archive['train_cluster_id']

    # Cast to float (with a tiny offset) and stringify the cluster labels.
    sequences = [s.astype(float) + 1e-5 for s in raw_sequences]
    cluster_ids = [np.array(c).astype(str) for c in raw_cluster_ids]

    # Hold out the second-to-last sequence for evaluation.
    eval_sequences = sequences[-2:-1]
    eval_cluster_ids = [c.tolist() for c in cluster_ids[-2:-1]]

    model = uisrnn.UISRNN(model_args)
    predictions = []
    record = []

    if not isLoaded:
        # training
        model.fit(sequences, cluster_ids, training_args)
        model.save(SAVED_MODEL_NAME)
    else:
        # testing
        # we can also skip training by calling:
        model.load(SAVED_MODEL_NAME)
        for eval_seq, eval_cid in zip(eval_sequences, eval_cluster_ids):
            prediction = model.predict(eval_seq, inference_args)
            predictions.append(prediction)
            accuracy = uisrnn.compute_sequence_match_accuracy(
                eval_cid, prediction)
            record.append((accuracy, len(eval_cid)))
            print('Ground truth labels:')
            print(eval_cid)
            print('Predicted labels:')
            print(prediction)
            print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args, record)
    print('Finished diarization experiment')
    print(output_string)
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

    Load data --> train model --> test model --> output result

    Args:
        model_args: model configurations
        training_args: training configurations
        inference_args: inference configurations
    """
    # Data loading.
    train_npz = np.load('./data/toy_training_data.npz', allow_pickle=True)
    test_npz = np.load('./data/toy_testing_data.npz', allow_pickle=True)
    train_sequence = train_npz['train_sequence']
    train_cluster_id = train_npz['train_cluster_id']
    eval_sequences = test_npz['test_sequences'].tolist()
    eval_cluster_ids = test_npz['test_cluster_ids'].tolist()

    model = uisrnn.UISRNN(model_args)

    # Training.
    # If a model was saved previously, training can be skipped by calling:
    # model.load(SAVED_MODEL_NAME)
    model.fit(train_sequence, train_cluster_id, training_args)
    model.save(SAVED_MODEL_NAME)

    # Testing.
    # You can also try uisrnn.parallel_predict to speed up with GPU.
    # But that is a beta feature which is not thoroughly tested, so
    # proceed with caution.
    all_predictions = []
    record = []
    for idx, eval_seq in enumerate(eval_sequences):
        gold = eval_cluster_ids[idx]
        prediction = model.predict(eval_seq, inference_args)
        all_predictions.append(prediction)
        accuracy = uisrnn.compute_sequence_match_accuracy(gold, prediction)
        record.append((accuracy, len(gold)))
        print('Ground truth labels:')
        print(gold)
        print('Predicted labels:')
        print(prediction)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args, record)
    print('Finished diarization experiment')
    print(output_string)
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

    Load data --> train model --> test model --> output result

    Args:
        model_args: model configurations
        training_args: training configurations
        inference_args: inference configurations
    """
    predicted_cluster_ids = []
    test_record = []

    # allow_pickle=True is required here: the .npz archives hold object
    # arrays, and NumPy >= 1.16.3 raises ValueError when loading those
    # without it (the sibling implementation in this file already passes it).
    train_data = np.load('./data/toy_training_data.npz', allow_pickle=True)
    test_data = np.load('./data/toy_testing_data.npz', allow_pickle=True)
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    test_sequences = test_data['test_sequences'].tolist()
    test_cluster_ids = test_data['test_cluster_ids'].tolist()

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)
    model.save(SAVED_MODEL_NAME)
    # we can also skip training by calling:
    # model.load(SAVED_MODEL_NAME)

    # testing: score each test sequence against its ground-truth labels
    for (test_sequence, test_cluster_id) in zip(test_sequences,
                                                test_cluster_ids):
        predicted_cluster_id = model.predict(test_sequence, inference_args)
        predicted_cluster_ids.append(predicted_cluster_id)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_cluster_id)
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_cluster_id)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)
    print('Finished diarization experiment')
    print(output_string)
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

    Load data --> train model (logging losses to TensorBoard) --> test model
    in parallel worker processes --> output result.

    Args:
        model_args: model configurations
        training_args: training configurations (``.epochs`` is read here)
        inference_args: inference configurations
    """
    # data loading
    train_data = np.load('./data/toy_training_data.npz', allow_pickle=True)
    test_data = np.load('./data/toy_testing_data.npz', allow_pickle=True)
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    test_sequences = test_data['test_sequences'].tolist()
    test_cluster_ids = test_data['test_cluster_ids'].tolist()

    # model init
    model = uisrnn.UISRNN(model_args)
    # model.load(SAVED_MODEL_NAME)  # to load a checkpoint

    # tensorboard writer init
    writer = SummaryWriter()
    try:
        # training
        for epoch in range(training_args.epochs):
            stats = model.fit(train_sequence, train_cluster_id, training_args)
            # add per-iteration losses to tensorboard
            for loss, cur_iter in stats:
                for loss_name, loss_value in loss.items():
                    writer.add_scalar('loss/' + loss_name, loss_value,
                                      cur_iter)

        # save the model
        model.save(SAVED_MODEL_NAME)

        # testing
        predicted_cluster_ids = []
        test_record = []

        # predict sequences in parallel; weights must be in shared memory
        # so worker processes can read them
        model.rnn_model.share_memory()
        pool = mp.Pool(NUM_WORKERS, maxtasksperchild=None)
        try:
            pred_gen = pool.imap(
                func=partial(model.predict, args=inference_args),
                iterable=test_sequences)
            # collect and score predictions
            for idx, predicted_cluster_id in enumerate(pred_gen):
                accuracy = uisrnn.compute_sequence_match_accuracy(
                    test_cluster_ids[idx], predicted_cluster_id)
                predicted_cluster_ids.append(predicted_cluster_id)
                test_record.append((accuracy, len(test_cluster_ids[idx])))
                print('Ground truth labels:')
                print(test_cluster_ids[idx])
                print('Predicted labels:')
                print(predicted_cluster_id)
                print('-' * 80)
        finally:
            # close the pool AND join it; the original only called close(),
            # which leaks worker processes if the parent exits first
            pool.close()
            pool.join()
    finally:
        # close tensorboard writer even if training/testing raises
        writer.close()

    print('Finished diarization experiment')
    print(uisrnn.output_result(model_args, training_args, test_record))
def run_experiment(train_sequence, train_cluster_id, test_sequence,
                   test_cluster_id, model_args, training_args, inference_args,
                   exp_name):
    """Train (or reload) a UIS-RNN model and evaluate it on a test set.

    Args:
        train_sequence: training observation sequence(s) passed to model.fit.
        train_cluster_id: training cluster labels passed to model.fit.
        test_sequence: iterable of test observation sequences.
        test_cluster_id: iterable of test label sequences, parallel to
            test_sequence.
        model_args: model configurations.
        training_args: training configurations; `.out_dir`, `.overwrite`
            and `.debug` are read here.
        inference_args: inference configurations.
        exp_name: experiment name; used for the checkpoint path and the
            pickled test-record filename.

    Returns:
        The mean accuracy over all test sequences (numpy float).
    """
    start = datetime.now()
    if training_args.debug:
        print('\n\n===== DEBUG MODE =====\n\n')

    # Local helper: print only when --debug was requested.
    def debug(m):
        if training_args.debug:
            print(m)

    # Create model class
    model = uisrnn.UISRNN(model_args)
    print('{} - Created {} model with {:,} params:'.format(
        datetime.now() - start, model.__class__.__name__,
        count_parameters(model.rnn_model)))
    print(model.rnn_model)

    # Training: reuse an existing checkpoint unless --overwrite was given.
    model_loc = os.path.join(training_args.out_dir, exp_name)
    model_constructed = (not training_args.overwrite) \
        and os.path.exists(model_loc)
    if model_constructed:
        try:
            model.load(model_loc)
            print('{} - Loaded trained model from {}'.format(
                datetime.now() - start,
                model_loc,
            ))
        except Exception as e:
            # Best-effort load: on any failure fall back to training from
            # scratch rather than aborting the experiment.
            print('Unable to load model from {}:\n{}'.format(model_loc, e))
            model_constructed = False
    if not model_constructed:
        model.fit(train_sequence, train_cluster_id, training_args)
        print('{} - Trained model!'.format(datetime.now() - start))
        model.save(model_loc)
        print('{} - Saved model to {}'.format(datetime.now() - start,
                                              model_loc))

    # Testing (inference only, so gradients are disabled).
    predicted_cluster_ids = []
    test_record = []
    with torch.no_grad():
        for i, (test_seq, test_cluster) in tqdm(enumerate(
                zip(test_sequence, test_cluster_id)),
                total=len(test_cluster_id)):
            debug('Test seq ({}) shape: {}'.format(
                test_seq.__class__.__name__, test_seq.shape))
            debug('Test cluster ({}): {}'.format(
                test_cluster.__class__.__name__, test_cluster))
            predicted_cluster_id = model.predict(test_seq, inference_args)
            debug('Predicted cluster ID: {}, class {}'.format(
                predicted_cluster_id,
                predicted_cluster_id.__class__.__name__))
            predicted_cluster_ids.append(predicted_cluster_id)
            accuracy = uisrnn.compute_sequence_match_accuracy(
                test_cluster.tolist(), predicted_cluster_id)
            # We are getting accuracy per batch
            test_record.append((accuracy, len(test_cluster)))
            debug('Gold labels: {}'.format(list(test_cluster)))
            debug('Pred labels: \n{}'.format(list(predicted_cluster_id)))
            debug('-' * 80)

    # Output: print the summary and persist the (accuracy, length) records.
    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)
    print('Finished diarization experiment')
    print(output_string)
    with open(
            os.path.join(training_args.out_dir,
                         '{}_test.pkl'.format(exp_name)), 'wb') as wf:
        pickle.dump(test_record, wf)
    accuracy_array, _ = zip(*test_record)
    exp_accuracy = np.mean(accuracy_array)
    return exp_accuracy
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

    Load data --> train model --> test model --> output result

    Args:
        model_args: model configurations
        training_args: training configurations
        inference_args: inference configurations
    """
    predicted_cluster_ids = []
    test_record = []

    # train_data = np.load('./data/toy_training_data.npz')
    # test_data = np.load('./data/toy_testing_data.npz')
    # train_sequence = train_data['train_sequence']
    # train_cluster_id = train_data['train_cluster_id']
    # test_sequences = test_data['test_sequences'].tolist()
    # test_cluster_ids = test_data['test_cluster_ids'].tolist()

    # Load pre-extracted sequences/labels from .npy files.
    # NOTE(review): training arrays are loaded but unused while the training
    # section below stays commented out.
    orig_train_sequences = np.load('data/train_sequence.npy').astype(
        np.float64)
    orig_train_cluster_ids = np.array(np.load('data/train_cluster_id.npy'))
    orig_test_sequences = np.load('data/test_sequence.npy').astype(np.float64)
    orig_test_cluster_ids = np.array(np.load('data/test_cluster_id.npy'))
    print(orig_test_sequences.shape)
    print(orig_test_cluster_ids.shape)

    # Keep only the first 1% of the test data (shape[0] // 100 rows).
    orig_test_sequences = orig_test_sequences[:orig_test_sequences.shape[0] //
                                              100]
    orig_test_cluster_ids = orig_test_cluster_ids[:orig_test_cluster_ids.
                                                  shape[0] // 100]
    print(orig_test_sequences.shape)
    print(orig_test_cluster_ids.shape)

    # Split the retained test rows into equal chunks. np.split requires an
    # exact division, so the leftover rows (shape[0] % chunks) are dropped
    # first via test_new_len.
    # NOTE(review): despite its name, `test_chunk_size` is the NUMBER of
    # chunks np.split produces, not the size of each chunk; and if the
    # retained test set has fewer than 86 rows this is 0 and the modulo
    # below raises ZeroDivisionError -- confirm intended data sizes.
    test_chunk_size = orig_test_sequences.shape[0] // 86
    test_left_over = orig_test_sequences.shape[0] % test_chunk_size
    test_new_len = orig_test_sequences.shape[0] - test_left_over
    test_sequences = np.split(orig_test_sequences[:test_new_len],
                              test_chunk_size)
    test_cluster_ids = np.split(orig_test_cluster_ids[:test_new_len],
                                test_chunk_size)

    model = uisrnn.UISRNN(model_args)

    # train_sequences = np.array(train_sequences)
    # train_cluster_ids = np.array(train_cluster_ids)
    # d = vars(training_args)
    # # training
    # for i in range(train_sequences.shape[0]):
    #     train_sequence = train_sequences[i]
    #     train_cluster_id = train_cluster_ids[i]
    #     train_cluster_id = train_cluster_id.tolist()
    #     d['learning_rate'] = 1e-3
    #     model.fit(train_sequence, train_cluster_id, training_args)
    # # Take care of leftovers
    # train_sequence = orig_train_sequences[train_new_len:]
    # train_cluster_id = orig_train_cluster_id[train_new_len:]
    # d['learning_rate'] = 1e-3
    # model.fit(train_sequence, train_cluster_id, training_args)
    # model.save(SAVED_MODEL_NAME)

    # we can also skip training by calling:
    model.load(SAVED_MODEL_NAME)

    # testing
    # Take care of leftover
    # test_sequence = orig_test_sequences[test_new_len:]
    # test_cluster_id = orig_test_cluster_ids[test_new_len:].tolist()
    # predicted_cluster_id = model.predict(test_sequence, inference_args)
    # predicted_cluster_ids.append(predicted_cluster_id)
    # accuracy = uisrnn.compute_sequence_match_accuracy(
    #     test_cluster_id, predicted_cluster_id)
    # test_record.append((accuracy, len(test_cluster_id)))
    # print('Ground truth labels:')
    # print(test_cluster_id)
    # print('Predicted labels:')
    # print(predicted_cluster_id)
    # print('-' * 80)

    # Then the rest: predict and score each test chunk independently.
    for (test_sequence, test_cluster_id) in zip(test_sequences,
                                                test_cluster_ids):
        # print(test_sequence.shape)
        # print(test_cluster_id)
        # assert 1 == 2
        test_cluster_id = test_cluster_id.tolist()
        predicted_cluster_id = model.predict(test_sequence, inference_args)
        predicted_cluster_ids.append(predicted_cluster_id)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_cluster_id)
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_cluster_id)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)
    print('Finished diarization experiment')
    print(output_string)
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

    Load data --> train model --> test model --> output result

    Args:
        model_args: model configurations
        training_args: training configurations
        inference_args: inference configurations
    """
    predicted_labels = []
    test_record = []

    # Pass allow_pickle=True directly instead of monkey-patching the global
    # np.load (the previous workaround for the NumPy >= 1.16.3 ValueError on
    # object arrays); mutating np.load is not thread-safe and leaks the
    # patched function if loading raises.
    train_data = np.load('./ghostvlad/training_data.npz', allow_pickle=True)

    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    train_sequence_list = [
        seq.astype(float) + 0.00001 for seq in train_sequence
    ]
    train_cluster_id_list = [
        np.array(cid).astype(str) for cid in train_cluster_id
    ]

    # Hold out the second-to-last sequence for testing. The previous version
    # referenced test_sequences/test_cluster_ids without ever defining them,
    # so the loop below raised NameError before any prediction ran; this
    # split mirrors the sibling implementation in this file.
    test_sequences = train_sequence_list[-2:-1]
    test_cluster_ids = [cid.tolist() for cid in train_cluster_id_list[-2:-1]]

    model = uisrnn.UISRNN(model_args)

    # training
    # model.fit(train_sequence_list, train_cluster_id_list, training_args)
    # model.save(SAVED_MODEL_NAME)

    # testing
    # we can also skip training by calling:
    model.load(SAVED_MODEL_NAME)
    for (test_sequence, test_cluster_id) in zip(test_sequences,
                                                test_cluster_ids):
        predicted_label = model.predict(test_sequence, inference_args)
        predicted_labels.append(predicted_label)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_label)
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_label)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)
    print('Finished diarization experiment')
    print(output_string)