def __dump_numpy_txt():
    import hashlib

    def dump_to_dir(dataset, out_dir, dataset_name):
        if not os.path.exists(os.path.join(out_dir, dataset_name)):
            os.makedirs(os.path.join(out_dir, dataset_name))
        for i, item in enumerate(dataset.data):
            name = hashlib.sha256('{name}{idx}'.format(name=dataset_name, idx=i).encode()).hexdigest() + '.txt'
            word_dir = os.path.join(out_dir, dataset_name, dataset.idx2word[i])
            if not os.path.exists(word_dir):
                os.makedirs(word_dir)
            np.savetxt(os.path.join(word_dir, name), item)

    start = time.time()
    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')

    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                              supplement_rare_with_noisy=False, supplement_seed=112)
    dump_to_dir(data_train, current_dataset, 'train')

    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
    dump_to_dir(data_dev, current_dataset, 'dev')

    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
    dump_to_dir(data_test, current_dataset, 'test')

    print('dump: {0}'.format(time.time() - start))

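# A minimal, self-contained sketch of the filename scheme dump_to_dir uses above: each
# dumped example gets a deterministic name derived from the dataset name and example
# index, so re-running the dump overwrites files instead of accumulating duplicates.
# The 'demo' dataset name below is made up for illustration.
def _filename_scheme_demo():
    import hashlib
    for idx in range(2):
        name = hashlib.sha256('{name}{idx}'.format(name='demo', idx=idx).encode()).hexdigest() + '.txt'
        print(name)  # 64 hex characters + '.txt', stable across runs
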
def do_calculate_ap(run_dir, epoch, dataset=None, partition='dev'):
    run_name2get_embeddings = {
        'classifier': get_classifier_embeddings,
        'siamese': get_siamese_embeddings
    }
    net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
        load_net(run_dir, epoch=None, logger=None, train=False)
    get_embeddings = run_name2get_embeddings[run_name]

    if dataset is None:
        dataset = current_dataset
    train_path, dev_path, test_path = get_dataset_paths(dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=None)

    if len(checkpoints) == 0:
        print('No checkpoints found in {0} for run {1}'.format(checkpoint_dir, run_dir))
        print('Exiting')
        sys.exit(-1)

    if partition == 'train':
        data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                  mean_subtraction=mean_sub, variance_normalization=var_norm)
        return get_epoch_ap(net, config, checkpoints, loss, data_train, epoch, get_embeddings,
                            subsample_size=3000)
    if partition == 'dev':
        data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                mean_subtraction=mean_sub, variance_normalization=var_norm)
        return get_epoch_ap(net, config, checkpoints, loss, data_dev, epoch, get_embeddings)
    if partition == 'test':
        data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                 mean_subtraction=mean_sub, variance_normalization=var_norm)
        return get_epoch_ap(net, config, checkpoints, loss, data_test, epoch, get_embeddings)

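# Hypothetical usage of do_calculate_ap (the run directory and epoch are made-up
# examples, not paths from this project):
#
#   ap = do_calculate_ap('runs/siamese_example', epoch=25, partition='dev')
#   print('dev AP at epoch 25: {0:.4f}'.format(ap))
#
# load_net is called with epoch=None here; the full checkpoints dict is passed to
# get_epoch_ap together with the requested epoch, which selects the right weights.
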
def split_snodgrass_dataset(source_sub_dir, snodgrass_file, same_split_as=None):
    """~50% train, ~25% dev, ~25% test, while making sure that each patient's data is
    present exclusively in either the train, dev, or test set."""
    snodgrass = os.path.join(processed_data_dir, source_sub_dir, snodgrass_file)
    lines = []
    with open(snodgrass, 'r') as f:
        lines.extend(f.readlines())

    lines = np.array(lines)
    words = np.array([key2word(x) for x in lines])
    patients = np.array([snodgrass_key2patient(x) for x in lines])
    patients_with_counts = [(key, value) for key, value in Counter(patients).items()]

    data_train = []
    data_test = []
    data_dev = []
    words_train = []
    words_test = []
    words_dev = []

    if same_split_as is None:
        # surprise knapsack problem :)
        patients_train = knapsack(patients_with_counts, len(lines) / 2)[1]
        patients_left = remove_all(patients_with_counts, patients_train)
        patients_test = knapsack(patients_left, len(lines) / 4)[1]
        patients_dev = remove_all(patients_left, patients_test)
    else:
        train_path, dev_path, test_path = get_dataset_paths(same_split_as, fmt='scp')
        patients_train = scp2snodgrass_patients(train_path)
        patients_test = scp2snodgrass_patients(test_path)
        patients_dev = scp2snodgrass_patients(dev_path)

    for patient, _ in patients_train:
        data_train.extend(lines[np.where(patients == patient)])
        words_train.extend(words[np.where(patients == patient)])

    for patient, _ in patients_test:
        data_test.extend(lines[np.where(patients == patient)])
        words_test.extend(words[np.where(patients == patient)])

    for patient, _ in patients_dev:
        data_dev.extend(lines[np.where(patients == patient)])
        words_dev.extend(words[np.where(patients == patient)])

    print('Unique words in train dataset: {0}, in test: {1}, in dev: {2}'.format(
        len(Counter(words_train)), len(Counter(words_test)), len(Counter(words_dev))))

    return data_train, data_dev, data_test

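# Why the "surprise knapsack problem" above: to keep each patient's recordings in
# exactly one partition while still hitting ~50/25/25 proportions by example count,
# we have to choose a subset of patients whose counts sum as close as possible to
# the target size. A brute-force illustration with made-up patient counts (the
# project's knapsack() helper solves the same selection problem, just efficiently):
def _patient_split_demo():
    from itertools import combinations
    counts = [('p1', 40), ('p2', 35), ('p3', 15), ('p4', 10)]  # toy (patient, count) pairs
    target = sum(c for _, c in counts) / 2
    best = min((subset for r in range(len(counts) + 1) for subset in combinations(counts, r)),
               key=lambda s: abs(sum(c for _, c in s) - target))
    print(best)  # patients whose data forms the ~50% train partition
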
def __dump_lmdb():
    from base.data_io.dataset2lmdb import dataset2lmdb

    start = time.time()
    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')
    train_path_lmdb, dev_path_lmdb, test_path_lmdb = get_dataset_paths(current_dataset, fmt='lmdb')

    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                              supplement_rare_with_noisy=False, supplement_seed=112)
    dataset2lmdb(data_train, train_path_lmdb)

    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
    dataset2lmdb(data_dev, dev_path_lmdb)

    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
    dataset2lmdb(data_test, test_path_lmdb)

    print('dump to LMDB: {0}'.format(time.time() - start))

def select_independent_words():
    """For the new dataset the words are selected as follows: the emuDB datasets are the
    source of train data, and the SWC dataset is used as the test set (SWC is read speech
    only, which should somewhat resemble patient speech: slower and more deliberate than
    typical spontaneous speech). The words are selected randomly from words vaguely
    matching the train dataset words: same number of characters or one more, also nouns
    (detected by capitalization), and with no fewer test set examples than in the
    original dev set."""
    counts_per_emu_db = get_emu_word_counts()
    counts_emu_total = collapse_nested_dict(counts_per_emu_db)
    counts_swc = get_swc_word_counts()

    train_path, dev_path, _ = get_dataset_paths('all_snodgrass_cleaned_v5', fmt='scp')
    counts_train = get_dataset_word_counts(train_path)
    counts_dev = get_dataset_word_counts(dev_path)

    selected_words = {}
    for word, count in counts_train.items():
        num_letters = len(word)
        emu_similar = {}
        for emu_word, emu_count in counts_emu_total.items():
            if not emu_word.isalpha():  # skipping words with non-alphabetic characters
                continue
            if not emu_word[0].isupper():  # budget noun detector
                continue
            if emu_word in selected_words.values():
                continue
            # factor of 1.5 for the original counts to account for the cleaning removing a lot of examples
            if (len(emu_word) == num_letters or len(emu_word) == num_letters + 1) and emu_count >= count * 1.5:
                emu_similar[emu_word] = emu_count

        # avoid candidates that are prefixes of (or prefixed by) already selected words
        similar_list = [x for x in emu_similar.keys()
                        if not any([y.startswith(x) or x.startswith(y) for y in selected_words.values()])]
        random.shuffle(similar_list)
        selected = similar_list[0]
        if word in counts_dev:
            # factor of 1.5 for the original counts to account for the cleaning removing a lot of examples
            while selected not in counts_swc or len(counts_swc[selected]) < counts_dev[word] * 1.5:
                similar_list = similar_list[1:]
                # check emptiness before indexing, otherwise this raises IndexError instead
                if len(similar_list) == 0:
                    raise RuntimeError('No suitable candidate left for word {0}'.format(word))
                selected = similar_list[0]

        selected_words[word] = selected

    return selected_words

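# A self-contained sketch of the candidate filter used above, with made-up words: keep
# capitalized, purely alphabetic words whose length matches the source word's length or
# exceeds it by one, and which occur often enough to survive the 1.5x cleaning margin.
def _candidate_filter_demo():
    word, count = 'Hund', 20  # toy source word and its train count
    candidates = {'Haus': 40, 'hat': 90, 'Apfel5': 50, 'Katze': 25}
    similar = {w: c for w, c in candidates.items()
               if w.isalpha() and w[0].isupper()
               and len(w) in (len(word), len(word) + 1)
               and c >= count * 1.5}
    print(similar)  # {'Haus': 40} -- 'Katze' fails the 1.5x count margin (25 < 30)
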
def __main():
    start = time.time()
    train_path, dev_path, test_path = get_dataset_paths(current_dataset, fmt='scp')

    data_train = KaldiDataset(train_path, noise_multiplier=1.0, noise_prob=0.5,
                              supplement_rare_with_noisy=False, supplement_seed=112)
    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path, training=False)
    data_test = KaldiDataset(test_path, parent_dataset_path=train_path, training=False)
    _print_patients(data_train, data_dev, data_test)

    # smoke-test the batch generators (the results themselves are discarded)
    test = next(data_train.siamese_triplet_epoch(32, augment_parts=True))
    test = next(data_train.siamese_margin_loss_epoch(50, 5))

    print('scp: {0}'.format(time.time() - start))

def __main_independent_test():
    swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'
    data_swc = KaldiDataset(swc_path)
    print(data_swc.counts)

    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3', fmt='scp')
    data_train = KaldiDataset(train_path)
    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path)
    data_test = KaldiDataset(test_path, parent_dataset_path=train_path)
    print(data_dev.counts)

    swc_keys = set(data_swc.idx2key)
    dev_keys = set(data_dev.idx2key)
    difference = swc_keys.difference(dev_keys)

    left_words = np.array([key2word(x) for x in difference])
    left_counts = Counter(left_words)
    print(left_counts)

def split_independent_words(output_name, data_sub_dir, dataset_comparable_to):
    output_path = os.path.join(processed_data_dir, output_name)
    train_path, dev_path, _ = get_dataset_paths(dataset_comparable_to)
    counts_train = get_dataset_word_counts(train_path)
    counts_dev = get_dataset_word_counts(dev_path)
    selected_words = load_pickled('selected_words.pckl')

    all_scp = glob.glob(os.path.join(processed_data_dir, data_sub_dir, '*independent_test.scp'))
    swc_scp = [x for x in all_scp if os.path.basename(x).startswith('SWC')][0]
    all_scp.remove(swc_scp)

    emu_lines = []  # this will be the train data
    swc_lines = []  # this will be the test data
    for scp in all_scp:
        read_scp_lines(scp, emu_lines)
    read_scp_lines(swc_scp, swc_lines)

    emu_lines = np.array(emu_lines)
    swc_lines = np.array(swc_lines)
    emu_words = np.array([key2word(x) for x in emu_lines])
    swc_words = np.array([key2word(x) for x in swc_lines])
    emu_counts = Counter(emu_words)
    swc_counts = Counter(swc_words)

    # for word in emu_counts:
    #     print('{0:<20}: train {1}, test {2}'.format(word, emu_counts[word], swc_counts.get(word, 0)))

    # for word in counts_train:
    #     new_word = selected_words[word]
    #     print('{0}, train: {1}, dev: {2}'.format(word, counts_train[word], counts_dev.get(word, 0)))
    #     print('{word}: new train count: {0}, new test count: {1}'.format(emu_counts[new_word], swc_counts[new_word],
    #                                                                      word=new_word))

    new_train = []
    new_dev = []
    for word, new_word in selected_words.items():
        train_new_lines = emu_lines[emu_words == new_word]
        np.random.shuffle(train_new_lines)
        new_train.extend(train_new_lines[:counts_train[word]])

        dev_new_lines = swc_lines[swc_words == new_word]
        np.random.shuffle(dev_new_lines)
        # new_dev.extend(dev_new_lines[:counts_dev.get(word, 5)])  # didn't work at all, maybe bad labels?
        new_dev.extend(dev_new_lines[:35])

    train_scp = '{0}_train.scp'.format(output_path)
    dev_scp = '{0}_dev.scp'.format(output_path)
    with open(train_scp, 'w') as train_file:
        for line in new_train:
            train_file.write(line)
    with open(dev_scp, 'w') as dev_file:
        for line in new_dev:
            dev_file.write(line)

    return train_scp, dev_scp, None

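# Hypothetical invocation (the argument values mirror the commented-out call in the
# __main__ block further below; they are examples, not the only valid inputs):
#
#   train_scp, dev_scp, _ = split_independent_words('independent_cleaned_v3',
#                                                   'independent_test_v2',
#                                                   'all_snodgrass_cleaned_v4')
#
# The third return value is None because the test set is composed separately, see
# compose_test_from_non_validation_words below.
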
# and afterwards I switched the dev and test sets, to make sure the test set is the more complete one
def compose_test_from_non_validation_words(swc_path, dev_path, test_path):
    swc_lines = []
    read_scp_lines(swc_path, swc_lines)
    dev_lines = []
    read_scp_lines(dev_path, dev_lines)

    left_lines = np.array([x for x in swc_lines if x not in dev_lines])
    left_words = np.array([key2word(x) for x in left_lines])

    test_lines = []
    for word in np.unique(left_words):
        left_word_lines = left_lines[left_words == word]
        np.random.shuffle(left_word_lines)
        test_lines.extend(left_word_lines[:35])

    with open(test_path, 'w') as test_file:
        for line in test_lines:
            test_file.write(line)


if __name__ == '__main__':
    # train_scp, dev_scp, test_scp = split_independent_words('independent_cleaned_v3', 'independent_test_v2',
    #                                                        'all_snodgrass_cleaned_v4')
    # train_data = KaldiDataset(train_scp)
    # train_data.dump_derived_data()

    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3', fmt='scp')
    swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'
    compose_test_from_non_validation_words(swc_path, dev_path, test_path)

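# Note on the 35-example cap in compose_test_from_non_validation_words: it matches the
# per-word cap used when building the dev split in split_independent_words, presumably
# so that no single word dominates either partition.
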
def __main():
    args, config, logger, checkpoint_dir, log_dir, use_gru, noise_mult, noise_prob, mean_sub, var_norm = \
        setup_training_run('classifier')
    supplement_rare = getattr(config.general_training, 'supplement_rare_with_noisy', False)
    supplement_seed = getattr(config.general_training, 'supplement_seed', 112)

    train_path, dev_path, _ = get_dataset_paths(current_dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=logger)
    data_train = DatasetClass(train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
                              mean_subtraction=mean_sub, variance_normalization=var_norm,
                              supplement_rare_with_noisy=supplement_rare, supplement_seed=supplement_seed)
    data_dev = DatasetClass(dev_path, parent_dataset_path=train_path, training=False, logger=logger,
                            mean_subtraction=mean_sub, variance_normalization=var_norm)

    data_parallel = args.gpu_count > 1
    batch_first = data_parallel
    if not use_gru:
        net = LSTMClassifier(logger, config, batch_first=batch_first)
    else:
        net = GRUClassifier(logger, config, batch_first=batch_first)

    if args.load_weights is not None:
        net.restore_weights(args.load_weights, exclude_params=['fc.6'], freeze_except=None)
    net.train(True)

    if args.gpu_count > 1:
        net = torch.nn.DataParallel(net)
        config.classifier_training.batch_size = config.classifier_training.batch_size * args.gpu_count
    if config.model.use_cuda:
        net = net.cuda()

    optimizer = torch.optim.Adam(net.parameters(), lr=config.classifier_training.learning_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=7,
                                                           min_lr=1e-5, verbose=True)

    # log initial performance level
    dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first,
                                                        data_parallel, train=False)
    logger.info('Initial avg dev loss = {0:.4f}, dev accuracy = {1:.4f}'.format(np.mean(dev_losses), dev_accuracy))

    for epoch in range(config.classifier_training.train_epochs):
        logger.info('Starting epoch {0}, learning_rate = {1}'.format(
            epoch, [group['lr'] for group in optimizer.param_groups][0]))
        start = time.time()

        train_losses, train_accuracy = process_classifier_epoch(net, config, optimizer, data_train, batch_first,
                                                                data_parallel, train=True)
        dev_losses, dev_accuracy = process_classifier_epoch(net, config, optimizer, data_dev, batch_first,
                                                            data_parallel, train=False)

        if config.classifier_training.lr_schedule:
            scheduler.step(dev_accuracy)

        torch.save(net.state_dict(),
                   os.path.join(checkpoint_dir, '{0}_epoch_{1}.ckpt'.format(net.__class__.__name__, epoch)))

        logger.info('Finished epoch {0}, avg training loss = {1:.4f}, avg dev loss = {2:.4f}, '
                    'epoch time = {3:.3f} sec'.format(epoch, np.mean(train_losses), np.mean(dev_losses),
                                                      time.time() - start))
        logger.info('Epoch {0} training accuracy = {1:.4f}, dev accuracy = {2:.4f}'.format(
            epoch, train_accuracy, dev_accuracy))

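# Note on the batch-size scaling above: torch.nn.DataParallel splits each batch across
# the available GPUs, so multiplying the configured batch size by gpu_count keeps the
# per-GPU batch size (and hence per-GPU memory use) constant.
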
def __main(run_dir, dataset=None, for_epochs=None, gen_train=False, gen_dev=True, gen_test=False):
    run_name2get_embeddings = {
        'classifier': get_classifier_embeddings,
        'siamese': get_siamese_embeddings
    }
    net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
        load_net(run_dir, epoch=None, logger=None, train=False)
    get_embeddings = run_name2get_embeddings[run_name]

    if dataset is None:
        dataset = current_dataset
    train_path, dev_path, test_path = get_dataset_paths(dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=None)

    if len(checkpoints) == 0:
        print('No checkpoints found in {0} for run {1}'.format(checkpoint_dir, run_dir))
        print('Exiting')
        sys.exit(-1)

    if for_epochs is None:
        for_epochs = sorted(list(checkpoints.keys()))

    if gen_train:
        data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                  mean_subtraction=mean_sub, variance_normalization=var_norm)
        _print_ap_per_epoch(net, config, checkpoints, loss, data_train, 'train', for_epochs, get_embeddings,
                            subsample_size=3000)
    if gen_dev:
        data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                mean_subtraction=mean_sub, variance_normalization=var_norm)
        _print_ap_per_epoch(net, config, checkpoints, loss, data_dev, 'dev', for_epochs, get_embeddings)
    if gen_test:
        data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                 mean_subtraction=mean_sub, variance_normalization=var_norm)
        _print_ap_per_epoch(net, config, checkpoints, loss, data_test, 'test', for_epochs, get_embeddings)

def __main():
    args, config, logger, checkpoint_dir, log_dir, use_gru, noise_mult, noise_prob, mean_sub, var_norm = \
        setup_training_run('siamese')
    supplement_rare = getattr(config.general_training, 'supplement_rare_with_noisy', False)
    supplement_seed = getattr(config.general_training, 'supplement_seed', 112)

    train_path, dev_path, _ = get_dataset_paths(current_dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=logger)
    data_train = DatasetClass(train_path, logger=logger, noise_multiplier=noise_mult, noise_prob=noise_prob,
                              mean_subtraction=mean_sub, variance_normalization=var_norm,
                              supplement_rare_with_noisy=supplement_rare, supplement_seed=supplement_seed)
    data_dev = DatasetClass(dev_path, parent_dataset_path=train_path, training=False, logger=logger,
                            mean_subtraction=mean_sub, variance_normalization=var_norm)

    loss_fn = create_embedding_loss(config, len(data_train.word2id))
    data_parallel = args.gpu_count > 1
    batch_first = data_parallel
    if not use_gru:
        net = SiameseLSTM(logger, config, batch_first=batch_first, loss=loss_fn)
    else:
        net = SiameseGRU(logger, config, batch_first=batch_first, loss=loss_fn)

    if args.load_weights is not None:
        # exclude_params specifies the layers to drop when applying pre-trained weights
        net.restore_weights(args.load_weights, exclude_params=['fc.2'], freeze_except=None)
        # net.restore_weights(args.load_weights, exclude_params=[], freeze_except=None)
    net.train(True)

    if args.gpu_count > 1:
        net = torch.nn.DataParallel(net)
        config.siamese_training.batch_size = config.siamese_training.batch_size * args.gpu_count
    if config.model.use_cuda:
        net = net.cuda()

    optimizer = create_optimizer(net, config, loss_fn, wrapped=data_parallel)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', factor=0.5, patience=6,
                                                           min_lr=1e-5, verbose=True)

    # other settings
    augment_parts = config.siamese_training.augment_parts if hasattr(
        config.siamese_training, 'augment_parts') else False

    # log initial performance level
    dev_ap = get_ap(net, loss_fn.metric(), optimizer, config, data_dev, batch_first)
    logger.info('Initial avg dev precision = {0:.4f}'.format(dev_ap))

    for epoch in range(config.siamese_training.train_epochs):
        logger.info('Starting epoch {0}, learning_rate = {1}'.format(
            epoch, [group['lr'] for group in optimizer.param_groups]))
        start = time.time()

        epoch_losses = []
        for data, lengths, sample_idx, orig_order in init_batch_generator(config, data_train, loss_fn,
                                                                          augment_parts, batch_first):
            loss, _ = process_siamese_batch(data, lengths, data_train.classes(sample_idx), orig_order, net,
                                            optimizer, config, train=True, data_parallel=data_parallel)
            del _
            gc.collect()
            epoch_losses.append(loss)

        dev_ap = get_ap(net, loss_fn.metric(), optimizer, config, data_dev, batch_first)
        if config.siamese_training.lr_schedule:
            scheduler.step(dev_ap)

        torch.save(net.state_dict(),
                   os.path.join(checkpoint_dir, '{0}_epoch_{1}.ckpt'.format(net.__class__.__name__, epoch)))

        logger.info('Finished epoch {0}, average training loss = {1:.4f}, epoch time = {2:.3f} sec'.format(
            epoch, np.mean(epoch_losses), time.time() - start))
        logger.info('Epoch {0} avg dev precision = {1:.4f}'.format(epoch, dev_ap))

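# Both training loops above drive ReduceLROnPlateau in 'max' mode: the learning rate is
# halved once the monitored dev metric (accuracy or AP) stops improving for `patience`
# epochs. A self-contained sketch of that behavior with a deliberately flat metric:
def _plateau_demo():
    import torch
    param = torch.nn.Parameter(torch.zeros(1))
    opt = torch.optim.SGD([param], lr=0.1)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, 'max', factor=0.5, patience=2)
    for metric in [0.5, 0.5, 0.5, 0.5, 0.5]:  # no improvement for longer than patience
        sched.step(metric)
    print(opt.param_groups[0]['lr'])  # 0.05: halved once after the patience ran out
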
def do_calculate_accuracy(run_dir, epoch, is_classifier, dataset=None, partition='dev', return_percent=False):
    if not is_classifier:
        train_epoch_embeddings, dev_epoch_embeddings, test_epoch_embeddings = \
            get_or_generate_embeddings(run_dir, epoch, dataset=dataset,
                                       dev_needed=(partition == 'dev'),
                                       test_needed=(partition == 'test'))

        words_train, datasets_train, vecs_train, counts_train, word_idxs_train = load_embeddings(
            train_epoch_embeddings[epoch], data_name='train')
        if partition == 'dev':
            words_part, datasets_part, vecs_part, counts_part, word_idxs_part = load_embeddings(
                dev_epoch_embeddings[epoch])
        elif partition == 'test':
            words_part, datasets_part, vecs_part, counts_part, word_idxs_part = load_embeddings(
                test_epoch_embeddings[epoch], data_name='test')
        else:
            raise RuntimeError('Cannot calculate accuracy for partition {0}'.format(partition))

        test_word_idxs = word_idxs_part
        test_vecs = vecs_part

        all_words = set()
        all_words.update(word_idxs_train.keys())
        all_words.update(test_word_idxs.keys())

        max_reference_examples = 40
        train_x = []
        train_y = []
        test_x = []
        test_y = []
        for i, word in enumerate(all_words):
            if word in word_idxs_train:
                all_word_idxs = word_idxs_train[word]
                n_train = min(int(np.floor(all_word_idxs.shape[0] / 2)), max_reference_examples)
                train_x.extend(vecs_train[all_word_idxs[:n_train]])
                train_y.extend([i] * n_train)
                if word_idxs_train != test_word_idxs:
                    if word in test_word_idxs:
                        test_dat = test_vecs[test_word_idxs[word]]
                        test_x.extend(test_dat)
                        test_y.extend([i] * test_dat.shape[0])
                else:
                    test_dat = vecs_train[all_word_idxs[n_train:]]
                    test_x.extend(test_dat)
                    test_y.extend([i] * test_dat.shape[0])
            elif word in test_word_idxs:
                all_word_idxs = test_word_idxs[word]
                n_train = min(int(np.floor(all_word_idxs.shape[0] / 2)), max_reference_examples)
                train_x.extend(test_vecs[all_word_idxs[:n_train]])
                train_y.extend([i] * n_train)
                test_dat = test_vecs[all_word_idxs[n_train:]]
                test_x.extend(test_dat)
                test_y.extend([i] * test_dat.shape[0])

        loss_name = read_embedding_loss(run_dir, throw=False)
        metric = loss_name2class[loss_name].metric(None) if loss_name is not None else 'cosine'
        print(run_dir, metric)
        print('N train: {0}, N test: {1}'.format(len(train_x), len(test_x)))

        knn = KNeighborsClassifier(n_neighbors=3, metric=metric, n_jobs=8)
        knn.fit(train_x, train_y)
        k_pred_y = knn.predict(test_x)

        if not return_percent:
            return np.sum(k_pred_y == test_y) / len(test_y)
        else:
            return np.sum(k_pred_y == test_y) / len(test_y) * 100.0
    else:
        net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
            load_net(run_dir, epoch=epoch, logger=None, train=False)

        if dataset is None:
            dataset = current_dataset
        train_path, dev_path, test_path = get_dataset_paths(dataset)
        # noinspection PyPep8Naming
        DatasetClass = get_dataset_class_for_path(train_path, logger=None)

        if partition == 'train':
            dataset = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                   mean_subtraction=mean_sub, variance_normalization=var_norm)
        if partition == 'dev':
            dataset = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                   mean_subtraction=mean_sub, variance_normalization=var_norm)
        if partition == 'test':
            dataset = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                   mean_subtraction=mean_sub, variance_normalization=var_norm)

        # TODO: no automatic detection for batch_first and data_parallel
        losses, accuracy = process_classifier_epoch(net, config, optimizer=None, dataset=dataset,
                                                    batch_first=False, data_parallel=False, train=False)
        if not return_percent:
            return accuracy / 100.0
        else:
            return accuracy

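# A minimal, self-contained sketch of the embedding evaluation protocol above: fit a
# 3-NN classifier on up to max_reference_examples embeddings per word and score the
# held-out embeddings. The random vectors and the two-"word" setup are made up purely
# for illustration.
def _knn_protocol_demo():
    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier
    rng = np.random.RandomState(0)
    train_x = rng.randn(20, 8)                  # 10 reference embeddings per "word"
    train_y = np.array([0] * 10 + [1] * 10)
    test_x = train_x + 0.01 * rng.randn(20, 8)  # held-out embeddings near the references
    knn = KNeighborsClassifier(n_neighbors=3, metric='cosine')
    knn.fit(train_x, train_y)
    accuracy = np.mean(knn.predict(test_x) == train_y)
    print('accuracy: {0:.2f}'.format(accuracy))
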
def generate_embeddings(run_dir, dataset=None, gen_train=False, gen_dev=False, gen_test=False, gen_new=False,
                        gen_background=False, for_epochs=None):
    run_name2emb = {
        'classifier': get_classifier_embeddings,
        'siamese': get_siamese_embeddings
    }
    net, config, checkpoints, checkpoint_dir, run_name, loss, train_scp, _, _, _, mean_sub, var_norm = \
        load_net(run_dir, epoch=None, logger=None, train=False)
    get_embeddings = run_name2emb[run_name]

    # XXX: currently, if embeddings exist in the train/dev/test folders they are not regenerated,
    # remove them manually when switching datasets
    if dataset is None:
        dataset = current_dataset
    train_path, dev_path, test_path = get_dataset_paths(dataset)
    # noinspection PyPep8Naming
    DatasetClass = get_dataset_class_for_path(train_path, logger=None)

    if gen_train:
        data_train = DatasetClass(train_path, parent_dataset_path=train_scp, training=False, logger=None,
                                  mean_subtraction=mean_sub, variance_normalization=var_norm)
    if gen_dev:
        data_dev = DatasetClass(dev_path, parent_dataset_path=train_scp, training=False, logger=None,
                                mean_subtraction=mean_sub, variance_normalization=var_norm)
    if gen_test:
        data_test = DatasetClass(test_path, parent_dataset_path=train_scp, training=False, logger=None,
                                 mean_subtraction=mean_sub, variance_normalization=var_norm)
    if gen_new:
        # new_path is expected to be defined at module level (not shown in this snippet)
        data_new = DatasetClass(new_path, parent_dataset_path=train_scp, training=False, logger=None,
                                mean_subtraction=mean_sub, variance_normalization=var_norm)
    if gen_background:
        background_path = os.path.join(processed_data_dir, 'background_train_v4', 'background_data.scp')
        data_background = DatasetClass(background_path, parent_dataset_path=train_scp, training=False, logger=None,
                                       mean_subtraction=mean_sub, variance_normalization=var_norm)

    train_embeddings_dir = os.path.join(run_dir, 'train_embeddings')
    dev_embeddings_dir = os.path.join(run_dir, 'dev_embeddings')
    test_embeddings_dir = os.path.join(run_dir, 'test_embeddings')
    new_embeddings_dir = os.path.join(run_dir, 'new_embeddings')
    background_embeddings_dir = os.path.join(run_dir, 'background_embeddings')

    if len(checkpoints) == 0:
        print('No checkpoints found in {0} for run {1}'.format(checkpoint_dir, run_dir))
        print('Exiting')
        sys.exit(-1)

    if for_epochs is None:
        for_epochs = sorted(list(checkpoints.keys()))

    for epoch in for_epochs:
        checkpoint = checkpoints[epoch]
        if gen_train:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_train, get_embeddings,
                                            train_embeddings_dir)
        if gen_dev:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_dev, get_embeddings,
                                            dev_embeddings_dir)
        if gen_test:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_test, get_embeddings,
                                            test_embeddings_dir)
        if gen_new:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_new, get_embeddings,
                                            new_embeddings_dir)
        if gen_background:
            gen_and_save_dataset_embeddings(net, config, checkpoint, data_background, get_embeddings,
                                            background_embeddings_dir)

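# Hypothetical usage (the run directory is illustrative): generate dev-set embeddings
# for every checkpointed epoch of a run, then dev and test for one epoch of interest.
#
#   generate_embeddings('runs/siamese_example', gen_dev=True)
#   generate_embeddings('runs/siamese_example', gen_dev=True, gen_test=True, for_epochs=[25])
#
# Embeddings land in <run_dir>/dev_embeddings etc.; per the XXX note above, existing
# files are not regenerated until removed manually.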