def split_snodgrass_dataset(source_sub_dir, snodgrass_file, same_split_as=None):
    """~50% train, ~25% dev, ~25% test, while taking care that each patient's data
    is present exclusively in either the train, dev, or test set."""
    snodgrass = os.path.join(processed_data_dir, source_sub_dir, snodgrass_file)

    lines = []
    with open(snodgrass, 'r') as f:
        lines.extend(f.readlines())
    lines = np.array(lines)

    words = np.array([key2word(x) for x in lines])
    patients = np.array([snodgrass_key2patient(x) for x in lines])
    patients_with_counts = [(key, value) for key, value in Counter(patients).items()]

    data_train = []
    data_test = []
    data_dev = []
    words_train = []
    words_test = []
    words_dev = []

    if same_split_as is None:
        # surprise knapsack problem :)
        patients_train = knapsack(patients_with_counts, len(lines) / 2)[1]
        patients_left = remove_all(patients_with_counts, patients_train)
        patients_test = knapsack(patients_left, len(lines) / 4)[1]
        patients_dev = remove_all(patients_left, patients_test)
    else:
        train_path, dev_path, test_path = get_dataset_paths(same_split_as, fmt='scp')
        patients_train = scp2snodgrass_patients(train_path)
        patients_test = scp2snodgrass_patients(test_path)
        patients_dev = scp2snodgrass_patients(dev_path)

    for patient, _ in patients_train:
        data_train.extend(lines[np.where(patients == patient)])
        words_train.extend(words[np.where(patients == patient)])

    for patient, _ in patients_test:
        data_test.extend(lines[np.where(patients == patient)])
        words_test.extend(words[np.where(patients == patient)])

    for patient, _ in patients_dev:
        data_dev.extend(lines[np.where(patients == patient)])
        words_dev.extend(words[np.where(patients == patient)])

    print('Unique words in train dataset: {0}, in test: {1}, in dev: {2}'.format(
        len(Counter(words_train)), len(Counter(words_test)), len(Counter(words_dev))))

    return data_train, data_dev, data_test

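# The `knapsack`/`remove_all` helpers used above are defined elsewhere in this
# codebase. As a rough illustration of the idea (pack whole patients until the
# target example count is reached, so that each patient ends up in exactly one
# subset), a minimal greedy sketch could look like the hypothetical helper below.
# It is not the actual implementation used by split_snodgrass_dataset.
def _greedy_patient_knapsack(patients_with_counts, capacity):
    """Pick (patient, count) pairs whose example counts sum to at most `capacity`."""
    picked, total = [], 0
    for patient, count in sorted(patients_with_counts, key=lambda pc: -pc[1]):
        if total + count <= capacity:
            picked.append((patient, count))
            total += count
    return total, picked
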
def split_train_dev_test(output_name, external_sub_dir, snodgrass_sub_dir,
                         snodgrass_file, same_split_as=None):
    output_path = os.path.join(processed_data_dir, output_name)
    external_snodgrass = glob.glob(
        os.path.join(processed_data_dir, external_sub_dir, '*snodgrass_words.scp'))

    lines = []
    for scp in external_snodgrass:
        with open(scp, 'r') as f:
            lines.extend(f.readlines())

    words = np.array([key2word(line) for line in lines])
    datasets = np.array([key2dataset(line) for line in lines])
    counts = Counter(words)

    word_dataset2idx = {
        key: {dset: [] for dset in np.unique(datasets)}
        for key in counts
    }
    for i in range(len(lines)):
        word_dataset2idx[words[i]][datasets[i]].append(i)

    # All external (non-Snodgrass) lines go into the training set; the dev and
    # test sets receive only Snodgrass data below.
    idx_train = []
    idx_dev = []
    idx_train.extend(range(len(lines)))

    snodgrass_train, snodgrass_dev, snodgrass_test = split_snodgrass_dataset(
        snodgrass_sub_dir, snodgrass_file, same_split_as=same_split_as)

    train_scp = '{0}_train.scp'.format(output_path)
    dev_scp = '{0}_dev.scp'.format(output_path)
    test_scp = '{0}_test.scp'.format(output_path)

    with open(train_scp, 'w') as train_file:
        for idx in idx_train:
            train_file.write(lines[idx])
        for line in snodgrass_train:
            train_file.write(line)

    with open(dev_scp, 'w') as dev_file:
        for idx in idx_dev:
            dev_file.write(lines[idx])
        for line in snodgrass_dev:
            dev_file.write(line)

    with open(test_scp, 'w') as test_file:
        for line in snodgrass_test:
            test_file.write(line)

    return train_scp, dev_scp, test_scp

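# Example call (a sketch only; the directory and file names below are
# hypothetical and must exist under processed_data_dir):
#
#   train_scp, dev_scp, test_scp = split_train_dev_test(
#       'combined_v1',        # prefix for the output <prefix>_{train,dev,test}.scp files
#       'external_words',     # sub-dir with the *snodgrass_words.scp files of other corpora
#       'snodgrass',          # sub-dir with the Snodgrass patient recordings
#       'snodgrass_words.scp')
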
def compose_test_from_non_validation_words(swc_path, dev_path, test_path):
    # Afterwards I switched the dev and test sets, to make sure the test set is
    # the more complete one.
    swc_lines = []
    read_scp_lines(swc_path, swc_lines)
    dev_lines = []
    read_scp_lines(dev_path, dev_lines)

    left_lines = np.array([x for x in swc_lines if x not in dev_lines])
    left_words = np.array([key2word(x) for x in left_lines])

    test_lines = []
    for word in np.unique(left_words):
        left_word_lines = left_lines[left_words == word]
        np.random.shuffle(left_word_lines)
        test_lines.extend(left_word_lines[:35])

    with open(test_path, 'w') as test_file:
        for line in test_lines:
            test_file.write(line)

def load_embeddings(path, data_name='dev', return_keys=False):
    with open(path, 'rb') as f:
        data_dict = pickle.load(f)

    words = np.array([key2word(key) for key in data_dict])
    datasets = np.array([key2dataset(key) for key in data_dict])
    datasets[datasets == 'PHATTSESSIONZ'] = 'PHA'
    vecs = np.array([vec for vec in data_dict.values()])

    counts = Counter(words)
    word_idxs = {key: np.where(words == key)[0] for key in counts}

    print('There are {0} unique words in the {1} set.'.format(len(counts), data_name))

    if not return_keys:
        return words, datasets, vecs, counts, word_idxs
    else:
        return words, datasets, vecs, counts, word_idxs, np.array(list(data_dict.keys()))

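# Example usage (a sketch; 'dev_embeddings.pckl' is a hypothetical path to a
# pickled dict mapping utterance keys to embedding vectors, which is what
# load_embeddings expects):
#
#   words, datasets, vecs, counts, word_idxs = load_embeddings('dev_embeddings.pckl')
#   # mean embedding per word, e.g. for a nearest-centroid style evaluation
#   centroids = {word: vecs[idxs].mean(axis=0) for word, idxs in word_idxs.items()}
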
def collect_session_embeddings_data(session, vecs_dev, keys_dev):
    sessions_vecs = []
    session_keys = []
    for i, key in enumerate(keys_dev):
        if key2dataset(key) == 'snodgrass' and snodgrass_key2date(key) == session:
            sessions_vecs.append(vecs_dev[i])
            session_keys.append(key)

    sessions_vecs = np.array(sessions_vecs)
    session_keys = np.array(session_keys)
    session_words = np.array([key2word(key) for key in session_keys])
    session_word_idxs = {
        key: np.where(session_words == key)[0]
        for key in np.unique(session_words)
    }

    return sessions_vecs, session_word_idxs

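# Example usage (a sketch; the embeddings path and the session string are
# hypothetical, the session value being whatever snodgrass_key2date returns for
# the recordings of interest):
#
#   words, datasets, vecs, counts, word_idxs, keys = load_embeddings(
#       'dev_embeddings.pckl', return_keys=True)
#   session_vecs, session_word_idxs = collect_session_embeddings_data(
#       '2018-05-14', vecs, keys)
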
def __main_independent_test():
    swc_path = '/home/aleks/data/speech_processed/independent_test_v2/SWC_independent_test.scp'
    data_swc = KaldiDataset(swc_path)
    print(data_swc.counts)

    train_path, dev_path, test_path = get_dataset_paths('independent_cleaned_v3', fmt='scp')
    data_train = KaldiDataset(train_path)
    data_dev = KaldiDataset(dev_path, parent_dataset_path=train_path)
    data_test = KaldiDataset(test_path, parent_dataset_path=train_path)
    print(data_dev.counts)

    swc_keys = set(data_swc.idx2key)
    dev_keys = set(data_dev.idx2key)
    difference = swc_keys.difference(dev_keys)

    left_words = np.array([key2word(x) for x in difference])
    left_counts = Counter(left_words)
    print(left_counts)

def split_independent_words(output_name, data_sub_dir, dataset_comparable_to):
    output_path = os.path.join(processed_data_dir, output_name)
    train_path, dev_path, _ = get_dataset_paths(dataset_comparable_to)
    counts_train = get_dataset_word_counts(train_path)
    counts_dev = get_dataset_word_counts(dev_path)

    selected_words = load_pickled('selected_words.pckl')

    all_scp = glob.glob(
        os.path.join(processed_data_dir, data_sub_dir, '*independent_test.scp'))
    swc_scp = [x for x in all_scp if os.path.basename(x).startswith('SWC')][0]
    all_scp.remove(swc_scp)

    emu_lines = []  # this will be the train data
    swc_lines = []  # this will be the test data
    for scp in all_scp:
        read_scp_lines(scp, emu_lines)
    read_scp_lines(swc_scp, swc_lines)

    emu_lines = np.array(emu_lines)
    swc_lines = np.array(swc_lines)
    emu_words = np.array([key2word(x) for x in emu_lines])
    swc_words = np.array([key2word(x) for x in swc_lines])
    emu_counts = Counter(emu_words)
    swc_counts = Counter(swc_words)

    # for word in emu_counts:
    #     print('{0:<20}: train {1}, test {2}'.format(word, emu_counts[word], swc_counts.get(word, 0)))

    # for word in counts_train:
    #     new_word = selected_words[word]
    #     print('{0}, train: {1}, dev: {2}'.format(word, counts_train[word], counts_dev.get(word, 0)))
    #     print('{word}: new train count: {0}, new test count: {1}'.format(emu_counts[new_word], swc_counts[new_word],
    #                                                                      word=new_word))

    new_train = []
    new_dev = []
    for word, new_word in selected_words.items():
        train_new_lines = emu_lines[emu_words == new_word]
        np.random.shuffle(train_new_lines)
        new_train.extend(train_new_lines[:counts_train[word]])

        dev_new_lines = swc_lines[swc_words == new_word]
        np.random.shuffle(dev_new_lines)
        # new_dev.extend(dev_new_lines[:counts_dev.get(word, 5)])  # didn't work at all, maybe bad labels?
        new_dev.extend(dev_new_lines[:35])

    train_scp = '{0}_train.scp'.format(output_path)
    dev_scp = '{0}_dev.scp'.format(output_path)

    with open(train_scp, 'w') as train_file:
        for line in new_train:
            train_file.write(line)
    with open(dev_scp, 'w') as dev_file:
        for line in new_dev:
            dev_file.write(line)

    return train_scp, dev_scp, None

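# Example call (a sketch; the names below are illustrative guesses, not taken
# from the original experiments). selected_words.pckl is assumed to map each
# original training word to the replacement word chosen for the independent set,
# e.g. {'original_word': 'replacement_word', ...}:
#
#   train_scp, dev_scp, _ = split_independent_words(
#       'independent_words_v1',    # output .scp prefix
#       'independent_test_v2',     # sub-dir with the *independent_test.scp files
#       'independent_cleaned_v3')  # dataset whose word counts are matched
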
    def __init__(self, data_path, parent_dataset_path=None, training=True,
                 logger=None, variance_normalization=False, noise_multiplier=0,
                 noise_prob=1, mean_subtraction=False,
                 supplement_rare_with_noisy=False, supplement_seed=112):
        self.data_path = data_path
        self.word2idxs = {}
        self.idx2word = []
        self.idx2source_dataset = []
        self.idx2key = []
        self.data = []
        self.training = training
        self.noise_multiplier = noise_multiplier
        self.noise_prob = noise_prob

        util.warn_or_print(logger, 'Loading {0}, train = {1}'.format(data_path, training))
        if not training and parent_dataset_path is None:
            util.warn_or_print(logger, 'Non-training mode is selected, but parent_dataset_path is None')
            util.warn_or_print(
                logger,
                'A non-training dataset must always have the parent specified, otherwise '
                'the data mean and other derived values will be incorrect. Aborting.')
            sys.exit(-1)

        for i, (key, mat) in enumerate(self._raw_data_iterator()):
            word = key2word(key)
            dataset = key2dataset(key)
            word_example_idx = self.word2idxs.get(word, [])
            word_example_idx.append(i)
            self.word2idxs[word] = word_example_idx
            self.idx2word.append(word)
            self.idx2source_dataset.append(dataset)
            self.idx2key.append(key)
            self.data.append(mat)

        self.idx2word = np.array(self.idx2word)
        self.idx2source_dataset = np.array(self.idx2source_dataset)
        self.idx2key = np.array(self.idx2key)
        for key in self.word2idxs:
            self.word2idxs[key] = np.array(self.word2idxs[key])
        self.counts = {key: self.word2idxs[key].shape[0] for key in self.word2idxs}

        if parent_dataset_path is None:
            self.mean = self._calculate_feature_mean()
            self.std = self._calculate_feature_std()
        else:
            self.load_derived_data(parent_dataset_path)

        if mean_subtraction:
            util.warn_or_print(logger, 'Applying mean subtraction')
            self.data = np.array([(segment - self.mean).astype(np.float32)
                                  for segment in self.data])
        elif variance_normalization:
            util.warn_or_print(logger, 'Applying variance normalization')
            self.data = np.array([((segment - self.mean) / self.std).astype(np.float32)
                                  for segment in self.data])
        else:
            util.warn_or_print(logger, 'No mean subtraction')
            self.data = np.array([segment.astype(np.float32) for segment in self.data])

        # TODO: sort keys before doing anything else?
        # (for identical behaviour between Kaldi and LMDB-exported datasets)
        self.noisy = np.zeros(len(self.data), dtype=bool)

        if supplement_rare_with_noisy and training:
            # Preserve the global RNG state so that supplementation is reproducible
            # without affecting randomness elsewhere.
            state_before = np.random.get_state()
            np.random.seed(supplement_seed)

            before_mean_examples = int(np.ceil(np.mean(list(self.counts.values()))))
            util.warn_or_print(
                logger,
                'Supplementing rare classes with noisy examples up to {0} total'.format(before_mean_examples))
            util.warn_or_print(logger, 'Original example count: {0}'.format(len(self.data)))

            for word, count in self.counts.items():
                if count < before_mean_examples:
                    to_add = before_mean_examples - count
                    augmented_source_idx = []
                    augmented_examples = []
                    # Cycle over the word's existing examples and add Gaussian noise
                    # scaled by the per-feature std.
                    for orig_idx in islice(cycle(self.word2idxs[word]), to_add):
                        orig_data = self.data[orig_idx]
                        augmented_source_idx.append(orig_idx)
                        augmented_examples.append(
                            orig_data + noise_multiplier * np.random.normal(
                                loc=0, scale=self.std, size=orig_data.shape))

                    augmented_source_idx = np.array(augmented_source_idx)
                    augmented_examples = np.array(augmented_examples)

                    self.data = np.concatenate((self.data, augmented_examples))
                    self.noisy = np.concatenate((self.noisy, np.ones(to_add, dtype=bool)))
                    self.word2idxs[word] = np.concatenate(
                        (self.word2idxs[word],
                         np.arange(len(self.data) - to_add, len(self.data), dtype=np.int32)))
                    self.idx2word = np.concatenate(
                        (self.idx2word, [word for _ in augmented_examples]))
                    self.idx2key = np.concatenate(
                        (self.idx2key, [self.idx2key[x] for x in augmented_source_idx]))
                    self.idx2source_dataset = np.concatenate(
                        (self.idx2source_dataset,
                         [self.idx2source_dataset[x] for x in augmented_source_idx]))

            self.counts = {key: self.word2idxs[key].shape[0] for key in self.word2idxs}
            util.warn_or_print(logger, 'Augmented example count: {0}'.format(len(self.data)))
            np.random.set_state(state_before)

        self.feature_dim = self.data[0].shape[1]
        self.mean_examples = int(np.ceil(np.mean(list(self.counts.values()))))

        # siamese training setup, ignoring words with 1 example
        self.siamese_words = np.array(
            sorted([key for key in self.word2idxs if self.counts[key] > 1]))
        self.num_siamese_words = self.siamese_words.shape[0]

        # classifier training setup
        self.all_words = np.array(sorted(list(self.counts.keys())))
        if parent_dataset_path is None:
            self.word2id = {key: i for i, key in enumerate(self.all_words)}
        else:
            # word2id comes from the parent dataset; assign fresh ids to any words
            # the parent has not seen.
            new_words = np.array([x not in self.word2id for x in self.all_words])
            if np.any(new_words):
                max_given_id = max(self.word2id.values())
                for i, x in enumerate(self.all_words[new_words]):
                    self.word2id[x] = max_given_id + i + 1
        self.idx2word_id = np.array(
            [self.word2id[word] for word in self.idx2word], dtype=np.int32)
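
    # Typical construction (a sketch; the parameter values below are illustrative,
    # not taken from the original experiments). The training set computes the
    # feature mean/std and the word2id mapping; dev/test sets must reference it
    # through parent_dataset_path so that normalization and word ids stay consistent:
    #
    #   data_train = KaldiDataset(train_path,
    #                             variance_normalization=True,
    #                             supplement_rare_with_noisy=True,
    #                             noise_multiplier=0.1)
    #   data_dev = KaldiDataset(dev_path, training=False,
    #                           parent_dataset_path=train_path,
    #                           variance_normalization=True)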