def test_load_sphere(self):
    filepath = get_root_dir() / 'data' / 'timit' / 'train' / 'dr1' / 'mwar0' / 'sx415.wav'
    samples, sample_rate = load_sphere(filepath)
    self.assertEqual(len(samples.shape), 1)
    self.assertEqual(samples.shape[0], 38810)
    self.assertEqual(sample_rate, 16000)
def get_phone_mapping():
    """
    Generates:
    - dictionary (origin phone -> train label), to load targets for the model from transcriptions.
      Different phones can be mapped to the same label, as a subset of phones is used for training (48 phones).
    - dictionary (train label -> test label), to evaluate the model on a subset of the training phones (39 phones).
    - dictionary (test label -> test phone), to print the phone names (e.g. in a confusion matrix).

    The training and test phone subsets are chosen according to standard recipes for TIMIT.

    :return: tuple (origin_phone_to_train_label, train_label_to_test_label, test_label_to_test_phone),
        containing the described dictionaries.
    """
    # read the 60-48-39 phone mapping file
    filepath = get_root_dir() / 'data' / 'timit_phones_60-48-39.map'
    with filepath.open() as csv_file:
        data_frame = pd.read_csv(csv_file, sep='\t')
    data_frame = data_frame.dropna()

    # load phone mappings
    origin_to_train_phone = {op: tp for op, tp in zip(data_frame['origin'], data_frame['train'])}
    origin_to_test_phone = {op: tp for op, tp in zip(data_frame['origin'], data_frame['test'])}

    # generate labels (sorted, so that multiple calls always generate the same dictionaries)
    train_labels = {phone: label for label, phone in enumerate(sorted(data_frame['train'].unique()))}
    test_labels = {phone: label for label, phone in enumerate(sorted(data_frame['test'].unique()))}

    # get phone labels (origin phone -> train label, to generate targets from transcriptions)
    origin_phone_to_train_label = {}
    for origin_phone in data_frame['origin']:
        train_phone = origin_to_train_phone[origin_phone]
        origin_phone_to_train_label[origin_phone] = train_labels[train_phone]

    # get evaluation mapping (train label -> test label, to evaluate the model on a subset of phones)
    train_label_to_test_label = {}
    for origin_phone in data_frame['origin']:
        test_phone = origin_to_test_phone[origin_phone]
        train_label = origin_phone_to_train_label[origin_phone]
        train_label_to_test_label[train_label] = test_labels[test_phone]

    # get test class names (for the confusion matrix)
    test_label_to_test_phone = {value: key for key, value in test_labels.items()}

    return origin_phone_to_train_label, train_label_to_test_label, test_label_to_test_phone
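# A minimal usage sketch of the three mappings above (illustrative helper, not
# part of the module). It assumes the standard 60-48-39 map, where e.g. 'ux' is
# kept as its own train phone (48-phone set) but folded into 'uw' for scoring
# (39-phone set), and the glottal stop 'q' has no mapping and is removed by
# dropna().
def _example_phone_mapping():
    phone_to_label, eval_mapping, label_to_phone = get_phone_mapping()
    train_label = phone_to_label['ux']        # origin phone -> train label (48 phones)
    test_label = eval_mapping[train_label]    # train label -> test label (39 phones)
    print(label_to_phone[test_label])         # expected to print 'uw'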
def get_core_test_speakers():
    """
    Returns a dictionary (dialect -> list of speaker_id) for the core test set.

    :return: dictionary (dialect -> list of speaker_id)
    """
    filepath = get_root_dir() / 'data' / 'timit_core_test.json'
    with filepath.open() as json_file:
        return json.load(json_file)
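# Usage sketch (illustrative; the exact content comes from timit_core_test.json).
# The standard TIMIT core test set has 24 speakers, 2 male and 1 female per
# dialect region, e.g. 'dr1' -> ['mdab0', 'mwbt0', 'felc0'].
def _example_core_test_speakers():
    for dialect, speaker_ids in get_core_test_speakers().items():
        print(dialect, speaker_ids)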
def test_extract_features(self):
    filepath = get_root_dir() / 'data' / 'timit' / 'train' / 'dr1' / 'mwar0' / 'sx415.wav'
    win_len = 0.03
    win_shift = 0.01
    samples, sample_rate = load_sphere(filepath)
    features = extract_features(samples, sample_rate, win_len, win_shift)

    # expected number of frames: one window, plus one more frame
    # for each full window shift that fits in the remaining samples
    n_frames = 1 + round((samples.shape[0] - win_len * sample_rate) / (win_shift * sample_rate))
    self.assertTrue(features.shape[0] - n_frames <= 1)
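# Worked check of the frame-count estimate above: sx415.wav has 38810 samples
# at 16 kHz, so a 30 ms window is 480 samples and a 10 ms shift is 160 samples,
# giving n_frames = 1 + round((38810 - 480) / 160) = 1 + round(239.5625) = 241.
# The assertion only requires the extractor to produce at most one frame more
# than this estimate.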
def load_data(dataset_path, core_test=True, force_preprocess=False):
    """
    Returns training and test sets containing features (13 MFCCs + deltas + delta-deltas)
    and labels (phones encoded as integers). The split into training and test sets is the
    recommended one (see timit/readme.doc and timit/doc/testset.doc).

    :param dataset_path: path to the dataset. Since the TIMIT dataset is protected by
        copyright, it is not distributed with the package.
    :param core_test: whether to use the core test set (see timit/doc/testset.doc)
        instead of the complete test set
    :param force_preprocess: force pre-processing again, even if saved data can be loaded
    :return: tuple (train_set, test_set), where train_set and test_set are numpy arrays
        of utterances. Each utterance is a dictionary containing utterance info useful
        for normalization, feature vectors, and phone labels.
    """
    dataset_path = Path(dataset_path)
    if not dataset_path.is_dir():
        raise ValueError('Invalid dataset path')

    # training set
    filepath = get_root_dir() / 'data' / 'timit_train.npz'
    if filepath.is_file() and not force_preprocess:
        print('Loading training set...', end=' ')
        train_set = np.load(filepath, allow_pickle=True)['train_set']
        print('done')
    else:
        train_set = _preprocess_data(dataset_path / 'train')
        np.savez(filepath, train_set=train_set)

    # test set
    filepath = get_root_dir() / 'data' / ('timit_' + ('core_' if core_test else '') + 'test.npz')
    if filepath.is_file() and not force_preprocess:
        print('Loading test set...', end=' ')
        test_set = np.load(filepath, allow_pickle=True)['test_set']
        print('done')
    else:
        test_set = _preprocess_data(dataset_path / 'test', core_test)
        np.savez(filepath, test_set=test_set)

    return train_set, test_set
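# Usage sketch (assuming the copyrighted TIMIT corpus was obtained separately
# and unpacked under ./timit; the path and helper name are illustrative).
# Each utterance dictionary exposes at least the keys used elsewhere in this
# module: 'speaker_id', 'features', and 'labels'.
def _example_load_data():
    train_set, test_set = load_data('./timit', core_test=True)
    utterance = train_set[0]
    print(utterance['speaker_id'], utterance['features'].shape, len(utterance['labels']))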
def test_load_transcription(self):
    filepath = get_root_dir() / 'data' / 'timit' / 'train' / 'dr1' / 'fcjf0' / 'sa1.phn'
    transcription = load_transcription(filepath)
    self.assertTupleEqual(transcription[0], (0, 3050, 'h#'))
    self.assertTupleEqual(transcription[5], (8772, 9190, 'dcl'))
    self.assertTupleEqual(transcription[10], (12640, 14714, 'ah'))
    self.assertTupleEqual(transcription[15], (20417, 21199, 'q'))
    self.assertTupleEqual(transcription[20], (24229, 25566, 'ix'))
    self.assertTupleEqual(transcription[25], (31719, 33360, 'sh'))
    self.assertTupleEqual(transcription[30], (36326, 37556, 'axr'))
    self.assertTupleEqual(transcription[36], (44586, 46720, 'h#'))
def test_normalize(self):
    dataset_path = get_root_dir() / 'data' / 'timit'
    train_set, _ = timit.load_data(dataset_path)

    # test normalization on the whole dataset
    normalized_train_set, _ = normalize(train_set, mode='full')
    x_train = np.concatenate([utterance['features'] for utterance in normalized_train_set])
    mean = x_train.mean(axis=0)
    var = x_train.var(axis=0)
    for i in range(x_train.shape[1]):
        self.assertAlmostEqual(mean[i], 0)
        self.assertAlmostEqual(var[i], 1)
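# A minimal sketch of the behavior the test above checks: in 'full' mode,
# normalize() is expected to z-score every feature dimension with statistics
# computed over all training frames. This is an illustrative reimplementation,
# not the module's normalize().
def _full_normalize_sketch(utterances):
    x = np.concatenate([u['features'] for u in utterances])
    mean, std = x.mean(axis=0), x.std(axis=0)
    for u in utterances:
        u['features'] = (u['features'] - mean) / std
    return utterances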
def test_extract_labels(self):
    filepath = get_root_dir() / 'data' / 'timit' / 'train' / 'dr1' / 'mwar0' / 'sx415.wav'
    _, sample_rate = load_sphere(filepath)
    filepath = filepath.with_suffix('.phn')
    transcription = load_transcription(filepath)
    win_len = 0.03
    win_shift = 0.01
    n_frames = get_number_of_frames(38720, sample_rate, win_len, win_shift)
    labels = extract_labels(transcription, sample_rate, n_frames, win_len, win_shift)
    self.assertEqual(len(set(labels)), 25)
    self.assertEqual(len(labels), 240)
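# Worked check: if get_number_of_frames follows the same framing formula used
# in test_extract_features, then get_number_of_frames(38720, 16000, 0.03, 0.01)
# = 1 + round((38720 - 480) / 160) = 1 + 239 = 240, matching len(labels) == 240.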
def test_unlabel(self):
    dataset_path = get_root_dir() / 'data' / 'timit'
    train_set, _ = timit.load_data(dataset_path)
    n_total = len(train_set)

    unlabel(train_set, 0.7, seed=1)
    n_labeled = len([utterance for utterance in train_set if 'labels' in utterance])
    n_unlabeled = len([utterance for utterance in train_set if 'labels' not in utterance])

    self.assertEqual(n_labeled + n_unlabeled, n_total)
    self.assertTrue(n_labeled < n_unlabeled)
    self.assertEqual(n_labeled, 1104)
    self.assertEqual(n_unlabeled, 2592)
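# A minimal sketch of the behavior the test above checks: unlabel(set, p, seed)
# removes the 'labels' key from roughly a fraction p of the utterances in place
# (here 2592 / 3696 ≈ 0.7), using a seeded RNG for reproducibility. Illustrative
# reimplementation, not the module's unlabel().
def _unlabel_sketch(utterances, p, seed=None):
    rng = np.random.RandomState(seed)
    for utterance in utterances:
        if rng.random_sample() < p:
            utterance.pop('labels', None)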
def _split_validation_unique(train_set):
    """
    Splits the training utterances into a reduced training set, a validation set of
    speakers whose selected sentences do not overlap each other, and a complete
    validation set that additionally contains the speakers sharing those sentences.

    :param train_set: numpy array of utterance dictionaries (see load_data)
    :return: tuple (train, valid, complete_valid) of numpy arrays of utterances
    """
    doc_path = get_root_dir() / 'data' / 'spkrinfo_spkrsent.txt'

    # shuffle the speaker records (making sure the last line ends with a newline)
    with open(doc_path, 'r') as source:
        lines = [line for line in source]
    lines[-1] += '\n'
    lines = np.array(lines)
    np.random.shuffle(lines)

    # [male, female] speaker quotas for each of the 8 dialect regions
    drs = [[2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [2, 1], [1, 1]]

    def unseen(columns, seen_ids):
        # true if none of the sentences in columns 4-8 was already selected
        return all(columns[i] not in seen_ids for i in range(4, 9))

    new_speakers = []  # unique speakers
    sentence_ids = []  # unique speakers' sentences
    for line in lines:
        columns = line.split()
        # training speaker, and no other selected speaker says these sentences
        if columns[3] == 'TRN' and unseen(columns, sentence_ids):
            dialect = int(columns[2]) - 1
            if drs[dialect][0] != 0 and columns[1] == 'M':  # male quota not filled yet
                sentence_ids.extend(columns[4:12])
                new_speakers.append(columns[0])
                drs[dialect][0] -= 1
            elif drs[dialect][1] != 0 and columns[1] == 'F':  # female quota not filled yet
                sentence_ids.extend(columns[4:12])
                new_speakers.append(columns[0])
                drs[dialect][1] -= 1

    pair_speakers = []  # unique speaker pairs (for the complete validation set)
    pair_sentence_ids = []  # unique speaker pairs' sentences (for the complete validation set)
    for line in lines:
        columns = line.split()
        if columns[3] == 'TRN' and not unseen(columns, sentence_ids):
            pair_speakers.append(columns[0])
            pair_sentence_ids.extend(columns[4:12])

    train = []
    valid = []
    complete_valid = []
    new_speakers = [x.lower() for x in new_speakers]
    pair_speakers = [x.lower() for x in pair_speakers]
    for utterance in train_set:
        if utterance['speaker_id'] in new_speakers:
            valid.append(utterance)
        if utterance['speaker_id'] in pair_speakers:
            complete_valid.append(utterance)
        if utterance['speaker_id'] not in pair_speakers and utterance['speaker_id'] not in new_speakers:
            train.append(utterance)

    return np.asarray(train), np.asarray(valid), np.asarray(complete_valid)
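# Usage sketch (illustrative names and path): the split shuffles the speaker
# records with np.random.shuffle, so seed numpy's global RNG beforehand if a
# reproducible split is needed.
def _example_split():
    np.random.seed(0)
    train_set, _ = load_data('./timit')
    train, valid, complete_valid = _split_validation_unique(train_set)
    print(len(train), len(valid), len(complete_valid))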