def main():
    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)
    path = Path(data_path=args.data_path, config_path=args.config_path)

    # HTK settings
    save_config(audio_file_type='nist',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=16000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))
    # NOTE: 123-dim features are extracted by default

    for data_type in ['train', 'dev', 'test']:
        wav_paths = path.wav(data_type=data_type)
        save_path = mkdir_join(htk_save_path, data_type)

        with open('./config/wav2htk_' + data_type + '.scp', 'w') as f:
            for wav_path in wav_paths:
                speaker = wav_path.split('/')[-2]
                utt_index = basename(wav_path).split('.')[0]
                save_path_tmp = mkdir_join(
                    save_path, speaker, utt_index + '.htk')
                f.write(wav_path + ' ' + save_path_tmp + '\n')
def main():
    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)

    # HTK settings
    save_config(audio_file_type='wav',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=8000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))
    # NOTE: 120-dim features are extracted by default

    # Switchboard
    with open('./config/wav2htk_swbd.scp', 'w') as f:
        for wav_path in glob(join(args.wav_save_path, 'swbd/*.wav')):
            # ex.) wav_path: wav/swbd/*.wav
            save_path = mkdir_join(htk_save_path, 'swbd',
                                   basename(wav_path).split('.')[0] + '.htk')
            f.write(wav_path + ' ' + save_path + '\n')
            # ex.) htk_path: htk/swbd/*.htk

    # eval2000 (swbd)
    with open('./config/wav2htk_eval2000_swbd.scp', 'w') as f:
        for wav_path in glob(join(args.wav_save_path, 'eval2000/swbd/*.wav')):
            # ex.) wav_path: wav/eval2000/swbd/*.wav
            save_path = mkdir_join(htk_save_path, 'eval2000', 'swbd',
                                   basename(wav_path).split('.')[0] + '.htk')
            f.write(wav_path + ' ' + save_path + '\n')
            # ex.) htk_path: htk/eval2000/swbd/*.htk

    # eval2000 (callhome)
    with open('./config/wav2htk_eval2000_ch.scp', 'w') as f:
        for wav_path in glob(
                join(args.wav_save_path, 'eval2000/callhome/*.wav')):
            # ex.) wav_path: wav/eval2000/callhome/*.wav
            save_path = mkdir_join(htk_save_path, 'eval2000', 'callhome',
                                   basename(wav_path).split('.')[0] + '.htk')
            f.write(wav_path + ' ' + save_path + '\n')
            # ex.) htk_path: htk/eval2000/callhome/*.htk

    # Fisher
    if bool(args.fisher):
        with open('./config/wav2htk_fisher.scp', 'w') as f:
            for wav_path in glob(join(args.wav_save_path, 'fisher/*/*.wav')):
                # ex.) wav_path: wav/fisher/speaker/*.wav
                speaker = wav_path.split('/')[-2]
                save_path = mkdir_join(
                    htk_save_path, 'fisher', speaker,
                    basename(wav_path).split('.')[0] + '.htk')
                f.write(wav_path + ' ' + save_path + '\n')
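# The scripts above only emit HTK "script" (.scp) files listing
# "source target" pairs; the feature extraction itself is done by HTK's
# HCopy tool in a separate step. A minimal sketch of that follow-up step,
# assuming HTK is installed and that save_config() wrote its feature
# configuration to ./config/config (the config file name is an assumption,
# not taken from the code above):
import subprocess


def run_hcopy(scp_path, hconfig_path='./config/config'):
    # -T 1: basic tracing, -C: feature-extraction config,
    # -S: scp file with one "source target" conversion per line
    subprocess.run(['HCopy', '-T', '1', '-C', hconfig_path, '-S', scp_path],
                   check=True)

# ex.) run_hcopy('./config/wav2htk_swbd.scp')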
def split_wav(wav_paths, save_path, speaker_dict):
    """Read WAV files & divide them with respect to each utterance.
    Args:
        wav_paths (list): paths to WAV files
        save_path (string): path to save WAV files
        speaker_dict (dict): the dictionary of utterances of each speaker
            key => speaker
            value => the dictionary of utterance information of each speaker
                key => utterance index
                value => [start_frame, end_frame, transcript]
    """
    # Read each WAV file
    print('==> Reading WAV files...')
    print(speaker_dict.keys())
    for wav_path in tqdm(wav_paths):
        speaker = basename(wav_path).split('.')[0]

        # NOTE: For Switchboard
        speaker = speaker.replace('sw0', 'sw')
        speaker = speaker.replace('sw_', 'sw')
        speaker = speaker.replace('en_', 'en')

        utt_dict = speaker_dict[speaker]
        wav_utt_save_path = mkdir_join(save_path, speaker)

        # Read a wav file
        audio = Audio(file_path=wav_path)
        audio_data = audio.read()

        # Split per utterance & save as wav files
        audio.split(audio_data, utt_dict, speaker,
                    save_path=wav_utt_save_path)
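# A minimal usage sketch for split_wav(). In this pipeline, utterance
# boundaries are frame indices at a 10 ms shift (the transcript readers below
# compute start_frame = seconds * 100), so frames 0..250 cover the first
# 2.5 seconds. The speaker name, utterance indices, and paths here are made
# up for illustration; in the real pipeline speaker_dict comes from the
# transcript readers and the Audio helper comes from the same codebase:
speaker_dict_example = {
    'sw4771-A': {
        '0001': [0, 250, 'hello how are you'],
        '0002': [260, 540, 'i am fine thanks'],
    },
}
split_wav(wav_paths=['/data/swbd/sw4771-A.wav'],
          save_path='/data/swbd_utt',
          speaker_dict=speaker_dict_example)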
def test(self):
    speaker_dict_a, char_set_a, char_capital_set_a, word_count_dict_a = read_trans_fisher(
        label_paths=label_paths_fisher, target_speaker='A')
    speaker_dict_b, char_set_b, char_capital_set_b, word_count_dict_b = read_trans_fisher(
        label_paths=label_paths_fisher, target_speaker='B')

    # Merge the two dictionaries
    speaker_dict_fisher = merge_dicts([speaker_dict_a, speaker_dict_b])
    char_set = char_set_a | char_set_b
    char_capital_set = char_capital_set_a | char_capital_set_b
    word_count_dict_fisher = dict(
        Counter(word_count_dict_a) + Counter(word_count_dict_b))

    self.speaker_dict = read_trans_swbd(
        label_paths=label_paths_swbd,
        run_root_path='../',
        vocab_file_save_path=mkdir_join('../config/vocab_files'),
        save_vocab_file=True,
        speaker_dict_fisher=speaker_dict_fisher,
        char_set=char_set,
        char_capital_set=char_capital_set,
        word_count_dict=word_count_dict_fisher)

    self.check(normalize='global', tool='htk')
    self.check(normalize='speaker', tool='htk')
    self.check(normalize='utterance', tool='htk')
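# Why Counter addition works for the merge above: adding two
# collections.Counter objects sums counts key-wise, which is exactly what
# combining the A-side and B-side word frequencies requires. A
# self-contained illustration with toy counts:
from collections import Counter

word_count_a = {'yeah': 3, 'right': 1}
word_count_b = {'yeah': 2, 'okay': 4}
merged = dict(Counter(word_count_a) + Counter(word_count_b))
assert merged == {'yeah': 5, 'right': 1, 'okay': 4}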
def main():
    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)
    path = Path(data_path=args.data_path, config_path='./config')

    # HTK settings
    save_config(audio_file_type='wav',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=16000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))

    data_types = ['eval1', 'eval2', 'eval3']
    if bool(args.subset):
        data_types += ['train_subset']
    if bool(args.fullset):
        data_types += ['train_fullset']

    for data_type in data_types:
        wav_paths = path.wav(data_type=data_type)
        save_path = mkdir_join(htk_save_path, data_type)

        with open('./config/wav2htk_' + data_type + '.scp', 'w') as f:
            for wav_path in wav_paths:
                speaker = basename(wav_path).split('.')[0]
                save_path_tmp = join(save_path, speaker + '.htk')
                f.write(wav_path + ' ' + save_path_tmp + '\n')
def check(self):
    read_trans(
        label_paths=label_paths,
        word_boundary_paths=wb_paths,
        run_root_path='../',
        vocab_file_save_path=mkdir_join('../config/vocab_files'),
        save_vocab_file=True)
def test(self):
    self.speaker_dict = read_trans(
        label_paths=label_paths,
        word_boundary_paths=wb_paths,
        run_root_path='../',
        vocab_file_save_path=mkdir_join('../config/vocab_files'),
        save_vocab_file=False)

    self.check(normalize='global', tool='htk')
    self.check(normalize='speaker', tool='htk')
    self.check(normalize='utterance', tool='htk')
def check(self):
    for data_type in ['train', 'dev', 'test']:
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if data_type == 'test' else False

        print('---------- %s ----------' % data_type)
        trans_dict = read_phone(
            label_paths=label_paths[data_type],
            vocab_file_save_path=mkdir_join('../config/vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test)
        print(trans_dict)
def posterior_test(session, posteriors_op, network, dataset, label_type,
                   rate=1.0):
    """Visualize label posteriors.
    Args:
        session: session of training model
        posteriors_op: operation for computing posteriors
        network: network to evaluate
        dataset: Dataset class
        label_type: phone39 or phone48 or phone61 or character
        rate: rate of evaluation data to use
    """
    save_path = mkdir_join(network.model_dir, 'ctc_output')

    batch_size = 1
    num_examples = dataset.data_num * rate
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1

    for step in range(iteration):
        # Create feed dictionary for next mini batch
        inputs, _, seq_len, input_names = dataset.next_batch(
            batch_size=batch_size)
        feed_dict = {
            network.inputs_pl: inputs,
            network.seq_len_pl: seq_len,
            network.keep_prob_input_pl: 1.0,
            network.keep_prob_hidden_pl: 1.0
        }

        # Visualize
        batch_size_each = len(seq_len)
        max_frame_num = inputs.shape[1]
        posteriors = session.run(posteriors_op, feed_dict=feed_dict)
        for i_batch in range(batch_size_each):
            posteriors_index = np.array(
                [i_batch + (batch_size_each * j) for j in range(max_frame_num)])
            if label_type != 'character':
                probs.plot_probs_ctc_phone(
                    probs=posteriors[posteriors_index][:int(seq_len[i_batch]), :],
                    save_path=save_path,
                    wav_index=input_names[i_batch],
                    data_type=dataset.data_type,
                    label_type=label_type)
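# The manual remainder check above is ceiling division written out by hand.
# An equivalent, more idiomatic form (a sketch for reference, not a change
# to the original function):
import math

num_examples = 1003
batch_size = 1
iteration = math.ceil(num_examples / batch_size)  # == 1003
# e.g. math.ceil(10 / 3) == 4, where a bare int(10 / 3) == 3 would drop the
# final partial batch.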
def check(self, data_size):
    print('=' * 50)
    print(' data_size: %s' % str(data_size))
    print('=' * 50)

    for data_type in ['train', 'dev_clean', 'dev_other', 'test_clean',
                      'test_other']:
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'test' in data_type else False

        print('---------- %s ----------' % data_type)
        read_trans(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('../config/vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)
def check(self, data_size):
    print('=' * 50)
    print(' data_size: %s' % str(data_size))
    print('=' * 50)

    for data_type in ['dev', 'eval1', 'eval2', 'eval3']:
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'eval' in data_type else False

        print('---------- %s ----------' % data_type)
        read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('../config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)
def main():
    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)

    # HTK settings
    save_config(audio_file_type='wav',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=16000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))
    # NOTE: 120-dim features are extracted by default

    parts = ['train-clean-100', 'dev-clean', 'dev-other',
             'test-clean', 'test-other']
    if bool(args.large):
        parts += ['train-clean-360', 'train-other-500']
    elif bool(args.medium):
        parts += ['train-clean-360']

    for part in parts:
        # part/speaker/book/*.wav
        wav_paths = [p for p in glob(join(args.data_path, part, '*/*/*.wav'))]

        with open('./config/wav2htk_' + part + '.scp', 'w') as f:
            for wav_path in wav_paths:
                # ex.) wav_path: speaker/book/speaker-book-utt_index.wav
                speaker, book, utt_index = basename(
                    wav_path).split('.')[0].split('-')
                save_path = mkdir_join(
                    htk_save_path, part, speaker, book,
                    basename(wav_path).split('.')[0] + '.htk')
                f.write(wav_path + ' ' + save_path + '\n')
def split_wav(wav_paths, save_path, speaker_dict):
    """Read WAV files & divide them with respect to each utterance.
    Args:
        wav_paths (list): paths to WAV files
        save_path (string): path to save WAV files
        speaker_dict (dict): the dictionary of utterances of each speaker
            key => speaker
            value => the dictionary of utterance information of each speaker
                key => utterance index
                value => [start_frame, end_frame, transcript]
    """
    # Read each WAV file
    print('==> Reading WAV files...')
    for wav_path in tqdm(wav_paths):
        speaker = basename(wav_path).split('.')[0]

        # NOTE: For Switchboard
        speaker = speaker.replace('sw0', 'sw')
        speaker = speaker.replace('sw_', 'sw')
        speaker = speaker.replace('en_', 'en')

        if 'subject' in speaker:
            speaker = '_'.join(speaker.split('_')[:2]) + '_U'
        elif 'operator' in speaker:
            speaker = '_'.join(speaker.split('_')[:2]) + '_S'

        utt_dict = speaker_dict[speaker]
        wav_utt_save_path = mkdir_join(save_path, speaker)

        # Read a wav file
        audio = Audio(file_path=wav_path)
        audio_data = audio.read()

        # Split per utterance & save as wav files
        audio.split(audio_data, utt_dict, speaker,
                    save_path=wav_utt_save_path)

    # Save the frame number dictionary
    with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
        pickle.dump(audio.frame_num_dict, f)
def main(data_size):
    speaker_dict_dict = {}  # dict of speaker_dict
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'eval' in data_type else False

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path,
                                     args.save_format, data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train_' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)
                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

        ########################################
        # dataset (csv)
        ########################################
        print('\n=> Saving dataset files...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)

        df_columns = ['frame_num', 'input_path', 'transcript']
        df_kanji = pd.DataFrame([], columns=df_columns)
        df_kanji_divide = pd.DataFrame([], columns=df_columns)
        df_kana = pd.DataFrame([], columns=df_columns)
        df_kana_divide = pd.DataFrame([], columns=df_columns)
        df_phone = pd.DataFrame([], columns=df_columns)
        df_phone_divide = pd.DataFrame([], columns=df_columns)
        df_word_freq1 = pd.DataFrame([], columns=df_columns)
        df_word_freq5 = pd.DataFrame([], columns=df_columns)
        df_word_freq10 = pd.DataFrame([], columns=df_columns)
        df_word_freq15 = pd.DataFrame([], columns=df_columns)

        with open(join(input_save_path, data_type, 'frame_num.pickle'),
                  'rb') as f:
            frame_num_dict = pickle.load(f)

        utt_count = 0
        df_kanji_list, df_kanji_divide_list = [], []
        df_kana_list, df_kana_divide_list = [], []
        df_phone_list, df_phone_divide_list = [], []
        df_word_freq1_list, df_word_freq5_list = [], []
        df_word_freq10_list, df_word_freq15_list = [], []
        speaker_dict = speaker_dict_dict[data_type]
        for speaker, utt_dict in tqdm(speaker_dict.items()):
            for utt_index, utt_info in utt_dict.items():
                kanji_indices, kanji_divide_indices = utt_info[2:4]
                kana_indices, kana_divide_indices = utt_info[4:6]
                phone_indices, phone_divide_indices = utt_info[6:8]
                word_freq1_indices, word_freq5_indices = utt_info[8:10]
                word_freq10_indices, word_freq15_indices = utt_info[10:12]

                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.npy')
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.htk')
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                else:
                    raise ValueError('save_format is numpy or htk or wav.')

                assert isfile(input_utt_save_path)
                frame_num = frame_num_dict[speaker + '_' + utt_index]

                df_kanji = add_element(
                    df_kanji, [frame_num, input_utt_save_path, kanji_indices])
                df_kanji_divide = add_element(
                    df_kanji_divide,
                    [frame_num, input_utt_save_path, kanji_divide_indices])
                df_kana = add_element(
                    df_kana, [frame_num, input_utt_save_path, kana_indices])
                df_kana_divide = add_element(
                    df_kana_divide,
                    [frame_num, input_utt_save_path, kana_divide_indices])
                df_phone = add_element(
                    df_phone, [frame_num, input_utt_save_path, phone_indices])
                df_phone_divide = add_element(
                    df_phone_divide,
                    [frame_num, input_utt_save_path, phone_divide_indices])
                df_word_freq1 = add_element(
                    df_word_freq1,
                    [frame_num, input_utt_save_path, word_freq1_indices])
                df_word_freq5 = add_element(
                    df_word_freq5,
                    [frame_num, input_utt_save_path, word_freq5_indices])
                df_word_freq10 = add_element(
                    df_word_freq10,
                    [frame_num, input_utt_save_path, word_freq10_indices])
                df_word_freq15 = add_element(
                    df_word_freq15,
                    [frame_num, input_utt_save_path, word_freq15_indices])
                utt_count += 1

                # Reset
                if utt_count == 10000:
                    df_kanji_list.append(df_kanji)
                    df_kanji_divide_list.append(df_kanji_divide)
                    df_kana_list.append(df_kana)
                    df_kana_divide_list.append(df_kana_divide)
                    df_phone_list.append(df_phone)
                    df_phone_divide_list.append(df_phone_divide)
                    df_word_freq1_list.append(df_word_freq1)
                    df_word_freq5_list.append(df_word_freq5)
                    df_word_freq10_list.append(df_word_freq10)
                    df_word_freq15_list.append(df_word_freq15)

                    df_kanji = pd.DataFrame([], columns=df_columns)
                    df_kanji_divide = pd.DataFrame([], columns=df_columns)
                    df_kana = pd.DataFrame([], columns=df_columns)
                    df_kana_divide = pd.DataFrame([], columns=df_columns)
                    df_phone = pd.DataFrame([], columns=df_columns)
                    df_phone_divide = pd.DataFrame([], columns=df_columns)
                    df_word_freq1 = pd.DataFrame([], columns=df_columns)
                    df_word_freq5 = pd.DataFrame([], columns=df_columns)
                    df_word_freq10 = pd.DataFrame([], columns=df_columns)
                    df_word_freq15 = pd.DataFrame([], columns=df_columns)
                    utt_count = 0

        # Last dataframe
        df_kanji_list.append(df_kanji)
        df_kanji_divide_list.append(df_kanji_divide)
        df_kana_list.append(df_kana)
        df_kana_divide_list.append(df_kana_divide)
        df_phone_list.append(df_phone)
        df_phone_divide_list.append(df_phone_divide)
        df_word_freq1_list.append(df_word_freq1)
        df_word_freq5_list.append(df_word_freq5)
        df_word_freq10_list.append(df_word_freq10)
        df_word_freq15_list.append(df_word_freq15)

        # Concatenate all dataframes
        df_kanji = df_kanji_list[0]
        df_kanji_divide = df_kanji_divide_list[0]
        df_kana = df_kana_list[0]
        df_kana_divide = df_kana_divide_list[0]
        df_phone = df_phone_list[0]
        df_phone_divide = df_phone_divide_list[0]
        df_word_freq1 = df_word_freq1_list[0]
        df_word_freq5 = df_word_freq5_list[0]
        df_word_freq10 = df_word_freq10_list[0]
        df_word_freq15 = df_word_freq15_list[0]

        for df_i in df_kanji_list[1:]:
            df_kanji = pd.concat([df_kanji, df_i], axis=0)
        for df_i in df_kanji_divide_list[1:]:
            df_kanji_divide = pd.concat([df_kanji_divide, df_i], axis=0)
        for df_i in df_kana_list[1:]:
            df_kana = pd.concat([df_kana, df_i], axis=0)
        for df_i in df_kana_divide_list[1:]:
            df_kana_divide = pd.concat([df_kana_divide, df_i], axis=0)
        for df_i in df_phone_list[1:]:
            df_phone = pd.concat([df_phone, df_i], axis=0)
        for df_i in df_phone_divide_list[1:]:
            df_phone_divide = pd.concat([df_phone_divide, df_i], axis=0)
        for df_i in df_word_freq1_list[1:]:
            df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0)
        for df_i in df_word_freq5_list[1:]:
            df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0)
        for df_i in df_word_freq10_list[1:]:
            df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0)
        for df_i in df_word_freq15_list[1:]:
            df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'kanji.csv'))
        df_kanji_divide.to_csv(join(dataset_save_path, 'kanji_divide.csv'))
        df_kana.to_csv(join(dataset_save_path, 'kana.csv'))
        df_kana_divide.to_csv(join(dataset_save_path, 'kana_divide.csv'))
        df_phone.to_csv(join(dataset_save_path, 'phone.csv'))
        df_phone_divide.to_csv(join(dataset_save_path, 'phone_divide.csv'))
        df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv'))
        df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv'))
        df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv'))
        df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
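# Why the 10,000-utterance chunking above: appending one row at a time copies
# the whole DataFrame on every call, so building the CSVs row-by-row is
# quadratic in the number of utterances. Collecting rows and concatenating
# once is linear. The same idea in miniature, with toy data rather than
# corpus output:
import pandas as pd

rows = []
for i in range(5):
    rows.append({'frame_num': 100 + i,
                 'input_path': 'utt%d.npy' % i,
                 'transcript': '1 2 3'})
# One allocation instead of five incremental appends
df = pd.DataFrame(rows, columns=['frame_num', 'input_path', 'transcript'])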
def main():
    for data_type in ['train', 'dev', 'test']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(args.feature_save_path,
                                         args.save_format)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if args.tool == 'htk':
                    audio_paths = path.htk(data_type=data_type)
                else:
                    audio_paths = path.wav(data_type=data_type)

                if data_type != 'train':
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))
                else:
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None

                # Read htk or wav files, and save input data and frame num dict
                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # timit/feature/save_format/data_type/*.npy

                # Make a confirmation file to prove that dataset was saved
                # correctly
                with open(join(input_save_path, data_type, 'complete.txt'),
                          'w') as f:
                    f.write('')

        ########################################
        # labels (character)
        ########################################
        print('\n=> Processing transcripts (char)...')
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if data_type == 'test' else False
        trans_dict = read_char(
            label_paths=path.trans(data_type=data_type),
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test)

        ########################################
        # dataset (character, csv)
        ########################################
        print('\n=> Saving dataset files (char)...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_type)
        df_char = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_char_capital = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        for utt_name, [char_indices, char_indices_capital] in tqdm(
                trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type,
                                           speaker, utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type,
                                           speaker, utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError('save_format is numpy or htk or wav.')
            frame_num = input_utt.shape[0]
            del input_utt

            series_char = pd.Series(
                [frame_num, input_utt_save_path, char_indices],
                index=df_char.columns)
            series_char_capital = pd.Series(
                [frame_num, input_utt_save_path, char_indices_capital],
                index=df_char_capital.columns)

            df_char = df_char.append(series_char, ignore_index=True)
            df_char_capital = df_char_capital.append(series_char_capital,
                                                     ignore_index=True)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(
            join(dataset_save_path, 'character_capital_divide.csv'))

        ########################################
        # labels (phone)
        ########################################
        print('\n=> Processing transcripts (phone)...')
        trans_dict = read_phone(
            label_paths=path.phone(data_type=data_type),
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test)

        ########################################
        # dataset (phone, csv)
        ########################################
        print('\n=> Saving dataset files (phone)...')
        df_phone61 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone48 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone39 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        for utt_name, [phone61_indices, phone48_indices,
                       phone39_indices] in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type,
                                           speaker, utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type,
                                           speaker, utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError('save_format is numpy or htk or wav.')
            frame_num = input_utt.shape[0]
            del input_utt

            series_phone61 = pd.Series(
                [frame_num, input_utt_save_path, phone61_indices],
                index=df_phone61.columns)
            series_phone48 = pd.Series(
                [frame_num, input_utt_save_path, phone48_indices],
                index=df_phone48.columns)
            series_phone39 = pd.Series(
                [frame_num, input_utt_save_path, phone39_indices],
                index=df_phone39.columns)

            df_phone61 = df_phone61.append(series_phone61, ignore_index=True)
            df_phone48 = df_phone48.append(series_phone48, ignore_index=True)
            df_phone39 = df_phone39.append(series_phone39, ignore_index=True)

        df_phone61.to_csv(join(dataset_save_path, 'phone61.csv'))
        df_phone48.to_csv(join(dataset_save_path, 'phone48.csv'))
        df_phone39.to_csv(join(dataset_save_path, 'phone39.csv'))
def read_trans(label_paths, word_boundary_paths, run_root_path,
               vocab_file_save_path, save_vocab_file=False,
               speaker_dict_fisher=None, char_set=None,
               char_capital_set=None, word_count_dict=None):
    """Read transcripts (*_trans.txt) & save files (.npy).
    Args:
        label_paths (list): list of paths to label files
        word_boundary_paths (list): list of paths to word boundary files
        run_root_path (string):
        vocab_file_save_path (string): path to vocabulary files
        save_vocab_file (bool, optional): if True, save vocabulary files
        speaker_dict_fisher (dict):
        char_set (set):
        char_capital_set (set):
        word_count_dict (dict):
    Returns:
        speaker_dict: dictionary of speakers
            key (string) => speaker
            value (dict) => dictionary of utterance information of each speaker
                key (string) => utterance index
                value (list) => [start_frame, end_frame, char_indices,
                                 char_indices_capital, word_freq1_indices,
                                 word_freq5_indices, word_freq10_indices,
                                 word_freq15_indices]
    """
    print('=====> Processing target labels...')
    merge_with_fisher = True if speaker_dict_fisher is not None else False

    if merge_with_fisher:
        speaker_dict = speaker_dict_fisher
        vocab_set = set([])
        for word in word_count_dict.keys():
            vocab_set.add(word)
    else:
        speaker_dict = OrderedDict()
        char_set, char_capital_set = set([]), set([])
        word_count_dict = {}
        vocab_set = set([])

    for label_path, wb_path in zip(tqdm(label_paths), word_boundary_paths):
        assert label_path == wb_path.replace('word', 'trans')
        utterance_dict = OrderedDict()
        segmentation_dict = read_segmentation(wb_path)
        with open(label_path, 'r') as f:
            for line in f:
                line = line.strip().lower().split(' ')
                speaker = line[0].split('-')[0]
                # Fix speaker name
                speaker = speaker.replace('sw0', 'sw').replace(
                    'a', '-A').replace('b', '-B')
                utt_index = line[0].split('-')[-1]
                start_frame = int(float(line[1]) * 100 + 0.05)
                end_frame = int(float(line[2]) * 100 + 0.05)
                transcript = ' '.join(line[3:])

                if transcript == '[silence]':
                    continue

                # Divide into short utterances
                length_threshold = 700
                if end_frame - start_frame >= length_threshold:
                    word_info_list = segmentation_dict[utt_index]
                    divide_points = []
                    divided_trans = []
                    partial_word_list = []
                    start_frame_tmp = start_frame
                    for i, word_info in enumerate(word_info_list):
                        if word_info[2] != '':
                            partial_word_list.append(word_info[2])
                        if 0 < i < len(word_info_list) - 1 and \
                                word_info[2] == '' and \
                                word_info[1] - start_frame_tmp >= length_threshold:
                            divide_points.append(
                                int((word_info[1] + word_info[0]) / 2))
                            divided_trans.append(' '.join(partial_word_list))
                            partial_word_list = []
                            start_frame_tmp = word_info[0]
                    # Last segment
                    if len(partial_word_list) > 0:
                        divided_trans.append(' '.join(partial_word_list))

                    if len(divide_points) > 0:
                        transcript_list = divided_trans
                    else:
                        transcript_list = [transcript]
                else:
                    divide_points = []
                    transcript_list = [transcript]

                for i_trans, trans in enumerate(transcript_list):
                    # Clean transcript
                    trans = fix_transcript(trans)

                    # Convert space to "_"
                    trans = re.sub(r'\s', SPACE, trans)

                    # Skip silence, laughter, noise, vocalized-noise
                    if trans.replace(NOISE, '').replace(LAUGHTER, '').replace(
                            VOCALIZED_NOISE, '').replace(SPACE, '') == '':
                        continue

                    # Remove the first and last space
                    if trans[0] == SPACE:
                        trans = trans[1:]
                    if trans[-1] == SPACE:
                        trans = trans[:-1]

                    # Count words
                    for word in trans.split(SPACE):
                        vocab_set.add(word)
                        if word not in word_count_dict.keys():
                            word_count_dict[word] = 0
                        word_count_dict[word] += 1

                    # Capital-divided
                    trans_capital = ''
                    for word in trans.split(SPACE):
                        if len(word) == 1:
                            char_capital_set.add(word)
                            trans_capital += word
                        else:
                            # Replace the first character with the capital
                            # letter
                            word = word[0].upper() + word[1:]

                            # Check double-letters
                            for i in range(0, len(word) - 1, 1):
                                if word[i:i + 2] in DOUBLE_LETTERS:
                                    char_capital_set.add(word[i:i + 2])
                                else:
                                    char_capital_set.add(word[i])
                            trans_capital += word

                    for c in list(trans):
                        char_set.add(c)

                    if len(transcript_list) == 1:
                        utterance_dict[utt_index.zfill(4)] = [
                            start_frame, end_frame, trans]
                    else:
                        assert len(transcript_list) - 1 == len(divide_points)
                        if i_trans == 0:
                            assert start_frame < divide_points[i_trans] - 1
                            utterance_dict[utt_index.zfill(4) + '-' +
                                           str(i_trans + 1)] = [
                                start_frame, divide_points[0] - 1, trans]
                        elif i_trans == len(transcript_list) - 1:
                            assert start_frame < end_frame
                            utterance_dict[utt_index.zfill(4) + '-' +
                                           str(i_trans + 1)] = [
                                divide_points[-1], end_frame, trans]
                        else:
                            assert divide_points[i_trans - 1] < \
                                divide_points[i_trans] - 1
                            utterance_dict[utt_index.zfill(4) + '-' +
                                           str(i_trans + 1)] = [
                                divide_points[i_trans - 1],
                                divide_points[i_trans] - 1, trans]

                    # for debug
                    # print(transcript_original)
                    # print(trans)
                    # print(trans_capital)

            speaker_dict[speaker] = utterance_dict

    # Make vocabulary files
    data_size = '2000h' if merge_with_fisher else '300h'
    char_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'character_' + data_size + '.txt')
    char_capital_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'character_capital_divide_' + data_size + '.txt')
    word_freq1_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq1_' + data_size + '.txt')
    word_freq5_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq5_' + data_size + '.txt')
    word_freq10_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq10_' + data_size + '.txt')
    word_freq15_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq15_' + data_size + '.txt')

    # Reserve some indices
    for mark in [SPACE, HYPHEN, APOSTROPHE, LAUGHTER, NOISE, VOCALIZED_NOISE]:
        for c in list(mark):
            char_set.discard(c)
    for mark in [SPACE, HYPHEN, APOSTROPHE]:
        for c in list(mark):
            char_capital_set.discard(c)

    # for debug
    # print(sorted(list(char_set)))
    # print(sorted(list(char_capital_set)))

    if save_vocab_file:
        # character-level
        with open(char_vocab_file_path, 'w') as f:
            char_list = sorted(list(char_set)) + \
                [SPACE, APOSTROPHE, HYPHEN, LAUGHTER, NOISE, VOCALIZED_NOISE]
            for char in char_list:
                f.write('%s\n' % char)

        # character-level (capital-divided)
        with open(char_capital_vocab_file_path, 'w') as f:
            char_capital_list = sorted(list(char_capital_set)) + \
                [APOSTROPHE, HYPHEN, LAUGHTER, NOISE, VOCALIZED_NOISE]
            for char in char_capital_list:
                f.write('%s\n' % char)

        # word-level (threshold == 1)
        with open(word_freq1_vocab_file_path, 'w') as f:
            vocab_list = sorted(list(vocab_set)) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 5)
        with open(word_freq5_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word for word, freq in list(word_count_dict.items())
                if freq >= 5]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 10)
        with open(word_freq10_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word for word, freq in list(word_count_dict.items())
                if freq >= 10]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 15)
        with open(word_freq15_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word for word, freq in list(word_count_dict.items())
                if freq >= 15]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

    # Tokenize
    print('=====> Tokenize...')
    char2idx = Char2idx(char_vocab_file_path, double_letter=True)
    char2idx_capital = Char2idx(char_capital_vocab_file_path,
                                capital_divide=True)
    word2idx_freq1 = Word2idx(word_freq1_vocab_file_path)
    word2idx_freq5 = Word2idx(word_freq5_vocab_file_path)
    word2idx_freq10 = Word2idx(word_freq10_vocab_file_path)
    word2idx_freq15 = Word2idx(word_freq15_vocab_file_path)

    for speaker, utt_dict in tqdm(speaker_dict.items()):
        for utt_index, [start_frame, end_frame,
                        transcript] in utt_dict.items():
            char_indices = char2idx(transcript)
            char_indices_capital = char2idx_capital(transcript)
            word_freq1_indices = word2idx_freq1(transcript)
            word_freq5_indices = word2idx_freq5(transcript)
            word_freq10_indices = word2idx_freq10(transcript)
            word_freq15_indices = word2idx_freq15(transcript)

            char_indices = ' '.join(list(map(str, char_indices.tolist())))
            char_indices_capital = ' '.join(
                list(map(str, char_indices_capital.tolist())))
            word_freq1_indices = ' '.join(
                list(map(str, word_freq1_indices.tolist())))
            word_freq5_indices = ' '.join(
                list(map(str, word_freq5_indices.tolist())))
            word_freq10_indices = ' '.join(
                list(map(str, word_freq10_indices.tolist())))
            word_freq15_indices = ' '.join(
                list(map(str, word_freq15_indices.tolist())))

            utt_dict[utt_index] = [
                start_frame, end_frame, char_indices, char_indices_capital,
                word_freq1_indices, word_freq5_indices,
                word_freq10_indices, word_freq15_indices]
        speaker_dict[speaker] = utt_dict

    return speaker_dict
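# A sketch of what the Word2idx lookup above presumably does with the
# frequency-thresholded vocabulary files: in-vocabulary words map to their
# line index and everything else falls back to the OOV symbol, which the
# writers above always place on the last line. The class name is taken from
# the code above, but this implementation is an assumption, not the real one
# (the real Word2idx returns a NumPy array, hence the .tolist() calls):
class Word2idxSketch(object):
    def __init__(self, vocab_file_path):
        with open(vocab_file_path, 'r') as f:
            self.word2idx = {w.strip(): i for i, w in enumerate(f)}
        # The OOV symbol is written as the last vocabulary entry
        self.oov_idx = len(self.word2idx) - 1

    def __call__(self, transcript, space='_'):
        return [self.word2idx.get(w, self.oov_idx)
                for w in transcript.split(space)]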
def main(config_path):
    # Read a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f, Loader=yaml.SafeLoader)
    corpus = config['corpus']
    feature = config['feature']
    param = config['param']

    # TODO: Solve conflict (batch_norm & layer norm)

    if corpus['label_type'] == 'phone61':
        output_size = 61
    elif corpus['label_type'] == 'phone48':
        output_size = 48
    elif corpus['label_type'] == 'phone39':
        output_size = 39
    elif corpus['label_type'] == 'character':
        output_size = 30

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(batch_size=param['batch_size'],
                       input_size=feature['input_size'] * feature['num_stack'],
                       num_cell=param['num_cell'],
                       num_layer=param['num_layer'],
                       output_size=output_size,
                       clip_gradients=param['clip_grad'],
                       clip_activation=param['clip_activation'],
                       dropout_ratio_input=param['dropout_input'],
                       dropout_ratio_hidden=param['dropout_hidden'],
                       num_proj=param['num_proj'],
                       weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_cell'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/timit/ctc/')
    network.model_dir = mkdir_join(network.model_dir, corpus['label_type'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory
    if not isfile(join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError('File exists.')

    # Set process name
    setproctitle('ctc_timit_' + corpus['label_type'] + '_' +
                 param['optimizer'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    sys.stdout = open(join(network.model_dir, 'train.log'), 'w')
    print(network.model_name)
    do_train(network=network,
             optimizer=param['optimizer'],
             learning_rate=param['learning_rate'],
             batch_size=param['batch_size'],
             epoch_num=param['num_epoch'],
             label_type=corpus['label_type'],
             num_stack=feature['num_stack'],
             num_skip=feature['num_skip'])
    sys.stdout = sys.__stdout__
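# The keys main() reads from config.yml imply a structure like the dict
# below. This is reconstructed purely from the lookups in the function above;
# the concrete values are illustrative placeholders, not settings from any
# real experiment:
example_config = {
    'model_name': 'lstm',
    'corpus': {'label_type': 'phone39'},
    'feature': {'input_size': 41, 'num_stack': 3, 'num_skip': 3},
    'param': {
        'batch_size': 32, 'num_cell': 256, 'num_layer': 5,
        'optimizer': 'adam', 'learning_rate': 1e-3, 'num_epoch': 30,
        'clip_grad': 5.0, 'clip_activation': 50.0,
        'dropout_input': 0.9, 'dropout_hidden': 0.9,
        'num_proj': 0, 'weight_decay': 0.0,
    },
}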
def main(data_size):
    speaker_dict_dict = {}  # dict of speaker_dict
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'eval' in data_type else False

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path,
                                     args.save_format, data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train_' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)
                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

    ########################################
    # dataset (csv)
    ########################################
    print('\n=> Saving dataset files...')
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)
        print('---------- %s ----------' % data_type)

        df_kanji = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_kana = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        utt_count = 0
        df_kanji_list, df_kana_list, df_phone_list = [], [], []
        for speaker, utt_dict in tqdm(speaker_dict_dict[data_type].items()):
            for utt_index, utt_info in utt_dict.items():
                trans_kanji, trans_kana, trans_phone = utt_info[2:]

                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.npy')
                    assert isfile(input_utt_save_path)
                    input_utt = np.load(input_utt_save_path)
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.htk')
                    assert isfile(input_utt_save_path)
                    input_utt, _, _ = read(input_utt_save_path)
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                    assert isfile(input_utt_save_path)
                    input_utt = w2f_psf(input_utt_save_path,
                                        feature_type=CONFIG['feature_type'],
                                        feature_dim=CONFIG['channels'],
                                        use_energy=CONFIG['energy'],
                                        use_delta1=CONFIG['delta'],
                                        use_delta2=CONFIG['deltadelta'],
                                        window=CONFIG['window'],
                                        slide=CONFIG['slide'])
                else:
                    raise ValueError('save_format is numpy or htk or wav.')
                frame_num = input_utt.shape[0]
                del input_utt

                series_kanji = pd.Series(
                    [frame_num, input_utt_save_path, trans_kanji],
                    index=df_kanji.columns)
                series_kana = pd.Series(
                    [frame_num, input_utt_save_path, trans_kana],
                    index=df_kana.columns)
                series_phone = pd.Series(
                    [frame_num, input_utt_save_path, trans_phone],
                    index=df_phone.columns)

                df_kanji = df_kanji.append(series_kanji, ignore_index=True)
                df_kana = df_kana.append(series_kana, ignore_index=True)
                df_phone = df_phone.append(series_phone, ignore_index=True)
                utt_count += 1

                # Reset
                if utt_count == 50000:
                    df_kanji_list.append(df_kanji)
                    df_kana_list.append(df_kana)
                    df_phone_list.append(df_phone)

                    df_kanji = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_kana = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_phone = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    utt_count = 0

        # Last dataframe
        df_kanji_list.append(df_kanji)
        df_kana_list.append(df_kana)
        df_phone_list.append(df_phone)

        # Concatenate all dataframes
        df_kanji = df_kanji_list[0]
        df_kana = df_kana_list[0]
        df_phone = df_phone_list[0]
        for df_i in df_kanji_list[1:]:
            df_kanji = pd.concat([df_kanji, df_i], axis=0)
        for df_i in df_kana_list[1:]:
            df_kana = pd.concat([df_kana, df_i], axis=0)
        for df_i in df_phone_list[1:]:
            df_phone = pd.concat([df_phone, df_i], axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'dataset_kanji.csv'))
        df_kana.to_csv(join(dataset_save_path, 'dataset_kana.csv'))
        df_phone.to_csv(join(dataset_save_path, 'dataset_phone.csv'))

        # Use the first 4000 utterances as the dev set
        if data_type == 'train':
            df_kanji[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format,
                           data_size, 'dev', 'dataset_kanji.csv'))
            df_kana[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format,
                           data_size, 'dev', 'dataset_kana.csv'))
            df_phone[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format,
                           data_size, 'dev', 'dataset_phone.csv'))
def read_audio(audio_paths, tool, config, normalize, is_training,
               speaker_gender_dict, save_path=None, save_format=None,
               global_mean_male=None, global_mean_female=None,
               global_std_male=None, global_std_female=None,
               dtype=np.float32):
    """Read audio files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => normalization will not be conducted
            global => normalize input features by global mean & std over
                      the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        speaker_gender_dict (dict): A dictionary of speakers' gender information
            key (string) => speaker
            value (string) => F or M
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male over
            the training set
        global_std_male (np.ndarray, optional): global standard deviation
            of male over the training set
        global_mean_female (np.ndarray, optional): global mean of female
            over the training set
        global_std_female (np.ndarray, optional): global standard deviation
            of female over the training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male over the
            training set
        global_std_male (np.ndarray): global standard deviation of male
            over the training set
        global_mean_female (np.ndarray): global mean of female over the
            training set
        global_std_female (np.ndarray): global standard deviation of female
            over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_std_male is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError('tool must be "htk" or "python_speech_features"' +
                        ' or "librosa".')

    audio_path_dict = {}
    audio_path_list_male, audio_path_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    # Loop 1: Divide all audio paths into speakers
    print('=====> Reading audio files...')
    for i, audio_path in enumerate(tqdm(audio_paths)):
        # ex.) audio_path: speaker-book-utt_index.***
        speaker, book, utt_index = basename(audio_path).split('.')[0].split('-')
        if speaker not in audio_path_dict.keys():
            audio_path_dict[speaker] = []
        audio_path_dict[speaker].append(audio_path)

        if is_training:
            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])

            input_utt_sum = np.sum(input_utt, axis=0)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt.shape[1]
                global_mean_male = np.zeros((feature_dim,), dtype=dtype)
                global_mean_female = np.zeros((feature_dim,), dtype=dtype)
                global_std_male = np.zeros((feature_dim,), dtype=dtype)
                global_std_female = np.zeros((feature_dim,), dtype=dtype)

            # For computing global mean
            if speaker_gender_dict[speaker] == 'M':
                audio_path_list_male.append(input_utt)
                global_mean_male += input_utt_sum
                total_frame_num_male += input_utt.shape[0]
            elif speaker_gender_dict[speaker] == 'F':
                audio_path_list_female.append(input_utt)
                global_mean_female += input_utt_sum
                total_frame_num_female += input_utt.shape[0]
            else:
                raise ValueError('gender is M or F.')

            # For computing speaker mean
            if normalize == 'speaker':
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    # Initialize speaker statistics
                    speaker_mean_dict[speaker] = np.zeros(
                        (feature_dim,), dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros(
                        (feature_dim,), dtype=dtype)
                speaker_mean_dict[speaker] += input_utt_sum
                total_frame_num_dict[speaker] += input_utt.shape[0]

    # Loop 2: Computing global mean and stddev
    if is_training and normalize != 'no':
        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
            if normalize == 'speaker':
                # Compute speaker mean
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

            for audio_path in audio_paths_speaker:
                speaker, book, utt_index = basename(
                    audio_path).split('.')[0].split('-')

                # Read each audio file
                if tool == 'htk':
                    input_utt, sampPeriod, parmKind = read(audio_path)
                elif tool == 'python_speech_features':
                    input_utt = w2f_psf(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])
                elif tool == 'librosa':
                    input_utt = w2f_librosa(
                        audio_path,
                        feature_type=config['feature_type'],
                        feature_dim=config['channels'],
                        use_energy=config['energy'],
                        use_delta1=config['delta'],
                        use_delta2=config['deltadelta'],
                        window=config['window'],
                        slide=config['slide'])

                # For computing global stddev
                if speaker_gender_dict[speaker] == 'M':
                    global_std_male += np.sum(
                        np.abs(input_utt - global_mean_male) ** 2, axis=0)
                elif speaker_gender_dict[speaker] == 'F':
                    global_std_female += np.sum(
                        np.abs(input_utt - global_mean_female) ** 2, axis=0)
                else:
                    raise ValueError('gender is M or F.')

                if normalize == 'speaker':
                    # For computing speaker stddev
                    speaker_std_dict[speaker] += np.sum(
                        np.abs(input_utt - speaker_mean_dict[speaker]) ** 2,
                        axis=0)

            if normalize == 'speaker':
                # Compute speaker stddev
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] /
                    (total_frame_num_dict[speaker] - 1))

        # Compute global stddev per gender
        global_std_male = np.sqrt(
            global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(
            global_std_female / (total_frame_num_female - 1))

        if save_path is not None:
            # Save global mean & std per gender
            np.save(join(save_path, 'global_mean_male.npy'),
                    global_mean_male)
            np.save(join(save_path, 'global_mean_female.npy'),
                    global_mean_female)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_std_female.npy'),
                    global_std_female)

    # Loop 3: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
        for audio_path in audio_paths_speaker:
            speaker, book, utt_index = basename(
                audio_path).split('.')[0].split('-')

            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set per gender
                if speaker_gender_dict[speaker] == 'M':
                    input_utt -= global_mean_male
                    input_utt /= global_std_male
                elif speaker_gender_dict[speaker] == 'F':
                    input_utt -= global_mean_female
                    input_utt /= global_std_female
                else:
                    raise ValueError('gender is M or F.')
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt -= speaker_mean_dict[speaker]
                input_utt /= speaker_std_dict[speaker]
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                input_name = basename(audio_path).split('.')[0]
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(
                        save_path, speaker, input_name + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    write(input_utt,
                          htk_path=mkdir_join(save_path, speaker,
                                              input_name + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format is numpy or htk.')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_mean_female,
            global_std_male, global_std_female, frame_num_dict)
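# The three loops above implement streaming two-pass standardization:
# pass 1 accumulates per-dimension sums for the mean, pass 2 accumulates
# squared deviations for the (Bessel-corrected, n - 1) stddev, and pass 3
# applies (x - mean) / std. The same computation on in-memory toy data, for
# reference:
import numpy as np

utts = [np.random.randn(120, 40), np.random.randn(80, 40)]  # frames x dims
total_frames = sum(u.shape[0] for u in utts)
mean = sum(u.sum(axis=0) for u in utts) / total_frames
var = sum(((u - mean) ** 2).sum(axis=0) for u in utts) / (total_frames - 1)
std = np.sqrt(var)
normalized = [(u - mean) / std for u in utts]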
def read_audio(audio_paths, speaker_dict, tool, config, normalize, is_training, save_path=None, save_format=None, global_mean=None, global_std=None, dtype=np.float32): """Read HTK or WAV files. Args: audio_paths (list): paths to HTK or WAV files speaker_dict (dict): A dictionary of speakers' gender information key (string) => speaker value (dict) => dictionary of utterance information of each speaker key (string) => utterance index value (list) => [start_frame, end_frame, transcript] tool (string): the tool to extract features, htk or librosa or python_speech_features config (dict): a configuration for feature extraction normalize (string): no => normalization will be not conducted global => normalize input features by global mean & std over the training set per gender speaker => normalize input features by mean & std per speaker utterance => normalize input features by mean & std per utterancet data by mean & std per utterance is_training (bool): training or not save_path (string): path to save npy files save_format (string, optional): numpy as htk global_mean (np.ndarray, optional): global mean over the training set global_std (np.ndarray, optional): global standard deviation over the training set dtype (optional): the type of data, default is np.float32 Returns: global_mean (np.ndarray): global mean over the training set global_std (np.ndarray): global standard deviation over the training set frame_num_dict (dict): key => utterance name value => the number of frames """ if not is_training: if global_mean is None or global_std is None: raise ValueError('Set mean & std computed in the training set.') if normalize not in ['global', 'speaker', 'utterance', 'no']: raise ValueError( 'normalize must be "utterance" or "speaker" or "global" or "no".') total_frame_num = 0 total_frame_num_dict = {} speaker_mean_dict = {} # Loop 1: Computing global mean and statistics if is_training and normalize != 'no': print('=====> Reading audio files...') for i, audio_path in enumerate(tqdm(audio_paths)): speaker = basename(audio_path).split('.')[0] # Fix speaker name speaker = speaker.replace('sw0', 'sw') # ex.) sw04771-A => sw4771-A (LDC97S62) speaker = speaker.replace('sw_', 'sw') # ex.) sw_4771-A => sw4771-A (eval2000, swbd) speaker = speaker.replace('en_', 'en') # ex.) 
en_4156-A => en4156-A (eval2000, ch) # Divide each audio file into utterances _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment( audio_path, speaker, speaker_dict[speaker], is_training=True, sil_duration=0, tool=tool, config=config) if i == 0: # Initialize global statistics feature_dim = input_utt_sum.shape[0] global_mean = np.zeros((feature_dim, ), dtype=dtype) global_std = np.zeros((feature_dim, ), dtype=dtype) global_mean += input_utt_sum total_frame_num += total_frame_num_speaker # For computing speaker stddev if normalize == 'speaker': speaker_mean_dict[speaker] = speaker_mean total_frame_num_dict[speaker] = total_frame_num_speaker # NOTE: speaker mean is already computed print('=====> Computing global mean & stddev...') # Compute global mean global_mean /= total_frame_num for audio_path in tqdm(audio_paths): speaker = basename(audio_path).split('.')[0] # Normalize speaker name speaker = speaker.replace('sw0', 'sw') speaker = speaker.replace('sw_', 'sw') speaker = speaker.replace('en_', 'en') # Divide each audio into utterances input_data_dict_speaker, _, _, _, _ = segment( audio_path, speaker, speaker_dict[speaker], is_training=True, sil_duration=0, tool=tool, config=config) # For computing global stddev for input_utt in input_data_dict_speaker.values(): global_std += np.sum(np.abs(input_utt - global_mean)**2, axis=0) # Compute global stddev global_std = np.sqrt(global_std / (total_frame_num - 1)) if save_path is not None: # Save global mean & std per gender np.save(join(save_path, 'global_mean.npy'), global_mean) np.save(join(save_path, 'global_std.npy'), global_std) # Loop 2: Normalization and Saving print('=====> Normalization...') frame_num_dict = {} sampPeriod, parmKind = None, None for audio_path in tqdm(audio_paths): speaker = basename(audio_path).split('.')[0] # Normalize speaker name speaker = speaker.replace('sw0', 'sw') speaker = speaker.replace('sw_', 'sw') speaker = speaker.replace('en_', 'en') if normalize == 'speaker' and is_training: speaker_mean = speaker_mean_dict[speaker] else: speaker_mean = None # Divide each audio into utterances input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment( audio_path, speaker, speaker_dict[speaker], is_training=is_training, sil_duration=0, tool=tool, config=config, mean=speaker_mean) # for compute speaker sttdev # NOTE: input_data_dict_speaker have been not normalized yet for utt_index, input_utt in input_data_dict_speaker.items(): if normalize == 'no': pass elif normalize == 'global' or not is_training: # Normalize by mean & std over the training set input_utt -= global_mean input_utt /= global_std elif normalize == 'speaker': # Normalize by mean & std per speaker input_utt = (input_utt - speaker_mean) / speaker_std elif normalize == 'utterance': # Normalize by mean & std per utterance utt_mean = np.mean(input_utt, axis=0, dtype=dtype) utt_std = np.std(input_utt, axis=0, dtype=dtype) input_utt = (input_utt - utt_mean) / utt_std else: ValueError frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0] if save_path is not None: # Save input features if save_format == 'numpy': input_data_save_path = mkdir_join( save_path, speaker, speaker + '_' + utt_index + '.npy') np.save(input_data_save_path, input_utt) elif save_format == 'htk': if sampPeriod is None: _, sampPeriod, parmKind = read(audio_path) write(input_utt, htk_path=mkdir_join( save_path, speaker, speaker + '_' + utt_index + '.htk'), sampPeriod=sampPeriod, parmKind=parmKind) else: raise ValueError('save_format is numpy or htk.') if save_path 
is not None: # Save the frame number dictionary with open(join(save_path, 'frame_num.pickle'), 'wb') as f: pickle.dump(frame_num_dict, f) return global_mean, global_std, frame_num_dict
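# --- Illustrative sketch (not from this repo): the two-pass statistics used
# above, reduced to plain NumPy arrays of shape (frames, dims). Pass 1
# accumulates frame sums for the mean; pass 2 accumulates squared deviations
# for the stddev, matching the unbiased (N - 1) denominator above.
import numpy as np


def global_mean_std(utterances, dtype=np.float64):
    total_frames = sum(u.shape[0] for u in utterances)
    mean = np.zeros(utterances[0].shape[1], dtype=dtype)
    for u in utterances:  # pass 1: sum over all frames
        mean += u.sum(axis=0)
    mean /= total_frames
    var = np.zeros_like(mean)
    for u in utterances:  # pass 2: squared deviations from the mean
        var += ((u - mean) ** 2).sum(axis=0)
    return mean, np.sqrt(var / (total_frames - 1))

# ex.) mean, std = global_mean_std([np.ones((100, 3)), np.zeros((50, 3))])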
def read_trans(label_paths, data_size, vocab_file_save_path, is_test=False, save_vocab_file=False, data_type=None): """Read transcript. Args: label_paths (list): list of paths to label files data_size (string): 100h or 460h or 960h vocab_file_save_path (string): path to vocabulary files is_test (bool, optional): if True, compute OOV rate save_vocab_file (bool, optional): if True, save vocabulary files data_type (string, optional): test_clean or test_other Returns: speaker_dict (dict): the dictionary of utterances of each speaker key (string) => speaker value (dict) => key (string) => speaker-book-utt_index value (list) => [char_indices, char_indices_capital, word_freq1_indices, word_freq5_indices, word_freq10_indices, word_freq15_indices] """ print('=====> Reading target labels...') speaker_dict = {} char_set, char_capital_set = set([]), set([]) word_count_dict = {} vocab_set = set([]) for label_path in tqdm(label_paths): speaker = label_path.split('/')[-3] if speaker not in speaker_dict.keys(): speaker_dict[speaker] = {} with open(label_path, 'r') as f: for line in f: line = line.strip().lower().split(' ') utt_name = line[0] # ex.) speaker-book-utt_index transcript = ' '.join(line[1:]) word_list = line[1:] # Count words for word in word_list: vocab_set.add(word) if word not in word_count_dict.keys(): word_count_dict[word] = 0 word_count_dict[word] += 1 # Capital-divided for word in transcript.split(' '): if len(word) == 1: char_capital_set.add(word.upper()) else: # Replace the first character with the capital letter word = word[0].upper() + word[1:] char_capital_set.add(word[0].upper()) # Check double-letters skip_flag = False for i in range(1, len(word) - 1, 1): if skip_flag: skip_flag = False continue if not skip_flag and word[i:i + 2] in DOUBLE_LETTERS: char_capital_set.add(word[i:i + 2]) skip_flag = True else: char_capital_set.add(word[i]) # Final character if not skip_flag: char_capital_set.add(word[-1]) # Convert space to "_" transcript = re.sub(r'\s', SPACE, transcript) for c in list(transcript): char_set.add(c) speaker_dict[speaker][utt_name] = transcript # for debug # print(transcript) # print(transcript_capital_divide) # print('-----') # Make vocabulary files char_vocab_file_path = mkdir_join(vocab_file_save_path, 'character_' + data_size + '.txt') char_capital_vocab_file_path = mkdir_join( vocab_file_save_path, 'character_capital_divide_' + data_size + '.txt') word_freq1_vocab_file_path = mkdir_join(vocab_file_save_path, 'word_freq1_' + data_size + '.txt') word_freq5_vocab_file_path = mkdir_join(vocab_file_save_path, 'word_freq5_' + data_size + '.txt') word_freq10_vocab_file_path = mkdir_join( vocab_file_save_path, 'word_freq10_' + data_size + '.txt') word_freq15_vocab_file_path = mkdir_join( vocab_file_save_path, 'word_freq15_' + data_size + '.txt') # Reserve some indices char_set.discard(SPACE) char_set.discard(APOSTROPHE) char_capital_set.discard(APOSTROPHE) # for debug # print(sorted(list(char_set))) # print(sorted(list(char_capital_set))) if save_vocab_file: # character-level with open(char_vocab_file_path, 'w') as f: char_list = sorted(list(char_set)) + [SPACE, APOSTROPHE] for char in char_list: f.write('%s\n' % char) # character-level (capital-divided) with open(char_capital_vocab_file_path, 'w') as f: char_list = sorted(list(char_capital_set)) + [APOSTROPHE] for char in char_list: f.write('%s\n' % char) # word-level (threshold == 1) with open(word_freq1_vocab_file_path, 'w') as f: vocab_list = sorted(list(vocab_set)) + [OOV] for word in vocab_list: f.write('%s\n' % word) # word-level (threshold == 5) with open(word_freq5_vocab_file_path, 'w') as f:
vocab_list = sorted([ word for word, freq in list(word_count_dict.items()) if freq >= 5 ]) + [OOV] for word in vocab_list: f.write('%s\n' % word) # word-level (threshold == 10) with open(word_freq10_vocab_file_path, 'w') as f: vocab_list = sorted([ word for word, freq in list(word_count_dict.items()) if freq >= 10 ]) + [OOV] for word in vocab_list: f.write('%s\n' % word) # word-level (threshold == 15) with open(word_freq15_vocab_file_path, 'w') as f: vocab_list = sorted([ word for word, freq in list(word_count_dict.items()) if freq >= 15 ]) + [OOV] for word in vocab_list: f.write('%s\n' % word) # Compute OOV rate if is_test: with open( join(vocab_file_save_path, '../oov_rate_' + data_type + '_' + data_size + '.txt'), 'w') as f: # word-level (threshold == 1) oov_rate = compute_oov_rate(speaker_dict, word_freq1_vocab_file_path) f.write('Word (freq1):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # word-level (threshold == 5) oov_rate = compute_oov_rate(speaker_dict, word_freq5_vocab_file_path) f.write('Word (freq5):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # word-level (threshold == 10) oov_rate = compute_oov_rate(speaker_dict, word_freq10_vocab_file_path) f.write('Word (freq10):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # word-level (threshold == 15) oov_rate = compute_oov_rate(speaker_dict, word_freq15_vocab_file_path) f.write('Word (freq15):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # Tokenize print('=====> Tokenize...') char2idx = Char2idx(char_vocab_file_path) char2idx_capital = Char2idx(char_capital_vocab_file_path, capital_divide=True) word2idx_freq1 = Word2idx(word_freq1_vocab_file_path) word2idx_freq5 = Word2idx(word_freq5_vocab_file_path) word2idx_freq10 = Word2idx(word_freq10_vocab_file_path) word2idx_freq15 = Word2idx(word_freq15_vocab_file_path) for speaker, utt_dict in tqdm(speaker_dict.items()): for utt_name, transcript in utt_dict.items(): if is_test: utt_dict[utt_name] = [transcript] * 6 else: char_indices = char2idx(transcript) char_indices_capital = char2idx_capital(transcript) word_freq1_indices = word2idx_freq1(transcript) word_freq5_indices = word2idx_freq5(transcript) word_freq10_indices = word2idx_freq10(transcript) word_freq15_indices = word2idx_freq15(transcript) char_indices = ' '.join(list(map(str, char_indices.tolist()))) char_indices_capital = ' '.join( list(map(str, char_indices_capital.tolist()))) word_freq1_indices = ' '.join( list(map(str, word_freq1_indices.tolist()))) word_freq5_indices = ' '.join( list(map(str, word_freq5_indices.tolist()))) word_freq10_indices = ' '.join( list(map(str, word_freq10_indices.tolist()))) word_freq15_indices = ' '.join( list(map(str, word_freq15_indices.tolist()))) utt_dict[utt_name] = [ char_indices, char_indices_capital, word_freq1_indices, word_freq5_indices, word_freq10_indices, word_freq15_indices ] speaker_dict[speaker] = utt_dict return speaker_dict
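# --- Illustrative sketch (toy data; compute_oov_rate's real implementation
# lives elsewhere in this repo): the word_freqN vocabularies above keep only
# words whose training-set count reaches the threshold, and the OOV rate is
# the percentage of test tokens that fall outside that vocabulary.
from collections import Counter


def build_vocab(train_words, min_freq, oov='OOV'):
    counts = Counter(train_words)
    return sorted(w for w, c in counts.items() if c >= min_freq) + [oov]


def oov_rate(test_words, vocab):
    vocab = set(vocab)
    return 100.0 * sum(w not in vocab for w in test_words) / len(test_words)

# ex.) build_vocab('the cat sat on the mat'.split(), min_freq=2)
#      -> ['the', 'OOV']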
def read_char(label_paths, vocab_file_save_path, save_vocab_file=False, is_test=False): """Read text transcript. Args: label_paths (list): list of paths to label files vocab_file_save_path (string): path to vocabulary files save_vocab_file (bool, optional): if True, save vocabulary files is_test (bool, optional): set True in case of the test set Returns: trans_dict (dict): key (string) => utterance name value (list) => [char_indices, char_indices_capital] """ print('=====> Reading target labels...') trans_dict = {} char_set, char_capital_set = set([]), set([]) for label_path in tqdm(label_paths): with open(label_path, 'r') as f: line = f.readlines()[-1] speaker = label_path.split('/')[-2] utt_index = basename(label_path).split('.')[0] utt_name = speaker + '_' + utt_index # Remove 「"」, 「:」, 「;」, 「!」, 「?」, 「,」, 「.」, 「-」 # Convert to lowercase line = re.sub(r'[\":;!?,.-]+', '', line.strip().lower()) transcript = ' '.join(line.split(' ')[2:]) # Remove double spaces while '  ' in transcript: transcript = re.sub(r'[\s]+', ' ', transcript) # Remove first and last space if transcript[0] == ' ': transcript = transcript[1:] if transcript[-1] == ' ': transcript = transcript[:-1] # Capital-divided for word in transcript.split(' '): if len(word) == 1: char_capital_set.add(word.upper()) else: # Replace the first character with the capital letter word = word[0].upper() + word[1:] char_capital_set.add(word[0].upper()) # Check double-letters skip_flag = False for i in range(1, len(word) - 1, 1): if skip_flag: skip_flag = False continue if not skip_flag and word[i:i + 2] in DOUBLE_LETTERS: char_capital_set.add(word[i:i + 2]) skip_flag = True else: char_capital_set.add(word[i]) # Final character if not skip_flag: char_capital_set.add(word[-1]) # Convert space to "_" transcript = re.sub(r'\s', SPACE, transcript) for c in list(transcript): char_set.add(c) trans_dict[utt_name] = transcript # for debug # print(transcript) # print(trans_char_capital_divide) # Make vocabulary files char_vocab_file_path = mkdir_join(vocab_file_save_path, 'character.txt') char_capital_vocab_file_path = mkdir_join( vocab_file_save_path, 'character_capital_divide.txt') # Reserve some indices char_set.discard(SPACE) char_set.discard(APOSTROPHE) char_capital_set.discard(APOSTROPHE) # for debug # print(sorted(list(char_set))) # print(sorted(list(char_capital_set))) if save_vocab_file: # character-level with open(char_vocab_file_path, 'w') as f: char_list = sorted(list(char_set)) + [SPACE, APOSTROPHE] for char in char_list: f.write('%s\n' % char) # character-level (capital-divided) with open(char_capital_vocab_file_path, 'w') as f: char_capital_list = sorted(list(char_capital_set)) + [APOSTROPHE] for char in char_capital_list: f.write('%s\n' % char) # Tokenize print('=====> Tokenize...') char2idx = Char2idx(char_vocab_file_path) char2idx_capital = Char2idx( char_capital_vocab_file_path, capital_divide=True) for utt_name, transcript in tqdm(trans_dict.items()): if is_test: trans_dict[utt_name] = [transcript, transcript] # NOTE: save as it is else: char_indices = char2idx(transcript) char_indices_capital = char2idx_capital(transcript) char_indices = ' '.join(list(map(str, char_indices.tolist()))) char_indices_capital = ' '.join( list(map(str, char_indices_capital.tolist()))) trans_dict[utt_name] = [char_indices, char_indices_capital] return trans_dict
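# --- Illustrative sketch (TOY_DOUBLE_LETTERS is a stand-in; the real
# DOUBLE_LETTERS set is imported elsewhere in this repo): "capital-divide"
# drops the explicit space token and instead capitalizes the first character
# of each word, keeping known double letters as single units.
TOY_DOUBLE_LETTERS = {'ll', 'ss', 'tt'}


def capital_divide(transcript):
    units = []
    for word in transcript.split(' '):
        word = word[0].upper() + word[1:]
        i = 0
        while i < len(word):
            if word[i:i + 2].lower() in TOY_DOUBLE_LETTERS:
                units.append(word[i:i + 2])
                i += 2
            else:
                units.append(word[i])
                i += 1
    return units

# ex.) capital_divide('hello miss') -> ['H', 'e', 'll', 'o', 'M', 'i', 'ss']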
def read_audio(audio_paths, tool, config, normalize, is_training, save_path=None, save_format=None, global_mean_male=None, global_std_male=None, global_mean_female=None, global_std_female=None, dtype=np.float32): """Read audio files. Args: audio_paths (list): paths to audio files tool (string): the tool to extract features, htk or librosa or python_speech_features config (dict): a configuration for feature extraction normalize (string): no => normalization will not be conducted global => normalize input features by global mean & std over the training set per gender speaker => normalize input features by mean & std per speaker utterance => normalize input features by mean & std per utterance is_training (bool, optional): Set True when processing the training set save_path (string): path to save npy files save_format (string, optional): numpy or htk global_mean_male (np.ndarray, optional): global mean of male over the training set global_std_male (np.ndarray, optional): global standard deviation of male over the training set global_mean_female (np.ndarray, optional): global mean of female over the training set global_std_female (np.ndarray, optional): global standard deviation of female over the training set dtype (optional): the type of data, default is np.float32 Returns: global_mean_male (np.ndarray): global mean of male over the training set global_std_male (np.ndarray): global standard deviation of male over the training set global_mean_female (np.ndarray): global mean of female over the training set global_std_female (np.ndarray): global standard deviation of female over the training set frame_num_dict (dict): key => utterance name value => the number of frames """ if not is_training: if global_mean_male is None or global_std_male is None: raise ValueError( 'Set global mean & std computed over the training set.') if normalize not in ['global', 'speaker', 'utterance', 'no']: raise ValueError( 'normalize must be "utterance" or "speaker" or "global" or "no".') # Read each audio file print('=====> Reading audio files...') audio_paths_male, audio_paths_female = [], [] input_data_list_male, input_data_list_female = [], [] total_frame_num_male, total_frame_num_female = 0, 0 total_frame_num_dict = {} speaker_mean_dict, speaker_std_dict = {}, {} for audio_path in tqdm(audio_paths): speaker = audio_path.split('/')[-2] gender = speaker[0] # f (female) or m (male) utt_index = basename(audio_path).split('.')[0] if tool == 'htk': input_utt, sampPeriod, parmKind = read(audio_path) # NOTE: audio_path is a htk file path in this case elif tool == 'python_speech_features': input_utt = w2f_psf(audio_path, feature_type=config['feature_type'], feature_dim=config['channels'], use_energy=config['energy'], use_delta1=config['delta'], use_delta2=config['deltadelta'], window=config['window'], slide=config['slide']) elif tool == 'librosa': input_utt = w2f_librosa(audio_path, feature_type=config['feature_type'], feature_dim=config['channels'], use_energy=config['energy'], use_delta1=config['delta'], use_delta2=config['deltadelta'], window=config['window'], slide=config['slide']) # for debug # print(input_utt.shape) if gender == 'm': input_data_list_male.append(input_utt) audio_paths_male.append(audio_path) elif gender == 'f': input_data_list_female.append(input_utt) audio_paths_female.append(audio_path) else: raise ValueError('gender is m or f.') if is_training: speaker = audio_path.split('/')[-2] gender = speaker[0] frame_num_utt, feat_dim = input_utt.shape if gender == 'm':
total_frame_num_male += frame_num_utt elif gender == 'f': total_frame_num_female += frame_num_utt else: raise ValueError('gender is m or f.') if normalize == 'speaker': # Initialization if speaker not in total_frame_num_dict.keys(): total_frame_num_dict[speaker] = 0 speaker_mean_dict[speaker] = np.zeros((feat_dim, ), dtype=dtype) speaker_std_dict[speaker] = np.zeros((feat_dim, ), dtype=dtype) total_frame_num_dict[speaker] += frame_num_utt speaker_mean_dict[speaker] += np.sum(input_utt, axis=0) # NOTE: Load all data in advance because TIMIT is a small dataset. if is_training and normalize != 'no': # Compute speaker mean if normalize == 'speaker': for speaker in speaker_mean_dict.keys(): speaker_mean_dict[speaker] /= total_frame_num_dict[speaker] # Compute global mean & std per gender print('=====> Computing global mean & std over the training set...') frame_offset = 0 feat_dim = input_data_list_male[0].shape[1] train_data_male = np.empty((total_frame_num_male, feat_dim)) train_data_female = np.empty((total_frame_num_female, feat_dim)) # male for input_utt, audio_path in zip(tqdm(input_data_list_male), audio_paths_male): speaker = audio_path.split('/')[-2] frame_num_utt = input_utt.shape[0] train_data_male[frame_offset:frame_offset + frame_num_utt] = input_utt frame_offset += frame_num_utt if normalize == 'speaker': speaker_std_dict[speaker] += np.sum( np.abs(input_utt - speaker_mean_dict[speaker])**2, axis=0) # female frame_offset = 0 for input_utt, audio_path in zip(tqdm(input_data_list_female), audio_paths_female): speaker = audio_path.split('/')[-2] frame_num_utt = input_utt.shape[0] train_data_female[frame_offset:frame_offset + frame_num_utt] = input_utt frame_offset += frame_num_utt if normalize == 'speaker': speaker_std_dict[speaker] += np.sum( np.abs(input_utt - speaker_mean_dict[speaker])**2, axis=0) # Compute speaker std if normalize == 'speaker': for speaker in speaker_std_dict.keys(): speaker_std_dict[speaker] = np.sqrt( speaker_std_dict[speaker] / (total_frame_num_dict[speaker] - 1)) global_mean_male = np.mean(train_data_male, axis=0) global_std_male = np.std(train_data_male, axis=0) global_mean_female = np.mean(train_data_female, axis=0) global_std_female = np.std(train_data_female, axis=0) if save_path is not None: # Save global mean & std np.save(join(save_path, 'global_mean_male.npy'), global_mean_male) np.save(join(save_path, 'global_std_male.npy'), global_std_male) np.save(join(save_path, 'global_mean_female.npy'), global_mean_female) np.save(join(save_path, 'global_std_female.npy'), global_std_female) # Save input features as npy files print('=====> Normalization...') frame_num_dict = {} for input_utt, audio_path in zip( tqdm(input_data_list_male + input_data_list_female), audio_paths_male + audio_paths_female): speaker = audio_path.split('/')[-2] utt_index = basename(audio_path).split('.')[0] gender = speaker[0] if normalize == 'no': pass elif normalize == 'global' or not is_training: # Normalize by global mean & std over the training set if gender == 'm': input_utt -= global_mean_male input_utt /= global_std_male elif gender == 'f': input_utt -= global_mean_female input_utt /= global_std_female else: raise ValueError('gender is m or f.') elif normalize == 'speaker': # Normalize by mean & std per speaker input_utt -= speaker_mean_dict[speaker] input_utt /= speaker_std_dict[speaker] elif normalize == 'utterance': # Normalize by mean & std per utterance utt_mean = np.mean(input_utt, axis=0, dtype=dtype) utt_std = np.std(input_utt, axis=0, dtype=dtype) input_utt = 
(input_utt - utt_mean) / utt_std else: raise ValueError frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0] if save_path is not None: # Save input features if save_format == 'numpy': np.save( mkdir_join(save_path, speaker, speaker + '_' + utt_index + '.npy'), input_utt) elif save_format == 'htk': write(input_utt, htk_path=mkdir_join(save_path, speaker, speaker + '_' + utt_index + '.htk'), sampPeriod=sampPeriod, parmKind=parmKind) else: raise ValueError('save_format is numpy or htk.') if save_path is not None: # Save the frame number dictionary with open(join(save_path, 'frame_num.pickle'), 'wb') as f: pickle.dump(frame_num_dict, f) return (global_mean_male, global_std_male, global_mean_female, global_std_female, frame_num_dict)
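# --- Illustrative sketch (not from this repo): the four normalization modes
# above reduce to choosing which (mean, std) pair each utterance is shifted
# and scaled by. x is a (frames, dims) array; stats maps 'global'/'speaker'
# to precomputed (mean, std) pairs.
import numpy as np


def normalize_utt(x, mode, stats=None, dtype=np.float32):
    if mode == 'no':
        return x
    if mode in ('global', 'speaker'):
        mean, std = stats[mode]
        return (x - mean) / std
    if mode == 'utterance':
        return (x - np.mean(x, axis=0, dtype=dtype)) / np.std(x, axis=0, dtype=dtype)
    raise ValueError('mode must be "no", "global", "speaker" or "utterance".')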
def main(config_path): # Read a config file (.yml) with open(config_path, "r") as f: config = yaml.load(f) corpus = config['corpus'] feature = config['feature'] param = config['param'] if corpus['label_type_main'] == 'character': output_size_main = 147 elif corpus['label_type_main'] == 'kanji': output_size_main = 3386 if corpus['label_type_second'] == 'phone': output_size_second = 38 elif corpus['label_type_second'] == 'character': output_size_second = 147 # Model setting CTCModel = load(model_type=config['model_name']) network = CTCModel( batch_size=param['batch_size'], input_size=feature['input_size'] * feature['num_stack'], num_cell=param['num_cell'], num_layer=param['num_layer'], num_layer2=param['num_layer2'], # bottleneck_dim=param['bottleneck_dim'], output_size=output_size_main, output_size2=output_size_second, main_task_weight=param['main_task_weight'], clip_gradients=param['clip_grad'], clip_activation=param['clip_activation'], dropout_ratio_input=param['dropout_input'], dropout_ratio_hidden=param['dropout_hidden'], num_proj=param['num_proj'], weight_decay=param['weight_decay']) network.model_name = config['model_name'].upper() network.model_name += '_' + str(param['num_cell']) network.model_name += '_' + str(param['num_layer']) network.model_name += '_' + str(param['num_layer2']) network.model_name += '_' + param['optimizer'] network.model_name += '_lr' + str(param['learning_rate']) if param['num_proj'] != 0: network.model_name += '_proj' + str(param['num_proj']) if feature['num_stack'] != 1: network.model_name += '_stack' + str(feature['num_stack']) if param['weight_decay'] != 0: network.model_name += '_weightdecay' + str(param['weight_decay']) network.model_name += '_taskweight' + str(param['main_task_weight']) # Set save path network.model_dir = mkdir('/n/sd8/inaguma/result/csj/monolog/') network.model_dir = mkdir_join(network.model_dir, 'ctc') network.model_dir = mkdir_join( network.model_dir, corpus['label_type_main'] + '_' + corpus['label_type_second']) network.model_dir = mkdir_join(network.model_dir, corpus['train_data_size']) network.model_dir = mkdir_join(network.model_dir, network.model_name) # Reset model directory if not isfile(join(network.model_dir, 'complete.txt')): tf.gfile.DeleteRecursively(network.model_dir) tf.gfile.MakeDirs(network.model_dir) else: raise ValueError('File exists.') # Set process name setproctitle('multitaskctc_csj_' + corpus['label_type_main'] + '_' + corpus['label_type_second'] + '_' + corpus['train_data_size']) # Save config file shutil.copyfile(config_path, join(network.model_dir, 'config.yml')) sys.stdout = open(join(network.model_dir, 'train.log'), 'w') print(network.model_name) do_train(network=network, optimizer=param['optimizer'], learning_rate=param['learning_rate'], batch_size=param['batch_size'], epoch_num=param['num_epoch'], label_type_main=corpus['label_type_main'], label_type_second=corpus['label_type_second'], num_stack=feature['num_stack'], num_skip=feature['num_skip'], train_data_size=corpus['train_data_size']) sys.stdout = sys.__stdout__
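# --- Illustrative note (an assumption about PyYAML versions, not a change to
# the code above): yaml.load(f) as called in main() requires an explicit
# Loader argument in PyYAML >= 5.1; for plain config files like this one,
# safe_load is the usual replacement:
# import yaml
# with open(config_path, 'r') as f:
#     config = yaml.safe_load(f)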
def main(data_size): for data_type in [ 'train', 'dev_clean', 'dev_other', 'test_clean', 'test_other' ]: print('=' * 50) print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20) print('=' * 50) ######################################## # inputs ######################################## print('=> Processing input data...') if args.save_format in ['numpy', 'htk']: input_save_path = mkdir_join(args.feature_save_path, args.save_format, data_size) if isfile(join(input_save_path, data_type, 'complete.txt')): print('Already exists.') else: if data_type == 'train': if args.tool == 'htk': audio_paths = path.htk(data_type='train' + data_size) else: audio_paths = path.wav(data_type='train' + data_size) is_training = True global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None else: if args.tool == 'htk': audio_paths = path.htk(data_type=data_type) else: audio_paths = path.wav(data_type=data_type) is_training = False # Load statistics over train dataset global_mean_male = np.load( join(input_save_path, 'train/global_mean_male.npy')) global_std_male = np.load( join(input_save_path, 'train/global_std_male.npy')) global_mean_female = np.load( join(input_save_path, 'train/global_mean_female.npy')) global_std_female = np.load( join(input_save_path, 'train/global_std_female.npy')) read_audio(audio_paths=audio_paths, tool=args.tool, config=CONFIG, normalize=args.normalize, speaker_gender_dict=path.speaker_gender_dict, is_training=is_training, save_path=mkdir_join(input_save_path, data_type), save_format=args.save_format, global_mean_male=global_mean_male, global_mean_female=global_mean_female, global_std_male=global_std_male, global_std_female=global_std_female) # NOTE: ex.) save_path: # librispeech/feature/save_format/data_size/data_type/speaker/*.npy # Make a confirmation file to prove that dataset was saved # correctly with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f: f.write('') ######################################## # labels ######################################## print('\n=> Processing transcripts...') if data_type == 'train': label_paths = path.trans(data_type='train' + data_size) else: label_paths = path.trans(data_type=data_type) save_vocab_file = True if data_type == 'train' else False is_test = True if 'test' in data_type else False speaker_dict = read_trans(label_paths=label_paths, data_size=data_size, vocab_file_save_path=mkdir_join( './config', 'vocab_files'), save_vocab_file=save_vocab_file, is_test=is_test, data_type=data_type) ######################################## # dataset (csv) ######################################## print('\n=> Saving dataset files...') dataset_save_path = mkdir_join(args.dataset_save_path, args.save_format, data_size, data_type) df_columns = ['frame_num', 'input_path', 'transcript'] df_char = pd.DataFrame([], columns=df_columns) df_char_capital = pd.DataFrame([], columns=df_columns) df_word_freq1 = pd.DataFrame([], columns=df_columns) df_word_freq5 = pd.DataFrame([], columns=df_columns) df_word_freq10 = pd.DataFrame([], columns=df_columns) df_word_freq15 = pd.DataFrame([], columns=df_columns) with open(join(input_save_path, data_type, 'frame_num.pickle'), 'rb') as f: frame_num_dict = pickle.load(f) utt_count = 0 df_char_list, df_char_capital_list = [], [] df_word_freq1_list, df_word_freq5_list = [], [] df_word_freq10_list, df_word_freq15_list = [], [] for speaker, utt_dict in tqdm(speaker_dict.items()): for utt_name, indices_list in utt_dict.items(): if args.save_format == 'numpy': 
input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.npy') elif args.save_format == 'htk': input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.htk') elif args.save_format == 'wav': input_utt_save_path = path.utt2wav(utt_name) else: raise ValueError('save_format is numpy or htk or wav.') assert isfile(input_utt_save_path) frame_num = frame_num_dict[utt_name] char_indices, char_indices_capital, word_freq1_indices = indices_list[:3] word_freq5_indices, word_freq10_indices, word_freq15_indices = indices_list[3:6] df_char = add_element( df_char, [frame_num, input_utt_save_path, char_indices]) df_char_capital = add_element( df_char_capital, [frame_num, input_utt_save_path, char_indices_capital]) df_word_freq1 = add_element( df_word_freq1, [frame_num, input_utt_save_path, word_freq1_indices]) df_word_freq5 = add_element( df_word_freq5, [frame_num, input_utt_save_path, word_freq5_indices]) df_word_freq10 = add_element( df_word_freq10, [frame_num, input_utt_save_path, word_freq10_indices]) df_word_freq15 = add_element( df_word_freq15, [frame_num, input_utt_save_path, word_freq15_indices]) utt_count += 1 # Reset if utt_count == 50000: df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) df_word_freq1_list.append(df_word_freq1) df_word_freq5_list.append(df_word_freq5) df_word_freq10_list.append(df_word_freq10) df_word_freq15_list.append(df_word_freq15) df_char = pd.DataFrame([], columns=df_columns) df_char_capital = pd.DataFrame([], columns=df_columns) df_word_freq1 = pd.DataFrame([], columns=df_columns) df_word_freq5 = pd.DataFrame([], columns=df_columns) df_word_freq10 = pd.DataFrame([], columns=df_columns) df_word_freq15 = pd.DataFrame([], columns=df_columns) utt_count = 0 # Last dataframe df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) df_word_freq1_list.append(df_word_freq1) df_word_freq5_list.append(df_word_freq5) df_word_freq10_list.append(df_word_freq10) df_word_freq15_list.append(df_word_freq15) # Concatenate all dataframes df_char = df_char_list[0] df_char_capital = df_char_capital_list[0] df_word_freq1 = df_word_freq1_list[0] df_word_freq5 = df_word_freq5_list[0] df_word_freq10 = df_word_freq10_list[0] df_word_freq15 = df_word_freq15_list[0] for df_i in df_char_list[1:]: df_char = pd.concat([df_char, df_i], axis=0) for df_i in df_char_capital_list[1:]: df_char_capital = pd.concat([df_char_capital, df_i], axis=0) for df_i in df_word_freq1_list[1:]: df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0) for df_i in df_word_freq5_list[1:]: df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0) for df_i in df_word_freq10_list[1:]: df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0) for df_i in df_word_freq15_list[1:]: df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0) df_char.to_csv(join(dataset_save_path, 'character.csv')) df_char_capital.to_csv( join(dataset_save_path, 'character_capital_divide.csv')) df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv')) df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv')) df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv')) df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
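# --- Illustrative sketch (toy rows; same chunking idea as the 50000-utterance
# flush above): growing a DataFrame row by row is quadratic, so rows are
# buffered and concatenated once per chunk before writing the CSV.
import pandas as pd


def rows_to_csv(rows, csv_path, chunk_size=50000):
    columns = ['frame_num', 'input_path', 'transcript']
    chunks, buf = [], []
    for row in rows:  # row: (frame_num, input_path, transcript)
        buf.append(row)
        if len(buf) == chunk_size:
            chunks.append(pd.DataFrame(buf, columns=columns))
            buf = []
    chunks.append(pd.DataFrame(buf, columns=columns))  # last (partial) chunk
    pd.concat(chunks, axis=0).to_csv(csv_path)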
def read_audio(audio_paths, speaker_dict, tool, config, normalize, is_training, save_path=None, save_format='numpy', global_mean_male=None, global_mean_female=None, global_std_male=None, global_std_female=None, dtype=np.float32): """Read HTK or WAV files. Args: audio_paths (list): paths to HTK or WAV files speaker_dict (dict): dictionary of speakers key => speaker value => dictionary of utterance information of each speaker key => utterance index value => [start_frame, end_frame, trans_kana, trans_kanji] tool (string): the tool to extract features, htk or librosa or python_speech_features config (dict): a configuration for feature extraction normalize (string): no => normalization will not be conducted global => normalize input features by global mean & std over the training set per gender speaker => normalize input features by mean & std per speaker utterance => normalize input features by mean & std per utterance is_training (bool, optional): training or not save_path (string): path to save npy files save_format (string, optional): numpy or htk global_mean_male (np.ndarray, optional): global mean of male over the training set global_std_male (np.ndarray, optional): global standard deviation of male over the training set global_mean_female (np.ndarray, optional): global mean of female over the training set global_std_female (np.ndarray, optional): global standard deviation of female over the training set dtype (optional): the type of data, default is np.float32 Returns: global_mean_male (np.ndarray): global mean of male over the training set global_std_male (np.ndarray): global standard deviation of male over the training set global_mean_female (np.ndarray): global mean of female over the training set global_std_female (np.ndarray): global standard deviation of female over the training set frame_num_dict (dict): key => utterance name value => the number of frames """ if not is_training: if global_mean_male is None or global_mean_female is None: raise ValueError('Set mean & std computed in the training set.') if normalize not in ['global', 'speaker', 'utterance', 'no']: raise ValueError( 'normalize must be "utterance" or "speaker" or "global" or "no".') if tool not in ['htk', 'python_speech_features', 'librosa']: raise TypeError('tool must be "htk" or "python_speech_features"' + ' or "librosa".') audio_path_list_male, audio_path_list_female = [], [] total_frame_num_male, total_frame_num_female = 0, 0 total_frame_num_dict = {} speaker_mean_dict = {} # NOTE: each talk is treated as a different speaker # Loop 1: Computing global mean and statistics if is_training and normalize != 'no': print('=====> Reading audio files...') for i, audio_path in enumerate(tqdm(audio_paths)): speaker = basename(audio_path).split('.')[0] # Divide each audio file into utterances _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment( audio_path, speaker, speaker_dict[speaker], is_training=True, sil_duration=0, tool=tool, config=config) if i == 0: # Initialize global statistics feature_dim = input_utt_sum.shape[0] global_mean_male = np.zeros((feature_dim, ), dtype=dtype) global_mean_female = np.zeros((feature_dim, ), dtype=dtype) global_std_male = np.zeros((feature_dim, ), dtype=dtype) global_std_female = np.zeros((feature_dim, ), dtype=dtype) # For computing global mean if speaker[3] == 'M': audio_path_list_male.append(audio_path) global_mean_male += input_utt_sum total_frame_num_male += total_frame_num_speaker elif speaker[3] == 'F': audio_path_list_female.append(audio_path)
global_mean_female += input_utt_sum total_frame_num_female += total_frame_num_speaker else: raise ValueError # For computing speaker stddev if normalize == 'speaker': speaker_mean_dict[speaker] = speaker_mean total_frame_num_dict[speaker] = total_frame_num_speaker # NOTE: speaker mean is already computed print('=====> Computing global mean & stddev...') # Compute global mean per gender global_mean_male /= total_frame_num_male global_mean_female /= total_frame_num_female for audio_path in tqdm(audio_paths): speaker = basename(audio_path).split('.')[0] # Divide each audio into utterances input_data_dict_speaker, _, _, _, _ = segment( audio_path, speaker, speaker_dict[speaker], is_training=True, sil_duration=0, tool=tool, config=config) # For computing global stddev if speaker[3] == 'M': for input_utt in input_data_dict_speaker.values(): global_std_male += np.sum(np.abs(input_utt - global_mean_male)**2, axis=0) elif speaker[3] == 'F': for input_utt in input_data_dict_speaker.values(): global_std_female += np.sum(np.abs(input_utt - global_mean_female)**2, axis=0) else: raise ValueError # Compute global stddev per gender global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1)) global_std_female = np.sqrt(global_std_female / (total_frame_num_female - 1)) if save_path is not None: # Save global mean & std per gender np.save(join(save_path, 'global_mean_male.npy'), global_mean_male) np.save(join(save_path, 'global_mean_female.npy'), global_mean_female) np.save(join(save_path, 'global_std_male.npy'), global_std_male) np.save(join(save_path, 'global_std_female.npy'), global_std_female) # Loop 2: Normalization and Saving print('=====> Normalization...') frame_num_dict = {} sampPeriod, parmKind = None, None for audio_path in tqdm(audio_paths): speaker = basename(audio_path).split('.')[0] if normalize == 'speaker' and is_training: speaker_mean = speaker_mean_dict[speaker] else: speaker_mean = None # Divide each audio into utterances input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment( audio_path, speaker, speaker_dict[speaker], is_training=is_training, sil_duration=0, tool=tool, config=config, mean=speaker_mean) # for computing speaker stddev # NOTE: input_data_dict_speaker has not been normalized yet for utt_index, input_utt in input_data_dict_speaker.items(): if normalize == 'no': pass elif normalize == 'global' or not is_training: # Normalize by mean & std over the training set per gender if speaker[3] == 'M': input_utt -= global_mean_male input_utt /= global_std_male elif speaker[3] == 'F': input_utt -= global_mean_female input_utt /= global_std_female else: raise ValueError elif normalize == 'speaker': # Normalize by mean & std per speaker input_utt = (input_utt - speaker_mean) / speaker_std elif normalize == 'utterance': # Normalize by mean & std per utterance utt_mean = np.mean(input_utt, axis=0, dtype=dtype) utt_std = np.std(input_utt, axis=0, dtype=dtype) input_utt = (input_utt - utt_mean) / utt_std else: raise ValueError frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0] if save_path is not None: # Save input features if save_format == 'numpy': input_data_save_path = mkdir_join( save_path, speaker, speaker + '_' + utt_index + '.npy') np.save(input_data_save_path, input_utt) elif save_format == 'htk': if sampPeriod is None: _, sampPeriod, parmKind = read(audio_path) write(input_utt, htk_path=mkdir_join( save_path, speaker, speaker + '_' + utt_index + '.htk'), sampPeriod=sampPeriod, parmKind=parmKind) else: raise ValueError('save_format is numpy or
htk.') if save_path is not None: # Save the frame number dictionary with open(join(save_path, 'frame_num.pickle'), 'wb') as f: pickle.dump(frame_num_dict, f) return (global_mean_male, global_mean_female, global_std_male, global_std_female, frame_num_dict)
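# --- Illustrative sketch (assumes CSJ-style talk IDs such as 'A01M0137',
# whose 4th character encodes gender, which the speaker[3] checks above
# rely on):
def gender_of(speaker):
    if speaker[3] not in ('M', 'F'):
        raise ValueError('unexpected speaker id: %s' % speaker)
    return speaker[3]

# ex.) gender_of('A01M0137') -> 'M'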
def main(): for data_type in ['train', 'dev', 'test']: print('=' * 50) print(' ' * 20 + data_type + ' ' * 20) print('=' * 50) ######################################## # inputs ######################################## print('=> Processing input data...') if args.save_format in ['numpy', 'htk']: input_save_path = mkdir_join( args.feature_save_path, args.save_format) if isfile(join(input_save_path, data_type, 'complete.txt')): print('Already exists.') else: if args.tool == 'htk': audio_paths = path.htk(data_type=data_type) else: audio_paths = path.wav(data_type=data_type) if data_type != 'train': is_training = False # Load statistics over train dataset global_mean_male = np.load( join(input_save_path, 'train/global_mean_male.npy')) global_std_male = np.load( join(input_save_path, 'train/global_std_male.npy')) global_mean_female = np.load( join(input_save_path, 'train/global_mean_female.npy')) global_std_female = np.load( join(input_save_path, 'train/global_std_female.npy')) else: is_training = True global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None # Read htk or wav files, and save input data and frame num dict read_audio(audio_paths=audio_paths, tool=args.tool, config=CONFIG, normalize=args.normalize, is_training=is_training, save_path=mkdir_join(input_save_path, data_type), save_format=args.save_format, global_mean_male=global_mean_male, global_std_male=global_std_male, global_mean_female=global_mean_female, global_std_female=global_std_female) # NOTE: ex.) save_path: # timit/feature/save_format/data_type/*.npy # Make a confirmation file to prove that dataset was saved # correctly with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f: f.write('') ######################################## # labels (phone) ######################################## print('\n=> Processing transcripts (phone)...') save_vocab_file = True if data_type == 'train' else False is_test = True if data_type == 'test' else False trans_dict = read_phone( label_paths=path.phone(data_type=data_type), vocab_file_save_path=mkdir_join('./config', 'vocab_files'), save_vocab_file=save_vocab_file, is_test=is_test) ######################################## # dataset (phone, csv) ######################################## print('\n=> Saving dataset files (phone)...') dataset_save_path = mkdir_join( args.dataset_save_path, args.save_format, data_type) df_columns = ['frame_num', 'input_path', 'transcript'] df_phone61 = pd.DataFrame([], columns=df_columns) df_phone48 = pd.DataFrame([], columns=df_columns) df_phone39 = pd.DataFrame([], columns=df_columns) with open(join(input_save_path, data_type, 'frame_num.pickle'), 'rb') as f: frame_num_dict = pickle.load(f) for utt_name, trans_list in tqdm(trans_dict.items()): if args.save_format == 'numpy': speaker = utt_name.split('_')[0] input_utt_save_path = join( input_save_path, data_type, speaker, utt_name + '.npy') elif args.save_format == 'htk': speaker = utt_name.split('_')[0] input_utt_save_path = join( input_save_path, data_type, speaker, utt_name + '.htk') elif args.save_format == 'wav': input_utt_save_path = path.utt2wav(utt_name) else: raise ValueError('save_format is numpy or htk or wav.') assert isfile(input_utt_save_path) frame_num = frame_num_dict[utt_name] phone61_indices, phone48_indices, phone39_indices = trans_list df_phone61 = add_element( df_phone61, [frame_num, input_utt_save_path, phone61_indices]) df_phone48 = add_element( df_phone48, [frame_num, input_utt_save_path, phone48_indices]) df_phone39 = add_element( 
df_phone39, [frame_num, input_utt_save_path, phone39_indices]) df_phone61.to_csv(join(dataset_save_path, 'phone61.csv')) df_phone48.to_csv(join(dataset_save_path, 'phone48.csv')) df_phone39.to_csv(join(dataset_save_path, 'phone39.csv'))
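# --- Illustrative usage (hypothetical path): each phoneXX.csv written above
# has the columns [frame_num, input_path, transcript], with the default
# pandas index in column 0, so it can be read back with:
# import pandas as pd
# df = pd.read_csv('timit/dataset/numpy/test/phone39.csv', index_col=0)
# print(df[['frame_num', 'input_path', 'transcript']].head())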
def read_sdb(label_paths, data_size, vocab_file_save_path, is_test=False, save_vocab_file=False, data_type=None): """Read transcripts (.sdb) & save files (.npy). Args: label_paths (list): list of paths to label files data_size (string): fullset or subset vocab_file_save_path (string): path to vocabulary files is_test (bool, optional): Set True if save as the test set save_vocab_file (bool, optional): if True, save vocabulary files data_type (string, optional): eval1 or eval2 or eval3 Returns: speaker_dict (dict): the dictionary of utterances of each speaker key (string) => speaker value (dict) => the dictionary of utterance information of each speaker key (string) => utterance index value (list) => [start_frame, end_frame, kanji_indices, kanji_div_indices, kana_indices, kana_div_indices, phone_indices, phone_div_indices, word_freq1_indices, word_freq5_indices, word_freq10_indices, word_freq15_indices] """ # Make mapping dictionary from kana to phone kana_list = [] kana2phone_dict = {} phone_set = set([]) with open(join(vocab_file_save_path, '../kana2phone.txt'), 'r') as f: for line in f: line = line.strip().split('+') kana, phone_seq = line kana_list.append(kana) kana2phone_dict[kana] = phone_seq for phone in phone_seq.split(' '): phone_set.add(phone) kana2phone_dict[SPACE] = SIL print('=====> Reading target labels...') speaker_dict = OrderedDict() char_set = set([]) word_count_dict = {} vocab_set = set([]) for label_path in tqdm(label_paths): col_names = [j for j in range(25)] df = pd.read_csv(label_path, names=col_names, encoding='SHIFT-JIS', delimiter='\t', header=None) utt_dict = OrderedDict() utt_index_pre = 1 start_frame_pre, end_frame_pre = None, None trans_kana, trans_kanji, trans_pos = '', '', '' speaker = basename(label_path).split('.')[0] for key, row in df.iterrows(): # From kaldi time = row[3] # Time information for segment word = row[5] # Word # num = row[9] # Number and point # About morpheme if isinstance(row[11], str): pos = row[11] # Part Of Speech else: pos = '' # acf = row[12] # A Conjugated Form # kacf = row[13] # Kind of A Conjugated Form # kav = row[14] # Kind of Auxiliary Verb # ec = row[15] # Euphonic Change # other = row[16] # Other information pron = row[10] # Pronunciation for lexicon utt_index = int(time.split(' ')[0]) segment = time.split(' ')[1].split('-') start_frame = int(float(segment[0]) * 100 + 0.5) end_frame = int(float(segment[1]) * 100 + 0.5) if start_frame_pre is None: start_frame_pre = start_frame if end_frame_pre is None: end_frame_pre = end_frame # Stack word in the same utterance if utt_index == utt_index_pre: trans_kanji += word + ' ' trans_kana += pron + ' ' if pos != '': trans_pos += pos + ' ' utt_index_pre = utt_index end_frame_pre = end_frame continue # Count the number of brackets if trans_kanji.count('(') != trans_kanji.count(')'): trans_kanji += word + ' ' trans_kana += pron + ' ' if pos != '': trans_pos += pos + ' ' utt_index_pre = utt_index end_frame_pre = end_frame continue if trans_kana.count('(') != trans_kana.count(')'): trans_kanji += word + ' ' trans_kana += pron + ' ' if pos != '': trans_pos += pos + ' ' utt_index_pre = utt_index end_frame_pre = end_frame continue # if '<P:' in trans_kana: # print(label_path) # print(trans_kanji) # print(trans_kana) # Clean transcript trans_kanji = fix_transcript(trans_kanji) trans_kana = fix_transcript(trans_kana) # Remove double space while '  ' in trans_kanji: trans_kanji = re.sub(r'[\s]+', ' ', trans_kanji) while '  ' in trans_kana: trans_kana = re.sub(r'[\s]+', ' ', trans_kana) while '  ' in
trans_pos: trans_pos = re.sub(r'[\s]+', ' ', trans_pos) # Skip silence only utterance if trans_kanji.replace(' ', '') != '' and len(trans_pos) > 0: # Remove the first and last space if len(trans_kanji) > 0 and trans_kanji[0] == ' ': trans_kanji = trans_kanji[1:] if len(trans_kana) > 0 and trans_kana[0] == ' ': trans_kana = trans_kana[1:] if len(trans_kanji) > 0 and trans_kanji[-1] == ' ': trans_kanji = trans_kanji[:-1] if len(trans_kana) > 0 and trans_kana[-1] == ' ': trans_kana = trans_kana[:-1] # Convert space to "_" trans_kanji = re.sub(r'\s', SPACE, trans_kanji) trans_kana = re.sub(r'\s', SPACE, trans_kana) # For exception if trans_kana[0:2] == 'Z_': trans_kana = trans_kana[2:] for c in list(trans_kanji): char_set.add(c) # Count words word_list = trans_kanji.split(SPACE) for w in word_list: vocab_set.add(w) if w not in word_count_dict.keys(): word_count_dict[w] = 0 word_count_dict[w] += 1 # Convert kana character to phone trans_phone = ' '.join(kana2phone(trans_kana, kana2phone_dict)) utt_dict[str(utt_index - 1).zfill(4)] = [ start_frame_pre, end_frame_pre, trans_kanji, trans_kana, trans_phone ] # for debug # print(trans_kanji) # print(trans_kana) # print(trans_phone) # print('-----') # Initialization trans_kanji = word + ' ' trans_kana = pron + ' ' if pos == '': trans_pos = '' else: trans_pos = pos + ' ' utt_index_pre = utt_index start_frame_pre = start_frame end_frame_pre = end_frame # Register all utterances of each speaker speaker_dict[speaker] = utt_dict # Make vocabulary files kanji_vocab_file_path = mkdir_join(vocab_file_save_path, 'kanji_' + data_size + '.txt') kanji_div_vocab_file_path = mkdir_join( vocab_file_save_path, 'kanji_divide_' + data_size + '.txt') kana_vocab_file_path = mkdir_join(vocab_file_save_path, 'kana_' + data_size + '.txt') kana_div_vocab_file_path = mkdir_join(vocab_file_save_path, 'kana_divide_' + data_size + '.txt') phone_vocab_file_path = mkdir_join(vocab_file_save_path, 'phone_' + data_size + '.txt') phone_div_vocab_file_path = mkdir_join( vocab_file_save_path, 'phone_divide_' + data_size + '.txt') word_freq1_vocab_file_path = mkdir_join(vocab_file_save_path, 'word_freq1_' + data_size + '.txt') word_freq5_vocab_file_path = mkdir_join(vocab_file_save_path, 'word_freq5_' + data_size + '.txt') word_freq10_vocab_file_path = mkdir_join( vocab_file_save_path, 'word_freq10_' + data_size + '.txt') word_freq15_vocab_file_path = mkdir_join( vocab_file_save_path, 'word_freq15_' + data_size + '.txt') # Reserve some indices char_set.discard(SPACE) # for debug # print(sorted(list(char_set))) if save_vocab_file: # character-level (kanji, kanji_divide) kanji_set = set([]) for char in char_set: if (not is_hiragana(char)) and (not is_katakana(char)): kanji_set.add(char) for kana in kana_list: kanji_set.add(kana) kanji_set.add(jaconv.kata2hira(kana)) with open(kanji_vocab_file_path, 'w') as f, open(kanji_div_vocab_file_path, 'w') as f_div: kanji_list = sorted(list(kanji_set)) for kanji in kanji_list: f.write('%s\n' % kanji) for kanji in kanji_list + [SPACE]: f_div.write('%s\n' % kanji) # character-level (kana, kana_divide) with open(kana_vocab_file_path, 'w') as f, open(kana_div_vocab_file_path, 'w') as f_div: kana_list_tmp = sorted(kana_list) for kana in kana_list_tmp: f.write('%s\n' % kana) for kana in kana_list_tmp + [SPACE]: f_div.write('%s\n' % kana) # phone-level (phone, phone_divide) with open(phone_vocab_file_path, 'w') as f, open(phone_div_vocab_file_path, 'w') as f_div: phone_list = sorted(list(phone_set)) for phone in phone_list: f.write('%s\n' % phone) for 
phone in phone_list + [SIL]: f_div.write('%s\n' % phone) # word-level (threshold == 1) with open(word_freq1_vocab_file_path, 'w') as f: vocab_list = sorted(list(vocab_set)) + [OOV] for word in vocab_list: f.write('%s\n' % word) # word-level (threshold == 5) with open(word_freq5_vocab_file_path, 'w') as f: vocab_list = sorted([ word for word, freq in list(word_count_dict.items()) if freq >= 5 ]) + [OOV] for word in vocab_list: f.write('%s\n' % word) # word-level (threshold == 10) with open(word_freq10_vocab_file_path, 'w') as f: vocab_list = sorted([ word for word, freq in list(word_count_dict.items()) if freq >= 10 ]) + [OOV] for word in vocab_list: f.write('%s\n' % word) # word-level (threshold == 15) with open(word_freq15_vocab_file_path, 'w') as f: vocab_list = sorted([ word for word, freq in list(word_count_dict.items()) if freq >= 15 ]) + [OOV] for word in vocab_list: f.write('%s\n' % word) # Compute OOV rate if is_test: with open( join(vocab_file_save_path, '../oov_rate_' + data_type + '_' + data_size + '.txt'), 'w') as f: # word-level (threshold == 1) oov_rate = compute_oov_rate(speaker_dict, word_freq1_vocab_file_path) f.write('Word (freq1):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # word-level (threshold == 5) oov_rate = compute_oov_rate(speaker_dict, word_freq5_vocab_file_path) f.write('Word (freq5):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # word-level (threshold == 10) oov_rate = compute_oov_rate(speaker_dict, word_freq10_vocab_file_path) f.write('Word (freq10):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # word-level (threshold == 15) oov_rate = compute_oov_rate(speaker_dict, word_freq15_vocab_file_path) f.write('Word (freq15):\n') f.write(' OOV rate (test): %f %%\n' % oov_rate) # Tokenize print('=====> Tokenize...') kanji2idx = Char2idx(kanji_vocab_file_path, double_letter=True) kanji2idx_div = Char2idx(kanji_div_vocab_file_path, double_letter=True) kana2idx = Char2idx(kana_vocab_file_path, double_letter=True) kana2idx_div = Char2idx(kana_div_vocab_file_path, double_letter=True) phone2idx = Phone2idx(phone_vocab_file_path) phone2idx_div = Phone2idx(phone_div_vocab_file_path) word2idx_freq1 = Word2idx(word_freq1_vocab_file_path) word2idx_freq5 = Word2idx(word_freq5_vocab_file_path) word2idx_freq10 = Word2idx(word_freq10_vocab_file_path) word2idx_freq15 = Word2idx(word_freq15_vocab_file_path) for speaker, utt_dict in tqdm(speaker_dict.items()): for utt_index, utt_info in utt_dict.items(): start_frame, end_frame, trans_kanji, trans_kana, trans_phone = utt_info if is_test: utt_dict[utt_index] = [ start_frame, end_frame, trans_kanji.replace(SPACE, ''), trans_kanji, trans_kana.replace(SPACE, ''), trans_kana, trans_phone.replace(SIL, '').replace(' ', ' '), trans_phone, trans_kanji, trans_kanji, trans_kanji, trans_kanji ] else: kanji_indices = kanji2idx(trans_kanji.replace(SPACE, '')) kanji_div_indices = kanji2idx_div(trans_kanji) kana_indices = kana2idx(trans_kana.replace(SPACE, '')) kana_div_indices = kana2idx_div(trans_kana) phone_indices = phone2idx( trans_phone.replace(SIL, '').replace(' ', ' ')) phone_div_indices = phone2idx_div(trans_phone) word_freq1_indices = word2idx_freq1(trans_kanji) word_freq5_indices = word2idx_freq5(trans_kanji) word_freq10_indices = word2idx_freq10(trans_kanji) word_freq15_indices = word2idx_freq15(trans_kanji) kanji_indices = int2str(kanji_indices) kanji_div_indices = int2str(kanji_div_indices) kana_indices = int2str(kana_indices) kana_div_indices = int2str(kana_div_indices) phone_indices = int2str(phone_indices) 
phone_div_indices = int2str(phone_div_indices) word_freq1_indices = int2str(word_freq1_indices) word_freq5_indices = int2str(word_freq5_indices) word_freq10_indices = int2str(word_freq10_indices) word_freq15_indices = int2str(word_freq15_indices) utt_dict[utt_index] = [ start_frame, end_frame, kanji_indices, kanji_div_indices, kana_indices, kana_div_indices, phone_indices, phone_div_indices, word_freq1_indices, word_freq5_indices, word_freq10_indices, word_freq15_indices ] speaker_dict[speaker] = utt_dict return speaker_dict
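# --- Illustrative sketch (mirrors the int(float(...) * 100 + 0.5) arithmetic
# in read_sdb above): CSJ segment times are given in seconds, and with a
# 10 ms frame shift a timestamp maps to a frame index by scaling by 100 and
# rounding half up.
def sec_to_frame(sec):
    return int(float(sec) * 100 + 0.5)

# ex.) sec_to_frame('2.5') -> 250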
def read_phone(label_paths, vocab_file_save_path, save_vocab_file=False, is_test=False): """Read phone transcript. Args: label_paths (list): list of paths to label files vocab_file_save_path (string): path to vocabulary files save_vocab_file (bool, optional): if True, save vocabulary files is_test (bool, optional): set True in case of the test set Returns: trans_dict (dict): key (string) => utterance name value (list) => list of [phone61_indices, phone48_indices, phone39_indices] """ print('=====> Reading target labels...') # Read the mapping file (from phone to phone) phone2phone_map_file_path = join(vocab_file_save_path, '../phone2phone.txt') phone61_set, phone48_set, phone39_set = set([]), set([]), set([]) with open(phone2phone_map_file_path, 'r') as f: for line in f: line = line.strip().split() if line[1] != 'nan': phone61_set.add(line[0]) phone48_set.add(line[1]) phone39_set.add(line[2]) else: # Ignore 'q' in phone48 and phone39 phone61_set.add(line[0]) phone61_vocab_map_file_path = mkdir_join(vocab_file_save_path, 'phone61.txt') phone48_vocab_map_file_path = mkdir_join(vocab_file_save_path, 'phone48.txt') phone39_vocab_map_file_path = mkdir_join(vocab_file_save_path, 'phone39.txt') # Save vocabulary files if save_vocab_file: with open(phone61_vocab_map_file_path, 'w') as f: for phone in sorted(list(phone61_set)): f.write('%s\n' % phone) with open(phone48_vocab_map_file_path, 'w') as f: for phone in sorted(list(phone48_set)): f.write('%s\n' % phone) with open(phone39_vocab_map_file_path, 'w') as f: for phone in sorted(list(phone39_set)): f.write('%s\n' % phone) trans_dict = {} for label_path in tqdm(label_paths): speaker = label_path.split('/')[-2] utt_index = basename(label_path).split('.')[0] utt_name = speaker + '_' + utt_index phone61_list = [] with open(label_path, 'r') as f: for line in f: line = line.strip().split(' ') # start_frame = line[0] # end_frame = line[1] phone61_list.append(line[2]) # Map from 61 phones to the corresponding phones phone48_list = map_phone2phone(phone61_list, 'phone48', phone2phone_map_file_path) phone39_list = map_phone2phone(phone61_list, 'phone39', phone2phone_map_file_path) # Convert to string trans_phone61 = ' '.join(phone61_list) trans_phone48 = ' '.join(phone48_list) trans_phone39 = ' '.join(phone39_list) # for debug # print(trans_phone61) # print(trans_phone48) # print(trans_phone39) # print('-----') trans_dict[utt_name] = [trans_phone61, trans_phone48, trans_phone39] # Tokenize print('=====> Tokenize...') phone2idx_61 = Phone2idx(phone61_vocab_map_file_path) phone2idx_48 = Phone2idx(phone48_vocab_map_file_path) phone2idx_39 = Phone2idx(phone39_vocab_map_file_path) for utt_name, [trans_phone61, trans_phone48, trans_phone39] in tqdm(trans_dict.items()): if is_test: trans_dict[utt_name] = [ trans_phone61, trans_phone48, trans_phone39 ] # NOTE: save as it is else: phone61_indices = phone2idx_61(trans_phone61) phone48_indices = phone2idx_48(trans_phone48) phone39_indices = phone2idx_39(trans_phone39) phone61_indices = ' '.join(list(map(str, phone61_indices.tolist()))) phone48_indices = ' '.join(list(map(str, phone48_indices.tolist()))) phone39_indices = ' '.join(list(map(str, phone39_indices.tolist()))) trans_dict[utt_name] = [ phone61_indices, phone48_indices, phone39_indices ] return trans_dict
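# --- Illustrative sketch (TOY_MAP is a stand-in; the real table is
# phone2phone.txt): each of the 61 TIMIT phones maps to a 48- and a 39-phone
# class, and entries marked 'nan' (e.g. 'q') are dropped from the reduced
# sets, which is what map_phone2phone does with the mapping file.
TOY_MAP = {'aa': ('aa', 'aa'), 'ao': ('ao', 'aa'), 'q': (None, None)}


def map_phones(phone61_list, level):
    idx = 0 if level == 'phone48' else 1
    return [TOY_MAP[p][idx] for p in phone61_list if TOY_MAP[p][idx] is not None]

# ex.) map_phones(['aa', 'q', 'ao'], 'phone39') -> ['aa', 'aa']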
def main(data_size): print('=' * 50) print(' data_size: %s' % data_size) print('=' * 50) ######################################## # labels ######################################## print('=> Processing transcripts...') speaker_dict_dict = {} # dict of speaker_dict print('---------- train ----------') if data_size == '300h': speaker_dict_dict['train'] = read_trans( label_paths=path.trans(corpus='swbd'), word_boundary_paths=path.word(corpus='swbd'), run_root_path='./', vocab_file_save_path=mkdir_join('./config/vocab_files'), save_vocab_file=True) elif data_size == '2000h': speaker_dict_a, char_set_a, char_capital_set_a, word_count_dict_a = read_trans_fisher( label_paths=path.trans(corpus='fisher'), target_speaker='A') speaker_dict_b, char_set_b, char_capital_set_b, word_count_dict_b = read_trans_fisher( label_paths=path.trans(corpus='fisher'), target_speaker='B') # Merge the two dictionaries speaker_dict = merge_dicts([speaker_dict_a, speaker_dict_b]) char_set = char_set_a | char_set_b char_capital_set = char_capital_set_a | char_capital_set_b word_count_dict_fisher = dict( Counter(word_count_dict_a) + Counter(word_count_dict_b)) speaker_dict_dict['train'] = read_trans( label_paths=path.trans(corpus='swbd'), word_boundary_paths=path.word(corpus='swbd'), run_root_path='./', vocab_file_save_path=mkdir_join('./config/vocab_files'), save_vocab_file=True, speaker_dict_fisher=speaker_dict, char_set=char_set, char_capital_set=char_capital_set, word_count_dict=word_count_dict_fisher) del speaker_dict print('---------- eval2000 (swbd + ch) ----------') speaker_dict_dict['eval2000_swbd'], speaker_dict_dict['eval2000_ch'] = read_stm( stm_path=path.stm_path, pem_path=path.pem_path, glm_path=path.glm_path, run_root_path='./') ######################################## # inputs ######################################## print('\n=> Processing input data...') input_save_path = mkdir_join( args.feature_save_path, args.save_format, data_size) for data_type in ['train', 'eval2000_swbd', 'eval2000_ch']: print('---------- %s ----------' % data_type) if isfile(join(input_save_path, data_type, 'complete.txt')): print('Already exists.') else: if args.save_format == 'wav': ######################################## # Split WAV files per utterance ######################################## if data_type == 'train': wav_paths = path.wav(corpus='swbd') if data_size == '2000h': wav_paths += path.wav(corpus='fisher') else: wav_paths = path.wav(corpus=data_type) split_wav(wav_paths=wav_paths, speaker_dict=speaker_dict_dict[data_type], save_path=mkdir_join(input_save_path, data_type)) # NOTE: ex.)
save_path: # swbd/feature/save_format/data_size/data_type/speaker/utt_name.npy elif args.save_format in ['numpy', 'htk']: if data_type == 'train': if args.tool == 'htk': audio_paths = path.htk(corpus='swbd') if data_size == '2000h': audio_paths += path.htk(corpus='fisher') else: audio_paths = path.wav(corpus='swbd') if data_size == '2000h': audio_paths += path.wav(corpus='fisher') is_training = True global_mean, global_std = None, None else: if args.tool == 'htk': audio_paths = path.htk(corpus=data_type) else: audio_paths = path.wav(corpus=data_type) is_training = False # Load statistics over train dataset global_mean = np.load( join(input_save_path, 'train/global_mean.npy')) global_std = np.load( join(input_save_path, 'train/global_std.npy')) read_audio(audio_paths=audio_paths, tool=args.tool, config=CONFIG, normalize=args.normalize, speaker_dict=speaker_dict_dict[data_type], is_training=is_training, save_path=mkdir_join(input_save_path, data_type), save_format=args.save_format, global_mean=global_mean, global_std=global_std) # NOTE: ex.) save_path: # swbd/feature/save_format/data_size/data_type/speaker/*.npy # Make a confirmation file to prove that dataset was saved # correctly with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f: f.write('') ######################################## # dataset (csv) ######################################## print('\n=> Saving dataset files...') dataset_save_path = mkdir_join( args.dataset_save_path, args.save_format, data_size, data_type) print('---------- %s ----------' % data_type) df_char = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_char_capital = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq1 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq5 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq10 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq15 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) with open(join(input_save_path, data_type, 'frame_num.pickle'), 'rb') as f: frame_num_dict = pickle.load(f) utt_count = 0 df_char_list, df_char_capital_list = [], [] df_word_freq1_list, df_word_freq5_list = [], [] df_word_freq10_list, df_word_freq15_list = [], [] speaker_dict = speaker_dict_dict[data_type] for speaker, utt_dict in tqdm(speaker_dict.items()): for utt_index, utt_info in utt_dict.items(): if args.save_format == 'numpy': input_utt_save_path = join( input_save_path, data_type, speaker, speaker + '_' + utt_index + '.npy') elif args.save_format == 'htk': input_utt_save_path = join( input_save_path, data_type, speaker, speaker + '_' + utt_index + '.htk') elif args.save_format == 'wav': input_utt_save_path = path.utt2wav(utt_index) else: raise ValueError('save_format is numpy or htk or wav.') assert isfile(input_utt_save_path) frame_num = frame_num_dict[speaker + '_' + utt_index] char_indices, char_indices_capital, word_freq1_indices = utt_info[2:5] word_freq5_indices, word_freq10_indices, word_freq15_indices = utt_info[5:8] series_char = pd.Series( [frame_num, input_utt_save_path, char_indices], index=df_char.columns) series_char_capital = pd.Series( [frame_num, input_utt_save_path, char_indices_capital], index=df_char_capital.columns) series_word_freq1 = pd.Series( [frame_num, input_utt_save_path, word_freq1_indices], index=df_word_freq1.columns) series_word_freq5 = pd.Series( [frame_num, input_utt_save_path, word_freq5_indices], 
index=df_word_freq5.columns) series_word_freq10 = pd.Series( [frame_num, input_utt_save_path, word_freq10_indices], index=df_word_freq10.columns) series_word_freq15 = pd.Series( [frame_num, input_utt_save_path, word_freq15_indices], index=df_word_freq15.columns) df_char = df_char.append(series_char, ignore_index=True) df_char_capital = df_char_capital.append( series_char_capital, ignore_index=True) df_word_freq1 = df_word_freq1.append( series_word_freq1, ignore_index=True) df_word_freq5 = df_word_freq5.append( series_word_freq5, ignore_index=True) df_word_freq10 = df_word_freq10.append( series_word_freq10, ignore_index=True) df_word_freq15 = df_word_freq15.append( series_word_freq15, ignore_index=True) utt_count += 1 # Reset if utt_count == 10000: df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) df_word_freq1_list.append(df_word_freq1) df_word_freq5_list.append(df_word_freq5) df_word_freq10_list.append(df_word_freq10) df_word_freq15_list.append(df_word_freq15) df_char = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_char_capital = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq1 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq5 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq10 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) df_word_freq15 = pd.DataFrame( [], columns=['frame_num', 'input_path', 'transcript']) utt_count = 0 # Last dataframe df_char_list.append(df_char) df_char_capital_list.append(df_char_capital) df_word_freq1_list.append(df_word_freq1) df_word_freq5_list.append(df_word_freq5) df_word_freq10_list.append(df_word_freq10) df_word_freq15_list.append(df_word_freq15) # Concatenate all dataframes df_char = df_char_list[0] df_char_capital = df_char_capital_list[0] df_word_freq1 = df_word_freq1_list[0] df_word_freq5 = df_word_freq5_list[0] df_word_freq10 = df_word_freq10_list[0] df_word_freq15 = df_word_freq15_list[0] for df_i in df_char_list[1:]: df_char = pd.concat([df_char, df_i], axis=0) for df_i in df_char_capital_list[1:]: df_char_capital = pd.concat([df_char_capital, df_i], axis=0) for df_i in df_word_freq1_list[1:]: df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0) for df_i in df_word_freq5_list[1:]: df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0) for df_i in df_word_freq10_list[1:]: df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0) for df_i in df_word_freq15_list[1:]: df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0) df_char.to_csv(join(dataset_save_path, 'character.csv')) df_char_capital.to_csv( join(dataset_save_path, 'character_capital_divide.csv')) df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv')) df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv')) df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv')) df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
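# --- Illustrative note (an assumption about newer pandas versions, not a
# change to the code above): DataFrame.append as used here was removed in
# pandas 2.0; the equivalent with a maintained API is
# df = pd.concat([df, series.to_frame().T], ignore_index=True)
# which is also what the per-chunk pd.concat flush above already amounts to.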