def main():

    for data_type in ['train', 'dev', 'test']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(args.feature_save_path, args.save_format)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if args.tool == 'htk':
                    audio_paths = path.htk(data_type=data_type)
                else:
                    audio_paths = path.wav(data_type=data_type)

                if data_type != 'train':
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(join(input_save_path, 'train/global_std_female.npy'))
                else:
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None

                # Read htk or wav files, and save input data and frame num dict
                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # timit/feature/save_format/data_type/*.npy

                # Make a confirmation file to prove that dataset was saved correctly
                with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f:
                    f.write('')

        ########################################
        # labels (character)
        ########################################
        print('\n=> Processing transcripts (char)...')
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if data_type == 'test' else False
        trans_dict = read_char(label_paths=path.trans(data_type=data_type),
                               vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
                               save_vocab_file=save_vocab_file,
                               is_test=is_test)

        ########################################
        # dataset (character, csv)
        ########################################
        print('\n=> Saving dataset files (char)...')
        dataset_save_path = mkdir_join(args.dataset_save_path, args.save_format, data_type)
        df_char = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_char_capital = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])

        for utt_name, [char_indices, char_indices_capital] in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError('save_format is numpy or htk or wav.')
            frame_num = input_utt.shape[0]
            del input_utt

            series_char = pd.Series(
                [frame_num, input_utt_save_path, char_indices], index=df_char.columns)
            series_char_capital = pd.Series(
                [frame_num, input_utt_save_path, char_indices_capital], index=df_char_capital.columns)

            df_char = df_char.append(series_char, ignore_index=True)
            df_char_capital = df_char_capital.append(series_char_capital, ignore_index=True)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(join(dataset_save_path, 'character_capital_divide.csv'))

        ########################################
        # labels (phone)
        ########################################
        print('\n=> Processing transcripts (phone)...')
        trans_dict = read_phone(label_paths=path.phone(data_type=data_type),
                                vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
                                save_vocab_file=save_vocab_file,
                                is_test=is_test)

        ########################################
        # dataset (phone, csv)
        ########################################
        print('\n=> Saving dataset files (phone)...')
        df_phone61 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_phone48 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_phone39 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])

        for utt_name, [phone61_indices, phone48_indices, phone39_indices] in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError('save_format is numpy or htk or wav.')
            frame_num = input_utt.shape[0]
            del input_utt

            series_phone61 = pd.Series(
                [frame_num, input_utt_save_path, phone61_indices], index=df_phone61.columns)
            series_phone48 = pd.Series(
                [frame_num, input_utt_save_path, phone48_indices], index=df_phone48.columns)
            series_phone39 = pd.Series(
                [frame_num, input_utt_save_path, phone39_indices], index=df_phone39.columns)

            df_phone61 = df_phone61.append(series_phone61, ignore_index=True)
            df_phone48 = df_phone48.append(series_phone48, ignore_index=True)
            df_phone39 = df_phone39.append(series_phone39, ignore_index=True)

        df_phone61.to_csv(join(dataset_save_path, 'phone61.csv'))
        df_phone48.to_csv(join(dataset_save_path, 'phone48.csv'))
        df_phone39.to_csv(join(dataset_save_path, 'phone39.csv'))
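# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the preprocessing script above: how one of
# the generated CSV files could be read back for training. The column names
# ('frame_num', 'input_path', 'transcript') come from the DataFrames above;
# the default csv_path and the function name are hypothetical and depend on
# the --dataset_save_path / --save_format arguments.
# ---------------------------------------------------------------------------
def _example_load_dataset_csv(csv_path='timit/dataset/numpy/train/character.csv'):
    df = pd.read_csv(csv_path, index_col=0)
    # Sorting by frame_num keeps utterances of similar length together, a
    # common trick for building mini-batches with little padding.
    df = df.sort_values(by='frame_num', ascending=True)
    for input_path, transcript in zip(df['input_path'], df['transcript']):
        feat = np.load(input_path)  # (frame_num, feature_dim) array saved by read_audio()
        yield feat, transcript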
def read_audio(audio_paths, tool, config, normalize, is_training,
               speaker_gender_dict, save_path=None, save_format=None,
               global_mean_male=None, global_mean_female=None,
               global_std_male=None, global_std_female=None,
               dtype=np.float32):
    """Read audio files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        tool (string): the tool used to extract features, htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is conducted
            global => normalize input features by global mean & std over the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        speaker_gender_dict (dict): a dictionary of speakers' gender information
            key (string) => speaker
            value (string) => F or M
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male speakers over the training set
        global_std_male (np.ndarray, optional): global standard deviation of male speakers over the training set
        global_mean_female (np.ndarray, optional): global mean of female speakers over the training set
        global_std_female (np.ndarray, optional): global standard deviation of female speakers over the training set
        dtype (optional): the data type, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male speakers over the training set
        global_std_male (np.ndarray): global standard deviation of male speakers over the training set
        global_mean_female (np.ndarray): global mean of female speakers over the training set
        global_std_female (np.ndarray): global standard deviation of female speakers over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_std_male is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError('normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError('tool must be "htk" or "python_speech_features" or "librosa".')

    audio_path_dict = {}
    audio_path_list_male, audio_path_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    # Loop 1: Divide all audio paths into speakers
    print('=====> Reading audio files...')
    for i, audio_path in enumerate(tqdm(audio_paths)):
        # ex.) audio_path: speaker-book-utt_index.***
        speaker, book, utt_index = basename(audio_path).split('.')[0].split('-')
        if speaker not in audio_path_dict.keys():
            audio_path_dict[speaker] = []
        audio_path_dict[speaker].append(audio_path)

        if is_training:
            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(
                    audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                    use_energy=config['energy'], use_delta1=config['delta'],
                    use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(
                    audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                    use_energy=config['energy'], use_delta1=config['delta'],
                    use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])

            input_utt_sum = np.sum(input_utt, axis=0)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt.shape[1]
                global_mean_male = np.zeros((feature_dim,), dtype=dtype)
                global_mean_female = np.zeros((feature_dim,), dtype=dtype)
                global_std_male = np.zeros((feature_dim,), dtype=dtype)
                global_std_female = np.zeros((feature_dim,), dtype=dtype)

            # For computing global mean
            if speaker_gender_dict[speaker] == 'M':
                audio_path_list_male.append(input_utt)
                global_mean_male += input_utt_sum
                total_frame_num_male += input_utt.shape[0]
            elif speaker_gender_dict[speaker] == 'F':
                audio_path_list_female.append(input_utt)
                global_mean_female += input_utt_sum
                total_frame_num_female += input_utt.shape[0]
            else:
                raise ValueError('gender is M or F.')

            # For computing speaker mean
            if normalize == 'speaker':
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    # Initialize speaker statistics
                    speaker_mean_dict[speaker] = np.zeros((feature_dim,), dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros((feature_dim,), dtype=dtype)
                speaker_mean_dict[speaker] += input_utt_sum
                total_frame_num_dict[speaker] += input_utt.shape[0]

    # Loop 2: Computing global mean and stddev
    if is_training and normalize != 'no':
        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
            if normalize == 'speaker':
                # Compute speaker mean
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

            for audio_path in audio_paths_speaker:
                speaker, book, utt_index = basename(audio_path).split('.')[0].split('-')

                # Read each audio file
                if tool == 'htk':
                    input_utt, sampPeriod, parmKind = read(audio_path)
                elif tool == 'python_speech_features':
                    input_utt = w2f_psf(
                        audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                        use_energy=config['energy'], use_delta1=config['delta'],
                        use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])
                elif tool == 'librosa':
                    input_utt = w2f_librosa(
                        audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                        use_energy=config['energy'], use_delta1=config['delta'],
                        use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])

                # For computing global stddev
                if speaker_gender_dict[speaker] == 'M':
                    global_std_male += np.sum(np.abs(input_utt - global_mean_male) ** 2, axis=0)
                elif speaker_gender_dict[speaker] == 'F':
                    global_std_female += np.sum(np.abs(input_utt - global_mean_female) ** 2, axis=0)
                else:
                    raise ValueError('gender is M or F.')

                if normalize == 'speaker':
                    # For computing speaker stddev
                    speaker_std_dict[speaker] += np.sum(
                        np.abs(input_utt - speaker_mean_dict[speaker]) ** 2, axis=0)

            if normalize == 'speaker':
                # Compute speaker stddev
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] / (total_frame_num_dict[speaker] - 1))

        # Compute global stddev per gender
        global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(global_std_female / (total_frame_num_female - 1))

        if save_path is not None:
            # Save global mean & std per gender
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_mean_female.npy'), global_mean_female)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_std_female.npy'), global_std_female)

    # Loop 3: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
        for audio_path in audio_paths_speaker:
            speaker, book, utt_index = basename(audio_path).split('.')[0].split('-')

            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(
                    audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                    use_energy=config['energy'], use_delta1=config['delta'],
                    use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(
                    audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                    use_energy=config['energy'], use_delta1=config['delta'],
                    use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set per gender
                if speaker_gender_dict[speaker] == 'M':
                    input_utt -= global_mean_male
                    input_utt /= global_std_male
                elif speaker_gender_dict[speaker] == 'F':
                    input_utt -= global_mean_female
                    input_utt /= global_std_female
                else:
                    raise ValueError('gender is M or F.')
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt -= speaker_mean_dict[speaker]
                input_utt /= speaker_std_dict[speaker]
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                input_name = basename(audio_path).split('.')[0]
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(save_path, speaker, input_name + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    write(input_utt,
                          htk_path=mkdir_join(save_path, speaker, input_name + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format is numpy or htk.')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_mean_female, global_std_male,
            global_std_female, frame_num_dict)
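# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: the two passes above
# (Loop 1 sums frames, Loop 2 sums squared deviations from the mean) amount to
# the following feature-wise mean/std computation over a list of
# (frame_num, feature_dim) arrays. The function name is hypothetical; the
# ddof=1 convention mirrors the (total_frame_num - 1) denominator above.
# ---------------------------------------------------------------------------
def _two_pass_mean_std(feature_mats, ddof=1):
    total_frames = sum(m.shape[0] for m in feature_mats)
    # Pass 1: accumulate per-dimension sums, then divide by the frame count
    mean = sum(m.sum(axis=0) for m in feature_mats) / total_frames
    # Pass 2: accumulate squared deviations around the (now fixed) mean
    sq_dev = sum(((m - mean) ** 2).sum(axis=0) for m in feature_mats)
    std = np.sqrt(sq_dev / (total_frames - ddof))
    return mean, std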
def main(data_size):

    for data_type in ['train', 'dev_clean', 'dev_other', 'test_clean', 'test_other']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(args.feature_save_path, args.save_format, data_size)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           speaker_gender_dict=path.speaker_gender_dict,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_mean_female=global_mean_female,
                           global_std_male=global_std_male,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # librispeech/feature/save_format/data_size/data_type/speaker/*.npy

                # Make a confirmation file to prove that dataset was saved correctly
                with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f:
                    f.write('')

        ########################################
        # labels
        ########################################
        print('\n=> Processing transcripts...')
        if data_type == 'train':
            label_paths = path.trans(data_type='train' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'test' in data_type else False

        speaker_dict = read_trans(label_paths=label_paths,
                                  data_size=data_size,
                                  vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
                                  save_vocab_file=save_vocab_file,
                                  is_test=is_test,
                                  data_type=data_type)

        ########################################
        # dataset (csv)
        ########################################
        print('\n=> Saving dataset files...')
        dataset_save_path = mkdir_join(args.dataset_save_path, args.save_format, data_size, data_type)

        df_char = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_char_capital = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq1 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq5 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq10 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq15 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])

        utt_count = 0
        df_char_list, df_char_capital_list = [], []
        df_word_freq1_list, df_word_freq5_list = [], []
        df_word_freq10_list, df_word_freq15_list = [], []

        for speaker, utt_dict in tqdm(speaker_dict.items()):
            for utt_name, indices_list in utt_dict.items():
                if args.save_format == 'numpy':
                    input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.npy')
                    assert isfile(input_utt_save_path)
                    input_utt = np.load(input_utt_save_path)
                elif args.save_format == 'htk':
                    input_utt_save_path = join(input_save_path, data_type, speaker, utt_name + '.htk')
                    assert isfile(input_utt_save_path)
                    input_utt, _, _ = read(input_utt_save_path)
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_name)
                    assert isfile(input_utt_save_path)
                    input_utt = w2f_psf(input_utt_save_path,
                                        feature_type=CONFIG['feature_type'],
                                        feature_dim=CONFIG['channels'],
                                        use_energy=CONFIG['energy'],
                                        use_delta1=CONFIG['delta'],
                                        use_delta2=CONFIG['deltadelta'],
                                        window=CONFIG['window'],
                                        slide=CONFIG['slide'])
                else:
                    raise ValueError('save_format is numpy or htk or wav.')
                frame_num = input_utt.shape[0]
                del input_utt

                char_indices, char_indices_capital, word_freq1_indices = indices_list[:3]
                word_freq5_indices, word_freq10_indices, word_freq15_indices = indices_list[3:6]

                series_char = pd.Series(
                    [frame_num, input_utt_save_path, char_indices], index=df_char.columns)
                series_char_capital = pd.Series(
                    [frame_num, input_utt_save_path, char_indices_capital], index=df_char_capital.columns)
                series_word_freq1 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq1_indices], index=df_word_freq1.columns)
                series_word_freq5 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq5_indices], index=df_word_freq5.columns)
                series_word_freq10 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq10_indices], index=df_word_freq10.columns)
                series_word_freq15 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq15_indices], index=df_word_freq15.columns)

                df_char = df_char.append(series_char, ignore_index=True)
                df_char_capital = df_char_capital.append(series_char_capital, ignore_index=True)
                df_word_freq1 = df_word_freq1.append(series_word_freq1, ignore_index=True)
                df_word_freq5 = df_word_freq5.append(series_word_freq5, ignore_index=True)
                df_word_freq10 = df_word_freq10.append(series_word_freq10, ignore_index=True)
                df_word_freq15 = df_word_freq15.append(series_word_freq15, ignore_index=True)
                utt_count += 1

                # Reset (flush the current dataframes so appends stay cheap)
                if utt_count == 50000:
                    df_char_list.append(df_char)
                    df_char_capital_list.append(df_char_capital)
                    df_word_freq1_list.append(df_word_freq1)
                    df_word_freq5_list.append(df_word_freq5)
                    df_word_freq10_list.append(df_word_freq10)
                    df_word_freq15_list.append(df_word_freq15)

                    df_char = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    df_char_capital = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq1 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq5 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq10 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq15 = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    utt_count = 0

        # Last dataframe
        df_char_list.append(df_char)
        df_char_capital_list.append(df_char_capital)
        df_word_freq1_list.append(df_word_freq1)
        df_word_freq5_list.append(df_word_freq5)
        df_word_freq10_list.append(df_word_freq10)
        df_word_freq15_list.append(df_word_freq15)

        # Concatenate all dataframes
        df_char = df_char_list[0]
        df_char_capital = df_char_capital_list[0]
        df_word_freq1 = df_word_freq1_list[0]
        df_word_freq5 = df_word_freq5_list[0]
        df_word_freq10 = df_word_freq10_list[0]
        df_word_freq15 = df_word_freq15_list[0]

        for df_i in df_char_list[1:]:
            df_char = pd.concat([df_char, df_i], axis=0)
        for df_i in df_char_capital_list[1:]:
            df_char_capital = pd.concat([df_char_capital, df_i], axis=0)
        for df_i in df_word_freq1_list[1:]:
            df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0)
        for df_i in df_word_freq5_list[1:]:
            df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0)
        for df_i in df_word_freq10_list[1:]:
            df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0)
        for df_i in df_word_freq15_list[1:]:
            df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(join(dataset_save_path, 'character_capital_divide.csv'))
        df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv'))
        df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv'))
        df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv'))
        df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
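# ---------------------------------------------------------------------------
# Note, not part of the original script: DataFrame.append() returns a new copy
# on every call, so the code above flushes to the df_*_list buffers every
# 50,000 utterances to keep each append cheap. An equivalent, simpler pattern
# is to collect plain dicts and build the DataFrame once; 'rows' below is a
# hypothetical list of such dicts with the same column names as above.
# ---------------------------------------------------------------------------
def _rows_to_dataframe(rows):
    # rows: list of {'frame_num': int, 'input_path': str, 'transcript': str}
    return pd.DataFrame(rows, columns=['frame_num', 'input_path', 'transcript'])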
def read_audio(audio_paths, speaker_dict, tool, config, normalize, is_training,
               save_path=None, save_format='numpy',
               global_mean_male=None, global_mean_female=None,
               global_std_male=None, global_std_female=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        speaker_dict (dict): dictionary of speakers
            key => speaker
            value => dictionary of utterance information of each speaker
                key => utterance index
                value => [start_frame, end_frame, trans_kana, trans_kanji]
        tool (string): the tool used to extract features, htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is conducted
            global => normalize input features by global mean & std over the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool, optional): Set True when processing the training set
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male speakers over the training set
        global_std_male (np.ndarray, optional): global standard deviation of male speakers over the training set
        global_mean_female (np.ndarray, optional): global mean of female speakers over the training set
        global_std_female (np.ndarray, optional): global standard deviation of female speakers over the training set
        dtype (optional): the data type, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male speakers over the training set
        global_std_male (np.ndarray): global standard deviation of male speakers over the training set
        global_mean_female (np.ndarray): global mean of female speakers over the training set
        global_std_female (np.ndarray): global standard deviation of female speakers over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_mean_female is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError('normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError('tool must be "htk" or "python_speech_features" or "librosa".')

    audio_path_list_male, audio_path_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict = {}
    # NOTE: each lecture is treated as a different speaker

    # Loop 1: Computing global mean and statistics
    if is_training and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = basename(audio_path).split('.')[0]

            # Divide each audio file into utterances
            _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment(
                audio_path, speaker, speaker_dict[speaker],
                is_training=True, sil_duration=0, tool=tool, config=config)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt_sum.shape[0]
                global_mean_male = np.zeros((feature_dim,), dtype=dtype)
                global_mean_female = np.zeros((feature_dim,), dtype=dtype)
                global_std_male = np.zeros((feature_dim,), dtype=dtype)
                global_std_female = np.zeros((feature_dim,), dtype=dtype)

            # For computing global mean
            if speaker[3] == 'M':
                audio_path_list_male.append(audio_path)
                global_mean_male += input_utt_sum
                total_frame_num_male += total_frame_num_speaker
            elif speaker[3] == 'F':
                audio_path_list_female.append(audio_path)
                global_mean_female += input_utt_sum
                total_frame_num_female += total_frame_num_speaker
            else:
                raise ValueError

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_mean_dict[speaker] = speaker_mean
                total_frame_num_dict[speaker] = total_frame_num_speaker
                # NOTE: speaker mean is already computed

        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for audio_path in tqdm(audio_paths):
            speaker = basename(audio_path).split('.')[0]

            # Divide each audio into utterances
            input_data_dict_speaker, _, _, _, _ = segment(
                audio_path, speaker, speaker_dict[speaker],
                is_training=True, sil_duration=0, tool=tool, config=config)

            # For computing global stddev
            if speaker[3] == 'M':
                for input_utt in input_data_dict_speaker.values():
                    global_std_male += np.sum(np.abs(input_utt - global_mean_male) ** 2, axis=0)
            elif speaker[3] == 'F':
                for input_utt in input_data_dict_speaker.values():
                    global_std_female += np.sum(np.abs(input_utt - global_mean_female) ** 2, axis=0)
            else:
                raise ValueError

        # Compute global stddev per gender
        global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(global_std_female / (total_frame_num_female - 1))

        if save_path is not None:
            # Save global mean & std per gender
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_mean_female.npy'), global_mean_female)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_std_female.npy'), global_std_female)

    # Loop 2: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = basename(audio_path).split('.')[0]

        if normalize == 'speaker' and is_training:
            speaker_mean = speaker_mean_dict[speaker]
        else:
            speaker_mean = None

        # Divide each audio into utterances
        input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment(
            audio_path, speaker, speaker_dict[speaker],
            is_training=is_training, sil_duration=0, tool=tool, config=config,
            mean=speaker_mean)  # for computing speaker stddev
        # NOTE: input_data_dict_speaker has not been normalized yet

        for utt_index, input_utt in input_data_dict_speaker.items():
            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set per gender
                if speaker[3] == 'M':
                    input_utt -= global_mean_male
                    input_utt /= global_std_male
                elif speaker[3] == 'F':
                    input_utt -= global_mean_female
                    input_utt /= global_std_female
                else:
                    raise ValueError
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt = (input_utt - speaker_mean) / speaker_std
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(
                        save_path, speaker, speaker + '_' + utt_index + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    if sampPeriod is None:
                        _, sampPeriod, parmKind = read(audio_path)
                    write(input_utt,
                          htk_path=mkdir_join(save_path, speaker, speaker + '_' + utt_index + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format is numpy or htk.')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_mean_female, global_std_male,
            global_std_female, frame_num_dict)
def read_audio(audio_paths, tool, config, normalize, is_training,
               save_path=None, save_format=None,
               global_mean_male=None, global_std_male=None,
               global_mean_female=None, global_std_female=None,
               dtype=np.float32):
    """Read audio files.
    Args:
        audio_paths (list): paths to audio files
        tool (string): the tool used to extract features, htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is conducted
            global => normalize input features by global mean & std over the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool, optional): Set True when processing the training set
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male speakers over the training set
        global_std_male (np.ndarray, optional): global standard deviation of male speakers over the training set
        global_mean_female (np.ndarray, optional): global mean of female speakers over the training set
        global_std_female (np.ndarray, optional): global standard deviation of female speakers over the training set
        dtype (optional): the data type, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male speakers over the training set
        global_std_male (np.ndarray): global standard deviation of male speakers over the training set
        global_mean_female (np.ndarray): global mean of female speakers over the training set
        global_std_female (np.ndarray): global standard deviation of female speakers over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_std_male is None:
            raise ValueError('Set global mean & std computed over the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError('normalize must be "utterance" or "speaker" or "global" or "no".')

    # Read each audio file
    print('=====> Reading audio files...')
    audio_paths_male, audio_paths_female = [], []
    input_data_list_male, input_data_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    for audio_path in tqdm(audio_paths):
        speaker = audio_path.split('/')[-2]
        gender = speaker[0]  # f (female) or m (male)
        utt_index = basename(audio_path).split('.')[0]

        if tool == 'htk':
            input_utt, sampPeriod, parmKind = read(audio_path)
            # NOTE: audio_path is an HTK file path in this case
        elif tool == 'python_speech_features':
            input_utt = w2f_psf(
                audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                use_energy=config['energy'], use_delta1=config['delta'],
                use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])
        elif tool == 'librosa':
            input_utt = w2f_librosa(
                audio_path, feature_type=config['feature_type'], feature_dim=config['channels'],
                use_energy=config['energy'], use_delta1=config['delta'],
                use_delta2=config['deltadelta'], window=config['window'], slide=config['slide'])
        # for debug
        # print(input_utt.shape)

        if gender == 'm':
            input_data_list_male.append(input_utt)
            audio_paths_male.append(audio_path)
        elif gender == 'f':
            input_data_list_female.append(input_utt)
            audio_paths_female.append(audio_path)
        else:
            raise ValueError('gender is m or f.')

        if is_training:
            speaker = audio_path.split('/')[-2]
            gender = speaker[0]
            frame_num_utt, feat_dim = input_utt.shape
            if gender == 'm':
                total_frame_num_male += frame_num_utt
            elif gender == 'f':
                total_frame_num_female += frame_num_utt
            else:
                raise ValueError('gender is m or f.')

            if normalize == 'speaker':
                # Initialization
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    speaker_mean_dict[speaker] = np.zeros((feat_dim,), dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros((feat_dim,), dtype=dtype)
                total_frame_num_dict[speaker] += frame_num_utt
                speaker_mean_dict[speaker] += np.sum(input_utt, axis=0)
    # NOTE: Load all data in advance because TIMIT is a small dataset.

    if is_training and normalize != 'no':
        # Compute speaker mean
        if normalize == 'speaker':
            for speaker in speaker_mean_dict.keys():
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

        # Compute global mean & std per gender
        print('=====> Computing global mean & std over the training set...')
        frame_offset = 0
        feat_dim = input_data_list_male[0].shape[1]
        train_data_male = np.empty((total_frame_num_male, feat_dim))
        train_data_female = np.empty((total_frame_num_female, feat_dim))

        # male
        for input_utt, audio_path in zip(tqdm(input_data_list_male), audio_paths_male):
            speaker = audio_path.split('/')[-2]
            frame_num_utt = input_utt.shape[0]
            train_data_male[frame_offset:frame_offset + frame_num_utt] = input_utt
            frame_offset += frame_num_utt
            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(input_utt - speaker_mean_dict[speaker]) ** 2, axis=0)

        # female
        frame_offset = 0
        for input_utt, audio_path in zip(tqdm(input_data_list_female), audio_paths_female):
            speaker = audio_path.split('/')[-2]
            frame_num_utt = input_utt.shape[0]
            train_data_female[frame_offset:frame_offset + frame_num_utt] = input_utt
            frame_offset += frame_num_utt
            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(input_utt - speaker_mean_dict[speaker]) ** 2, axis=0)

        # Compute speaker std
        if normalize == 'speaker':
            for speaker in speaker_std_dict.keys():
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] / (total_frame_num_dict[speaker] - 1))

        global_mean_male = np.mean(train_data_male, axis=0)
        global_std_male = np.std(train_data_male, axis=0)
        global_mean_female = np.mean(train_data_female, axis=0)
        global_std_female = np.std(train_data_female, axis=0)

        if save_path is not None:
            # Save global mean & std
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_mean_female.npy'), global_mean_female)
            np.save(join(save_path, 'global_std_female.npy'), global_std_female)

    # Save input features as npy files
    print('=====> Normalization...')
    frame_num_dict = {}
    for input_utt, audio_path in zip(tqdm(input_data_list_male + input_data_list_female),
                                     audio_paths_male + audio_paths_female):
        speaker = audio_path.split('/')[-2]
        utt_index = basename(audio_path).split('.')[0]
        gender = speaker[0]

        if normalize == 'no':
            pass
        elif normalize == 'global' or not is_training:
            # Normalize by global mean & std over the training set
            if gender == 'm':
                input_utt -= global_mean_male
                input_utt /= global_std_male
            elif gender == 'f':
                input_utt -= global_mean_female
                input_utt /= global_std_female
            else:
                raise ValueError('gender is m or f.')
        elif normalize == 'speaker':
            # Normalize by mean & std per speaker
            input_utt -= speaker_mean_dict[speaker]
            input_utt /= speaker_std_dict[speaker]
        elif normalize == 'utterance':
            # Normalize by mean & std per utterance
            utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
            utt_std = np.std(input_utt, axis=0, dtype=dtype)
            input_utt = (input_utt - utt_mean) / utt_std
        else:
            raise ValueError

        frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

        if save_path is not None:
            # Save input features
            if save_format == 'numpy':
                np.save(mkdir_join(save_path, speaker, speaker + '_' + utt_index + '.npy'), input_utt)
            elif save_format == 'htk':
                write(input_utt,
                      htk_path=mkdir_join(save_path, speaker, speaker + '_' + utt_index + '.htk'),
                      sampPeriod=sampPeriod,
                      parmKind=parmKind)
            else:
                raise ValueError('save_format is numpy or htk.')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_std_male, global_mean_female,
            global_std_female, frame_num_dict)
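# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: the three
# normalization modes applied above, written for a single (frame_num,
# feature_dim) matrix. For 'global' and 'speaker' the statistics are the
# precomputed mean/std arrays; for 'utterance' they are estimated from the
# matrix itself. The function name is hypothetical.
# ---------------------------------------------------------------------------
def _normalize_features(feat, mode, mean=None, std=None):
    if mode == 'no':
        return feat
    if mode == 'utterance':
        mean = np.mean(feat, axis=0)
        std = np.std(feat, axis=0)
    return (feat - mean) / std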
def main(data_size):

    speaker_dict_dict = {}  # dict of speaker_dict
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'eval' in data_type else False

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path, args.save_format, data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)
                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that dataset was saved correctly
            with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f:
                f.write('')

    ########################################
    # dataset (csv)
    ########################################
    print('\n=> Saving dataset files...')
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        dataset_save_path = mkdir_join(args.dataset_save_path, args.save_format, data_size, data_type)
        print('---------- %s ----------' % data_type)

        df_kanji = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_kana = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
        df_phone = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])

        utt_count = 0
        df_kanji_list, df_kana_list, df_phone_list = [], [], []

        for speaker, utt_dict in tqdm(speaker_dict_dict[data_type].items()):
            for utt_index, utt_info in utt_dict.items():
                trans_kanji, trans_kana, trans_phone = utt_info[2:]

                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker, speaker + '_' + utt_index + '.npy')
                    assert isfile(input_utt_save_path)
                    input_utt = np.load(input_utt_save_path)
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker, speaker + '_' + utt_index + '.htk')
                    assert isfile(input_utt_save_path)
                    input_utt, _, _ = read(input_utt_save_path)
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                    assert isfile(input_utt_save_path)
                    input_utt = w2f_psf(input_utt_save_path,
                                        feature_type=CONFIG['feature_type'],
                                        feature_dim=CONFIG['channels'],
                                        use_energy=CONFIG['energy'],
                                        use_delta1=CONFIG['delta'],
                                        use_delta2=CONFIG['deltadelta'],
                                        window=CONFIG['window'],
                                        slide=CONFIG['slide'])
                else:
                    raise ValueError('save_format is numpy or htk or wav.')
                frame_num = input_utt.shape[0]
                del input_utt

                series_kanji = pd.Series(
                    [frame_num, input_utt_save_path, trans_kanji], index=df_kanji.columns)
                series_kana = pd.Series(
                    [frame_num, input_utt_save_path, trans_kana], index=df_kana.columns)
                series_phone = pd.Series(
                    [frame_num, input_utt_save_path, trans_phone], index=df_phone.columns)

                df_kanji = df_kanji.append(series_kanji, ignore_index=True)
                df_kana = df_kana.append(series_kana, ignore_index=True)
                df_phone = df_phone.append(series_phone, ignore_index=True)
                utt_count += 1

                # Reset
                if utt_count == 50000:
                    df_kanji_list.append(df_kanji)
                    df_kana_list.append(df_kana)
                    df_phone_list.append(df_phone)

                    df_kanji = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    df_kana = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    df_phone = pd.DataFrame([], columns=['frame_num', 'input_path', 'transcript'])
                    utt_count = 0

        # Last dataframe
        df_kanji_list.append(df_kanji)
        df_kana_list.append(df_kana)
        df_phone_list.append(df_phone)

        # Concatenate all dataframes
        df_kanji = df_kanji_list[0]
        df_kana = df_kana_list[0]
        df_phone = df_phone_list[0]
        for df_i in df_kanji_list[1:]:
            df_kanji = pd.concat([df_kanji, df_i], axis=0)
        for df_i in df_kana_list[1:]:
            df_kana = pd.concat([df_kana, df_i], axis=0)
        for df_i in df_phone_list[1:]:
            df_phone = pd.concat([df_phone, df_i], axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'dataset_kanji.csv'))
        df_kana.to_csv(join(dataset_save_path, 'dataset_kana.csv'))
        df_phone.to_csv(join(dataset_save_path, 'dataset_phone.csv'))

        # Use the first 4000 utterances as the dev set
        if data_type == 'train':
            df_kanji[:4000].to_csv(mkdir_join(
                args.dataset_save_path, args.save_format, data_size, 'dev', 'dataset_kanji.csv'))
            df_kana[:4000].to_csv(mkdir_join(
                args.dataset_save_path, args.save_format, data_size, 'dev', 'dataset_kana.csv'))
            df_phone[:4000].to_csv(mkdir_join(
                args.dataset_save_path, args.save_format, data_size, 'dev', 'dataset_phone.csv'))
def read_audio(audio_paths, speaker_dict, tool, config, normalize, is_training,
               save_path=None, save_format=None,
               global_mean=None, global_std=None, dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        speaker_dict (dict): a dictionary of speakers
            key (string) => speaker
            value (dict) => dictionary of utterance information of each speaker
                key (string) => utterance index
                value (list) => [start_frame, end_frame, transcript]
        tool (string): the tool used to extract features, htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is conducted
            global => normalize input features by global mean & std over the training set
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean (np.ndarray, optional): global mean over the training set
        global_std (np.ndarray, optional): global standard deviation over the training set
        dtype (optional): the data type, default is np.float32
    Returns:
        global_mean (np.ndarray): global mean over the training set
        global_std (np.ndarray): global standard deviation over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean is None or global_std is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError('normalize must be "utterance" or "speaker" or "global" or "no".')

    total_frame_num = 0
    total_frame_num_dict = {}
    speaker_mean_dict = {}

    # Loop 1: Computing global mean and statistics
    if is_training and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = basename(audio_path).split('.')[0]

            # Fix speaker name
            speaker = speaker.replace('sw0', 'sw')  # ex.) sw04771-A => sw4771-A (LDC97S62)
            speaker = speaker.replace('sw_', 'sw')  # ex.) sw_4771-A => sw4771-A (eval2000, swbd)
            speaker = speaker.replace('en_', 'en')  # ex.) en_4156-A => en4156-A (eval2000, ch)

            # Divide each audio file into utterances
            _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment(
                audio_path, speaker, speaker_dict[speaker],
                is_training=True, sil_duration=0, tool=tool, config=config)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt_sum.shape[0]
                global_mean = np.zeros((feature_dim,), dtype=dtype)
                global_std = np.zeros((feature_dim,), dtype=dtype)

            global_mean += input_utt_sum
            total_frame_num += total_frame_num_speaker

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_mean_dict[speaker] = speaker_mean
                total_frame_num_dict[speaker] = total_frame_num_speaker
                # NOTE: speaker mean is already computed

        print('=====> Computing global mean & stddev...')
        # Compute global mean
        global_mean /= total_frame_num

        for audio_path in tqdm(audio_paths):
            speaker = basename(audio_path).split('.')[0]

            # Normalize speaker name
            speaker = speaker.replace('sw0', 'sw')
            speaker = speaker.replace('sw_', 'sw')
            speaker = speaker.replace('en_', 'en')

            # Divide each audio into utterances
            input_data_dict_speaker, _, _, _, _ = segment(
                audio_path, speaker, speaker_dict[speaker],
                is_training=True, sil_duration=0, tool=tool, config=config)

            # For computing global stddev
            for input_utt in input_data_dict_speaker.values():
                global_std += np.sum(np.abs(input_utt - global_mean) ** 2, axis=0)

        # Compute global stddev
        global_std = np.sqrt(global_std / (total_frame_num - 1))

        if save_path is not None:
            # Save global mean & std
            np.save(join(save_path, 'global_mean.npy'), global_mean)
            np.save(join(save_path, 'global_std.npy'), global_std)

    # Loop 2: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = basename(audio_path).split('.')[0]

        # Normalize speaker name
        speaker = speaker.replace('sw0', 'sw')
        speaker = speaker.replace('sw_', 'sw')
        speaker = speaker.replace('en_', 'en')

        if normalize == 'speaker' and is_training:
            speaker_mean = speaker_mean_dict[speaker]
        else:
            speaker_mean = None

        # Divide each audio into utterances
        input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment(
            audio_path, speaker, speaker_dict[speaker],
            is_training=is_training, sil_duration=0, tool=tool, config=config,
            mean=speaker_mean)  # for computing speaker stddev
        # NOTE: input_data_dict_speaker has not been normalized yet

        for utt_index, input_utt in input_data_dict_speaker.items():
            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set
                input_utt -= global_mean
                input_utt /= global_std
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt = (input_utt - speaker_mean) / speaker_std
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(
                        save_path, speaker, speaker + '_' + utt_index + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    if sampPeriod is None:
                        _, sampPeriod, parmKind = read(audio_path)
                    write(input_utt,
                          htk_path=mkdir_join(save_path, speaker, speaker + '_' + utt_index + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format is numpy or htk.')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return global_mean, global_std, frame_num_dict
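# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the original script: the artifacts saved by
# read_audio() above can be loaded back as shown below when preparing an
# evaluation set. The file names match the np.save()/pickle.dump() calls
# above; the function name and train_feature_dir argument are hypothetical.
# ---------------------------------------------------------------------------
def _load_train_statistics(train_feature_dir):
    global_mean = np.load(join(train_feature_dir, 'global_mean.npy'))
    global_std = np.load(join(train_feature_dir, 'global_std.npy'))
    with open(join(train_feature_dir, 'frame_num.pickle'), 'rb') as f:
        frame_num_dict = pickle.load(f)
    return global_mean, global_std, frame_num_dict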