def find_longest_sequence(data_dir, speakers_list, basenames_list):
    """Return the frame count of the longest sequence over all speakers.

    For every speaker/basename pair, the file
    `<data_dir>/vocoded_s2s/<speaker>/<basename>.lf0.dat` is read with
    `parse_file` (called with a parameter count of 1) and the size of its
    first dimension is taken as the sequence length.

    # Arguments
        data_dir: directory of data to be used in the datatable.
        speakers_list: list of speakers to be used
        basenames_list: list of filenames to be used

    # Returns
        An integer with the number of frames of the longest sequence
        (0 when no file is visited)
    """
    # Seed with 0 so that empty speaker/basename lists still yield 0
    frame_counts = [0]

    for speaker in speakers_list:
        for basename in basenames_list:
            lf0_path = path_join(data_dir, 'vocoded_s2s', speaker,
                                 basename + '.lf0.dat')
            frame_counts.append(parse_file(1, lf0_path).shape[0])

    return max(frame_counts)
def find_longest_sequence(data_dir, speakers_list, basenames_list):
    """Find the number of speech frames from the longest sequence among all
    speakers

    For every speaker/basename pair, the file
    `<data_dir>vocoded_s2s/<speaker>/<basename>.lf0.dat` is read with
    `parse_file` and the size of its first dimension is taken as the
    sequence length.

    # Arguments
        data_dir: directory of data to be used in the datatable.
            This path must end in a '/' (paths are built by plain string
            concatenation). A hint is printed when it does not; execution
            then continues with the path as given.
        speakers_list: list of speakers to be used
        basenames_list: list of filenames to be used

    # Returns
        An integer with the number of frames of the longest sequence
        (0 when no file is visited)
    """
    # Check that data_dir ends in a '/'.
    # A plain test is used instead of try/assert: assertions are stripped
    # under `python -O`, and indexing the last character raised an uncaught
    # IndexError when data_dir was an empty string.
    if not data_dir.endswith('/'):
        print("Please, make sure the data directory string ends with a '/'")

    longest_sequence = 0

    for speaker in speakers_list:
        for basename in basenames_list:
            params = parse_file(
                1,
                data_dir + 'vocoded_s2s/' + speaker + '/' + basename +
                '.lf0.dat')

            longest_sequence = max(longest_sequence, params.shape[0])

    return longest_sequence
def seq2seq_build_file_table(source_dir, src_index, target_dir, trg_index,
                             basename, longest_seq):
    """Build a datatable from the vocoded parameters of a sequence from a
    source-target pair of speakers

    # Arguments
        source_dir: directory path to the source files
        src_index: index (0-9) of the source speaker in the speakers list
        target_dir: directory path to the target files
        trg_index: index (0-9) of the target speaker in the speakers list
        basename: name without extension of the file's params to be prepared
        longest_seq: number of frames of the longest sequence in the database

        All directory paths must end in '/' (file paths are built by plain
        string concatenation). A hint is printed for each directory that
        does not; execution then continues with the path as given.

    # Returns
        A tuple `(source_params, source_mask, target_params, target_mask)`:

        - Zero-padded (by frames) source and target datatables
        - Source and target mask vectors indicating which frames are padded
          (0) and which of them are original from the data (1). The source
          mask places the padding first, the target mask places it last.

        The mask vectors are to be used in Keras' fit method

    # Raises
        AssertionError: if a `vf` file and its `lf0` file differ in shape,
            or if the two masks end up with different shapes.
    """
    # Check that source_dir and target_dir end in a '/'.
    # Plain tests replace the former try/assert blocks: assertions are
    # stripped under `python -O`, and indexing `[-1]` raised an uncaught
    # IndexError on an empty string.
    if not source_dir.endswith('/'):
        print(
            "Please make sure the source data directory string ends with a '/'"
        )

    if not target_dir.endswith('/'):
        print(
            "Please make sure the target data directory string ends with a '/'"
        )

    # Parse parameter files ('.i.dat' files hold the interpolated data)
    source_mcp = parse_file(40, source_dir + basename + '.mcp.dat')
    source_f0 = parse_file(1, source_dir + basename + '.lf0.dat')
    source_f0_i = parse_file(1, source_dir + basename + '.lf0.i.dat')
    source_vf = parse_file(1, source_dir + basename + '.vf.dat')
    source_vf_i = parse_file(1, source_dir + basename + '.vf.i.dat')

    target_mcp = parse_file(40, target_dir + basename + '.mcp.dat')
    target_f0 = parse_file(1, target_dir + basename + '.lf0.dat')
    target_f0_i = parse_file(1, target_dir + basename + '.lf0.i.dat')
    target_vf = parse_file(1, target_dir + basename + '.vf.dat')
    target_vf_i = parse_file(1, target_dir + basename + '.vf.i.dat')

    # Build voiced/unvoiced flag arrays
    # The flags are:
    #   1 -> voiced
    #   0 -> unvoiced
    # The non-interpolated lf0 files are only used for this shape check.
    assert source_vf.shape == source_f0.shape
    source_voiced = np.empty(source_vf.shape)
    for index, vf in enumerate(source_vf):
        source_voiced[index] = 1 - kronecker_delta(vf)

    assert target_vf.shape == target_f0.shape
    target_voiced = np.empty(target_vf.shape)
    for index, vf in enumerate(target_vf):
        target_voiced[index] = 1 - kronecker_delta(vf)

    # Initialize End-Of-Sequence flag (set only on the last frame)
    src_eos_flag = np.zeros(source_vf.shape)
    src_eos_flag[-1, :] = 1

    trg_eos_flag = np.zeros(target_vf.shape)
    trg_eos_flag[-1, :] = 1

    # Initialize one-hot-encoded speaker indexes.
    # The class count is passed positionally so that the call does not
    # depend on the keyword's name (`nb_classes` in the original call).
    src_spk_index = to_categorical(
        src_index * np.ones((source_vf.shape[0], ), dtype=int), 10)
    trg_spk_index = to_categorical(
        trg_index * np.ones((target_vf.shape[0], ), dtype=int), 10)

    # Initialize padding masks, to be passed into keras' fit
    # Source mask: padding frames (0) first, then the real frames (1)
    source_mask = np.concatenate(
        (np.zeros((longest_seq - source_mcp.shape[0], 1)),
         np.ones((source_mcp.shape[0], 1))))

    # Target mask: real frames (1) first, then the padding frames (0)
    target_mask = np.concatenate(
        (np.ones((target_mcp.shape[0], 1)),
         np.zeros((longest_seq - target_mcp.shape[0], 1))))

    assert source_mask.shape == target_mask.shape

    # Concatenate zero-padded source and target params
    source_params = np.concatenate(
        (zero_pad_params(longest_seq, 'src', source_mcp),
         zero_pad_params(longest_seq, 'src', source_f0_i),
         zero_pad_params(longest_seq, 'src', source_vf_i),
         zero_pad_params(longest_seq, 'src', source_voiced),
         zero_pad_params(longest_seq, 'src', src_eos_flag),
         zero_pad_params(longest_seq, 'src', src_spk_index),
         zero_pad_params(longest_seq, 'src', trg_spk_index)),
        axis=1)

    target_params = np.concatenate(
        (zero_pad_params(longest_seq, 'trg', target_mcp),
         zero_pad_params(longest_seq, 'trg', target_f0_i),
         zero_pad_params(longest_seq, 'trg', target_vf_i),
         zero_pad_params(longest_seq, 'trg', target_voiced),
         zero_pad_params(longest_seq, 'trg', trg_eos_flag)),
        axis=1)

    return source_params, source_mask, target_params, target_mask
import numpy as np
from tfglib.construct_table import parse_file

# NOTE(review): `argv` is used below but no import of it is visible in this
# fragment - confirm that `from sys import argv` exists above.

# Define usage constant
# (the space before '[number of parameters]' was missing, which glued the
# last two placeholders together in the printed usage)
usage = ('Usage: normalize.py [file_directory] [filename] '
         '[number of parameters]')

if len(argv) == 1 or (len(argv) == 2 and argv[1] in ('-h', '--help')):
    print(usage + "\n\nThe file directory must end in a '/'.")

elif len(argv) == 4:
    # Parse input file
    data = parse_file(int(argv[3]), argv[1] + argv[2])

    # Normalize data (per column: zero mean, unit standard deviation)
    data_mean = np.mean(data, axis=0)
    data_std = np.std(data, axis=0)

    normalized_data = (data - data_mean) / data_std

    # Output normalized data to file.
    # The first column is left out of the output, as in the original slice
    # `1:normalized_data.shape[1]`.
    np.savetxt(argv[1] + argv[2] + '.norm',
               normalized_data[:, 1:],
               fmt='%.18f',
               delimiter='\t')

else:
    # Three arguments are required (len(argv) == 4 counts the script name)
    exit('Please, input three arguments as indicated in the usage.\n\n' +
         usage)
# Created by albert aparicio on 14/11/16
# coding: utf-8
# This script takes a .frames.txt file from SPTK's dtw output and converts
# it to time frames (5 ms frames)
#
# This script expects the path to the .frames.txt file to be passed as an
# argument

import os.path
from sys import argv

import numpy as np
from tfglib.construct_table import parse_file

# Check that a file parameter has been passed.
# An explicit exit replaces the former `assert`, which is stripped under
# `python -O` and gave no hint about the expected argument.
if len(argv) != 2:
    raise SystemExit(
        'Please, pass the path to the .frames.txt file as the only argument.')

# Parse frames and convert them into time
frames = 0.005 * parse_file(2, argv[1])

# Keep the directory untouched and cut only the file name at its first '.'.
# Splitting the whole path at '.' truncated paths such as './x.frames.txt'
# or 'corpus.v2/x.frames.txt' at a dot that belongs to the directory part.
directory, filename = os.path.split(argv[1])
stem = filename.split('.')[0]

# Output the resulting frames
np.savetxt(os.path.join(directory, stem + '.frames.dtw'),
           frames,
           fmt='%.3f',
           delimiter='\t',
           header='REF\tTST\n----\t----')
def seq2seq_build_file_table(self, source_dir, src_index, target_dir,
                             trg_index, basename):
    """Build a datatable from the vocoded parameters of a sequence from a
    source-target pair of speakers

    Reads `self.shortseq` to choose between splitting the sequence into
    overlapping chunks and zero-padding the whole sequence, and
    `self.max_seq_length` as the chunk length / padded length.

    # Arguments
        source_dir: directory path to the source files
        src_index: index (0-9) of the source speaker in the speakers list
        target_dir: directory path to the target files
        trg_index: index (0-9) of the target speaker in the speakers list
        basename: name without extension of the file's params to be prepared

    # Returns
        A tuple `(source_params, source_mask, source_seq_len,
        target_params, target_mask, target_seq_len)`:

        - Source and target datatables (chunked when `self.shortseq` is
          set, zero-padded by frames otherwise)
        - Source and target mask vectors indicating which frames are padded
          (0) and which of them are original from the data (1)
        - Source and target sequence lengths as returned by
          `sliding_window`; both are None when `self.shortseq` is not set

        The mask vectors are to be used in Keras' fit method

    # Raises
        AssertionError: if a `vf` file and its `lf0` file differ in shape,
            or (non-shortseq mode) if the two masks differ in shape.
    """
    # Parse parameter files.
    # Both sides load the same set of files: extension -> parameter count
    param_lengths = {'mcp': 40, 'lf0': 1, 'lf0.i': 1, 'vf': 1, 'vf.i': 1}
    directories = {'source': source_dir, 'target': target_dir}

    params_dict = {}
    for origin, directory in directories.items():
        params_dict[origin] = {}

        for extension, param_len in param_lengths.items():
            params_dict[origin][extension] = parse_file(
                param_len,
                path_join(directory, basename + '.' + extension + '.dat'))

    # Build voiced/unvoiced flag arrays
    # The flags are:
    #   1 -> voiced
    #   0 -> unvoiced
    for origin in ('source', 'target'):
        origin_params = params_dict[origin]

        assert origin_params['vf'].shape == origin_params['lf0'].shape

        origin_params['uv'] = np.empty(origin_params['vf'].shape,
                                       dtype=np.uint8)
        for index, vf in enumerate(origin_params['vf']):
            origin_params['uv'][index] = int(1 - kronecker_delta(vf))

    if self.shortseq:
        split_params = {}

        # - Split parameter vectors into chunks of size self.max_seq_length,
        #   with a superposition of self.max_seq_length/2
        # - Per the original notes: the last sub-sequence is padded with
        #   zeros, and masks contain True for valid values and False for
        #   padded values (behavior of sliding_window, not visible here)
        # - The padding mode follows the origin ('source' / 'target'); an
        #   earlier variant hardcoded mode='source'
        for origin, param_types in params_dict.items():
            split_params[origin] = {}

            for param_type, parameters in param_types.items():
                chunks, mask, seq_len = sliding_window(
                    parameters,
                    self.max_seq_length,
                    mode=origin,
                    step=int(self.max_seq_length / 2))

                split_params[origin][param_type] = {
                    'params': chunks,
                    'mask': mask,
                    'seq_len': seq_len,
                }

        src_split = split_params['source']
        trg_split = split_params['target']

        # Initialize an EOS flag vector for each sub-sequence
        src_split['eos'] = np.zeros(src_split['vf']['params'].shape,
                                    dtype=np.uint8)
        src_split['eos'][:, -1] = 1

        trg_split['eos'] = np.zeros(trg_split['vf']['params'].shape,
                                    dtype=np.uint8)
        trg_split['eos'][:, -1] = 1

        # Assign a speaker index to each sub-sequence.
        # The builtin `int` is used as dtype: the `np.int` alias was
        # deprecated and later removed from NumPy, and it only ever
        # referred to the builtin.
        src_split['src_spk'] = src_index * np.ones(src_split['eos'].shape,
                                                   dtype=int)
        src_split['trg_spk'] = trg_index * np.ones(src_split['eos'].shape,
                                                   dtype=int)

        source_params = np.concatenate(
            (
                src_split['mcp']['params'],
                src_split['lf0.i']['params'],
                src_split['vf.i']['params'],
                src_split['uv']['params'],
                src_split['eos'],
                src_split['src_spk'],
                src_split['trg_spk'],
            ),
            axis=2)

        source_mask = src_split['vf.i']['mask']
        source_seq_len = src_split['vf.i']['seq_len']

        target_params = np.concatenate(
            (
                trg_split['mcp']['params'],
                trg_split['lf0.i']['params'],
                trg_split['vf.i']['params'],
                trg_split['uv']['params'],
                trg_split['eos'],
            ),
            axis=2)

        target_mask = trg_split['vf.i']['mask']
        target_seq_len = trg_split['vf.i']['seq_len']

    else:
        src_params = params_dict['source']
        trg_params = params_dict['target']

        # Initialize End-Of-Sequence flag (set only on the last frame)
        src_eos_flag = np.zeros(src_params['vf'].shape)
        src_eos_flag[-1, :] = 1

        trg_eos_flag = np.zeros(trg_params['vf'].shape)
        trg_eos_flag[-1, :] = 1

        # Initialize one-hot-encoded speaker indexes
        src_spk_index = to_categorical(
            src_index * np.ones((src_params['vf'].shape[0], ), dtype=int),
            10)
        trg_spk_index = to_categorical(
            trg_index * np.ones((trg_params['vf'].shape[0], ), dtype=int),
            10)

        # Initialize padding masks, to be passed into keras' fit
        # Source mask: padding frames (0) first, then the real frames (1)
        source_mask = np.concatenate(
            (np.zeros(
                (self.max_seq_length - src_params['mcp'].shape[0], 1)),
             np.ones((src_params['mcp'].shape[0], 1))))

        # Target mask: real frames (1) first, then the padding frames (0)
        target_mask = np.concatenate(
            (np.ones((trg_params['mcp'].shape[0], 1)),
             np.zeros(
                 (self.max_seq_length - trg_params['mcp'].shape[0], 1))))

        assert source_mask.shape == target_mask.shape

        # Concatenate zero-padded source and target params
        source_params = np.concatenate(
            (zero_pad_params(self.max_seq_length, 'src', src_params['mcp']),
             zero_pad_params(self.max_seq_length, 'src',
                             src_params['lf0.i']),
             zero_pad_params(self.max_seq_length, 'src',
                             src_params['vf.i']),
             zero_pad_params(self.max_seq_length, 'src', src_params['uv']),
             zero_pad_params(self.max_seq_length, 'src', src_eos_flag),
             zero_pad_params(self.max_seq_length, 'src', src_spk_index),
             zero_pad_params(self.max_seq_length, 'src', trg_spk_index)),
            axis=1)

        target_params = np.concatenate(
            (zero_pad_params(self.max_seq_length, 'trg', trg_params['mcp']),
             zero_pad_params(self.max_seq_length, 'trg',
                             trg_params['lf0.i']),
             zero_pad_params(self.max_seq_length, 'trg',
                             trg_params['vf.i']),
             zero_pad_params(self.max_seq_length, 'trg', trg_params['uv']),
             zero_pad_params(self.max_seq_length, 'trg', trg_eos_flag)),
            axis=1)

        # Sequence lengths are only produced in short-sequence mode
        source_seq_len = target_seq_len = None

    # TODO Return basename with sequence(s)
    return (source_params, source_mask, source_seq_len, target_params,
            target_mask, target_seq_len)
# Load basenames #
##################
# The file is read inside a `with` block so that the handle is closed as
# soon as the lines are loaded (it was previously left open).
with open('data/test/basenames.list', 'r') as basenames_file:
    basenames_lines = basenames_file.readlines()

# Strip '\n' characters
basenames = [line.split('\n')[0] for line in basenames_lines]

###################
# Loop over files #
###################
for basename in basenames:
    ###################
    # Load parameters #
    ###################
    mcp_params = parse_file(
        40, 'data/test/vocoded/SF1/' + basename + '.mcp.dat')
    lf0_params = parse_file(
        1, 'data/test/vocoded/SF1/' + basename + '.lf0.i.dat')
    mvf_params = parse_file(
        1, 'data/test/vocoded/SF1/' + basename + '.vf.i.dat')

    # Compute U/V flags
    assert mvf_params.shape == lf0_params.shape
    uv_flags = np.empty(mvf_params.shape)
    # Iterate over the input frames rather than over the still
    # uninitialized output array (both have the same shape)
    for index, vf in enumerate(mvf_params):
        uv_flags[index] = 1 - utils.kronecker_delta(vf)

    # Prepare data for prediction: normalize with the source statistics and
    # reshape to the layout produced by utils.reshape_lstm
    mcp_params = (mcp_params - src_mcp_mean) / src_mcp_std
    mcp_params = utils.reshape_lstm(mcp_params, mcp_tsteps, mcp_data_dim)
def pretrain_save_data_parameters(
        data_dir,
        speakers_file='speakers.list',
        params_file='pretrain_params.h5',
):
    """Scan the pretraining corpus and store its global parameters

    For every speaker listed in `speakers_file`, the speaker's directory
    under `data_dir` is walked and, for each distinct file basename found,
    the `.lf0_log`, `.cc` and `.i.fv` files are parsed. The function tracks
    the longest sequence (in frames of the `.lf0_log` file) and the
    per-speaker, per-column maximum and minimum of the concatenated
    parameters, and writes them, together with the list of basenames, to
    an HDF5 file.

    # Arguments
        data_dir: root directory of the corpus; it holds the speakers file
            and one sub-directory per speaker, and receives the output file
        speakers_file: name of the text file (inside `data_dir`) listing
            one speaker directory per line
        params_file: name of the HDF5 file (inside `data_dir`) to write

    # Returns
        A tuple `(longest_sequence, spk_max, spk_min, files_list)`:

        - longest_sequence: longest frame count found, multiplied by 1.7
          and floored to an int. Note that the value stored in the HDF5
          file is the unscaled one.
        - spk_max, spk_min: arrays of shape (number of sub-directories of
          `data_dir`, 42) with the running maximum / minimum per speaker.
          Rows start at 0 (max) and 1e+50 (min).
        - files_list: list of basename paths (without extension) processed
    """
    # Save processing start time
    start_time = time()

    print('Starting')

    longest_sequence = 0
    files_list = []

    # One row per sub-directory of data_dir.
    # NOTE(review): rows are indexed by position in the speakers file, so
    # this presumably expects one directory per listed speaker - confirm.
    num_spk = len([entry for entry in scandir(data_dir) if entry.is_dir()])

    # 42 columns = 40 (.cc) + 1 (.lf0_log) + 1 (.i.fv), assuming parse_file
    # returns as many columns as its first argument - TODO confirm
    spk_max = np.zeros((num_spk, 42))
    spk_min = 1e+50 * np.ones((num_spk, 42))

    # Read the speakers list inside a `with` block so that the handle is
    # closed right away (it was previously left open)
    with open(os.path.join(data_dir, speakers_file), 'r') as speakers_handle:
        speakers = speakers_handle.readlines()

    # Strip '\n' characters
    dirs = [line.split('\n')[0] for line in speakers]

    print("Processing speakers' data")
    for spk_index, a_dir in enumerate(dirs):
        for sub_root, _, sub_files in os.walk(os.path.join(data_dir, a_dir)):
            # Get basenames of files in directory; the set removes the
            # duplicates produced by the several extensions of each basename
            basenames = list({
                os.path.join(sub_root, file_name.split('.')[0])
                for file_name in sub_files
            })

            files_list += basenames

            for basename in basenames:
                print('Processing ' + basename)

                lf0_params = parse_file(1, basename + '.lf0_log')

                if lf0_params.shape[0] > longest_sequence:
                    longest_sequence = lf0_params.shape[0]

                mcp_params = parse_file(40, basename + '.cc')
                mvf_params = parse_file(1, basename + '.i.fv')

                seq_params = np.concatenate(
                    (mcp_params, lf0_params, mvf_params), axis=1)

                # Compute maximum and minimum values
                spk_max[spk_index, :] = np.maximum(
                    spk_max[spk_index, :], np.ma.max(seq_params, axis=0))
                spk_min[spk_index, :] = np.minimum(
                    spk_min[spk_index, :], np.ma.min(seq_params, axis=0))

    print('Saving data to .h5 file')

    # The `with` block closes the file; no explicit close() is needed
    with File(os.path.join(data_dir, params_file), 'w') as f:
        # Save longest_sequence and the max and min values as attributes
        f.attrs.create('longest_sequence', longest_sequence, dtype=int)
        f.attrs.create('speakers_max', spk_max)
        f.attrs.create('speakers_min', spk_min)

        # TODO Support Python 2
        # sys.version_info -> Get running Python version
        dt = special_dtype(vlen=str)

        utf_list = [
            n.encode(encoding="utf-8", errors="ignore") for n in files_list
        ]
        f.create_dataset(name='files_list',
                         shape=(len(utf_list), 1),
                         data=utf_list,
                         dtype=dt)

    print('Elapsed time: ' + display_time(time() - start_time))

    # The returned length is enlarged by a factor of 1.7; the HDF5 file
    # keeps the unscaled value written above
    longest_sequence = int(np.floor(longest_sequence * 1.7))

    return longest_sequence, spk_max, spk_min, files_list
def prepare_pretrain_slice(files_list,
                           params_path,
                           longest_sequence,
                           spk_max,
                           spk_min,
                           speakers_file='speakers.list',
                           dtw_prob_file='dtw_probabilities.h5',
                           basename_len=11,
                           shuffle_files=True,
                           replicate=True):
    """Endlessly generate pretraining slices, one sequence at a time

    All parameter files of `files_list` are loaded into memory once; the
    generator then loops forever over the files (reshuffling their order
    before each pass when `shuffle_files` is set) and yields one
    max-min-normalized, padded sequence per iteration.

    # Arguments
        files_list: list of basename paths (without extension) whose
            `.cc`, `.lf0_log`, `.i.fv` and `.lf0_log.uv_mask` files are read
        params_path: directory holding `speakers_file` and `dtw_prob_file`
        longest_sequence: number of frames every yielded array is padded to
        spk_max: per-speaker maximum values, indexed by speaker index
        spk_min: per-speaker minimum values, indexed by speaker index
        speakers_file: name of the text file listing one speaker per line
        dtw_prob_file: name of the HDF5 file with the 'values' and
            'probabilities' datasets handed to `replicate_frames`
        basename_len: length of the file name at the end of each basename
            path; the two characters before the separator preceding it are
            taken as the speaker name
        shuffle_files: whether to shuffle the file order before each pass
        replicate: whether to build the slice with `replicate_frames`
            instead of plain zero-padding

    # Yields
        A 7-tuple `(src_res[:, 0:44], src_res[:, 44:45].reshape(-1),
        src_res[:, 45:46].reshape(-1), feedback_data, trg_res[:, 0:42],
        trg_res[:, 42:44], trg_mask)`, where `feedback_data` is `trg_res`
        delayed by one frame with an all-zero first frame.
        Without replication, and assuming the U/V mask has a single column
        (TODO confirm), the columns of a slice are: 42 normalized
        parameters, U/V flag, EOS flag, and the speaker index twice.

    # Raises
        ValueError: if the speaker name extracted from a basename is not in
            the speakers list.
    """
    # Read the speakers list inside a `with` block so that the handle is
    # closed right away (it was previously left open)
    with open(os.path.join(params_path, speakers_file), 'r') as spk_handle:
        # Strip '\n' characters
        speakers = [line.split('\n')[0] for line in spk_handle.readlines()]

    # The `with` block closes the file; no explicit close() is needed
    with File(os.path.join(params_path, dtw_prob_file), 'r') as f:
        # Save numbers and probabilities
        values = f['values'][:]
        probabilities = f['probabilities'][:]

    # Read all files to have them loaded in memory
    mcp_params = []
    lf0_params = []
    mvf_params = []
    uv_flags = []

    for basename in files_list:
        mcp_params.append(parse_file(40, basename + '.cc'))
        lf0_params.append(parse_file(1, basename + '.lf0_log'))
        mvf_params.append(parse_file(1, basename + '.i.fv'))
        uv_flags.append(parse_file(1, basename + '.lf0_log.uv_mask'))

    # Initialize file indexes
    indexes = np.arange(len(files_list))

    while True:
        if shuffle_files:
            # Shuffle file indexes before each epoch
            np.random.shuffle(indexes)

        # Iterate over shuffled files
        for file_index in indexes:
            # Compute speaker index from the two characters that precede
            # the separator and the file name at the end of the path
            basename = files_list[file_index]
            spk_index = speakers.index(
                str(basename[-(basename_len + 3):-(basename_len + 1)]))

            # Get max and min values for each speaker
            src_spk_max = spk_max[spk_index, :]
            src_spk_min = spk_min[spk_index, :]

            # Maxmin normalize
            src_normalized = (np.concatenate(
                (mcp_params[file_index], lf0_params[file_index],
                 mvf_params[file_index]),
                axis=1) - src_spk_min) / (src_spk_max - src_spk_min)

            num_frames = lf0_params[file_index].shape[0]

            # Repeat the integer speaker index once per frame (the index is
            # not one-hot encoded here)
            spk_index_vector = np.repeat(spk_index, num_frames, axis=0)

            # Construct End-Of-Sequence flags (set only on the last frame)
            eos_flags = np.zeros(num_frames)
            eos_flags[-1] = 1

            # Construct sequence "slice"; the speaker index appears twice
            seq_params = np.concatenate(
                (
                    src_normalized,
                    uv_flags[file_index],
                    np.reshape(eos_flags, (-1, 1)),
                    np.reshape(spk_index_vector, (-1, 1)),
                    np.reshape(spk_index_vector, (-1, 1)),
                ),
                axis=1)

            if replicate:
                # Replicate frames with dtw probabilities
                # TODO Change the function so it takes seq_params as
                # separate args
                (src_res, trg_res, _, trg_mask) = replicate_frames(
                    seq_params, longest_sequence, values, probabilities)

            else:
                padding_frames = longest_sequence - seq_params.shape[0]

                # Zero-pad after the real frames; the target keeps only the
                # first 44 columns, and the mask marks the real frames
                src_res = np.concatenate(
                    (seq_params,
                     np.zeros((padding_frames, seq_params.shape[1]))))

                trg_res = np.concatenate(
                    (seq_params[:, 0:44], np.zeros((padding_frames, 44))))

                trg_mask = np.concatenate(
                    (np.ones((seq_params.shape[0], 1)),
                     np.zeros((padding_frames, 1))))

            # Prepare feedback data: the target delayed by one frame, with
            # the wrapped-around first frame cleared
            feedback_data = np.roll(trg_res, 1, axis=0)
            feedback_data[0, :] = 0

            # Return slice frames
            yield (src_res[:, 0:44], src_res[:, 44:45].reshape((-1)),
                   src_res[:, 45:46].reshape((-1)), feedback_data,
                   trg_res[:, 0:42], trg_res[:, 42:44], trg_mask)
# Fresh per-file containers for this source->target speaker pair
fpr[src_spk][trg_spk] = {}
tpr[src_spk][trg_spk] = {}
roc_auc[src_spk][trg_spk] = {}

# Offset added to the file index when addressing trg_test_datatable
roc_row_offset = (src_spk_ind + trg_spk_ind) * len(basenames)

# for i in range(src_test_datatable.shape[0]):
for i, roc_basename in enumerate(basenames):
    # TODO Consider plotting an averaged ROC for each spk combination
    print(src_spk + '->' + trg_spk + ' ' + roc_basename)

    # TODO figure out if this is necessary
    # fpr[src_spk][trg_spk][basenames[i]] = dict()
    # tpr[src_spk][trg_spk][basenames[i]] = dict()
    # roc_auc[src_spk][trg_spk][basenames[i]] = dict()

    # Load raw U/V flags
    raw_uv = parse_file(
        1, 'data/test/s2s_predicted/' + src_spk + '-' + trg_spk + '/' +
        roc_basename + '.uv.dat')

    # Round U/V flags
    rounded_uv = np.round(raw_uv)

    # Compute ROC curve and the area under it; column 42 of the target
    # datatable is passed as the first roc_curve argument
    roc_reference = trg_test_datatable[i + roc_row_offset, :, 42]
    roc_fpr, roc_tpr, _ = roc_curve(roc_reference, rounded_uv)

    fpr[src_spk][trg_spk][roc_basename] = roc_fpr
    tpr[src_spk][trg_spk][roc_basename] = roc_tpr
    roc_auc[src_spk][trg_spk][roc_basename] = auc(roc_fpr, roc_tpr)