def preprocess_data(num_mfcc_coeffs, num_filters, window_len, window_step, max_num_frames):
    """Processes the training data and returns MFCC vectors for all of them.

    Reads paired wav files from the source/target speaker directories,
    extracts MFCC features, aligns the pairs with FastDTW, and pads each
    feature matrix to a fixed number of frames.

    Args:
        num_mfcc_coeffs: number of MFCC coefficients to keep per frame.
        num_filters: number of mel filterbank channels for the MFCC transform.
        window_len: analysis window length in seconds.
        window_step: step between successive analysis windows in seconds.
        max_num_frames: number of frames each MFCC matrix is padded to.

    Returns:
        inputs: list of padded source-accent MFCC matrices (at most 20).
        labels: list of padded target-accent MFCC matrices, index-aligned
            with inputs.
    """
    inputs = []
    labels = []

    SOURCE_DIR = '../data/cmu_arctic/us-english-female-slt/wav/'
    TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'

    index = 0
    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR), os.listdir(TARGET_DIR)):
        if index >= 20:
            break
        # Skip macOS metadata files WITHOUT consuming the 20-example budget
        # (previously index was incremented before this check, so .DS_Store
        # entries counted against the cap).
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue
        index += 1

        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data,
                                             samplerate=source_sample_rate,
                                             numcep=num_mfcc_coeffs,
                                             nfilt=num_filters,
                                             winlen=window_len,
                                             winstep=window_step))
        target_mfcc_features = np.array(mfcc(target_wav_data,
                                             samplerate=target_sample_rate,
                                             numcep=num_mfcc_coeffs,
                                             nfilt=num_filters,
                                             winlen=window_len,
                                             winstep=window_step))

        # Align the two feature sequences frame-by-frame with FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pad the MFCC feature matrices (rows) to max_num_frames.
        source_padded_frames = pad_sequence(source_mfcc_features, max_num_frames)
        target_padded_frames = pad_sequence(target_mfcc_features, max_num_frames)

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
def preprocess_data(num_mfcc_coeffs, max_num_frames):
    """Processes the training data and returns MFCC vectors for all of them.

    Reads paired wav files from the mini source/target directories, extracts
    MFCC features, aligns the pairs with FastDTW, pads them, and returns the
    examples in a random order.

    Args:
        num_mfcc_coeffs: number of MFCC coefficients to keep per frame.
        max_num_frames: number of frames each MFCC matrix is padded to.

    Returns:
        inputs: list of padded source-accent MFCC matrices, shuffled.
        labels: list of padded target-accent MFCC matrices, shuffled in the
            same order as inputs.
    """
    inputs = []
    labels = []
    input_masks = []
    label_masks = []

    SOURCE_DIR = '../data/cmu_arctic/mini_a/'
    TARGET_DIR = '../data/cmu_arctic/mini_b/'

    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR), os.listdir(TARGET_DIR)):
        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data,
                                             samplerate=source_sample_rate,
                                             numcep=num_mfcc_coeffs))
        target_mfcc_features = np.array(mfcc(target_wav_data,
                                             samplerate=target_sample_rate,
                                             numcep=num_mfcc_coeffs))

        # Aligns the MFCC feature matrices using FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pads the MFCC feature matrices (rows) to length max_num_frames.
        source_padded_frames, source_mask = pad_sequence(source_mfcc_features, max_num_frames)
        target_padded_frames, target_mask = pad_sequence(target_mfcc_features, max_num_frames)

        inputs.append(source_padded_frames)
        input_masks.append(source_mask)
        labels.append(target_padded_frames)
        label_masks.append(target_mask)

    # BUG FIX: random.shuffle requires a mutable sequence; on Python 3 a bare
    # range object raises TypeError.  Materialize it as a list first (this is
    # also correct on Python 2).
    randomized_indices = list(range(len(inputs)))
    random.shuffle(randomized_indices)

    inputs = [inputs[i] for i in randomized_indices]
    input_masks = [input_masks[i] for i in randomized_indices]
    labels = [labels[i] for i in randomized_indices]
    label_masks = [label_masks[i] for i in randomized_indices]

    # NOTE(review): input_masks/label_masks are built and shuffled but never
    # returned — confirm whether callers need them.
    return inputs, labels
def process_data(self, source_input_dir, target_input_dir):
    """Builds aligned, padded (source, target) MFCC pairs from two wav dirs.

    Args:
        source_input_dir: path (with trailing slash) to the source speaker's
            wav files.
        target_input_dir: path (with trailing slash) to the target speaker's
            wav files.

    Returns:
        inputs: list of padded source MFCC matrices.
        labels: list of padded target MFCC matrices, index-aligned with inputs.
    """
    inputs = []
    labels = []

    for source_fname, target_fname in zip(os.listdir(source_input_dir),
                                          os.listdir(target_input_dir)):
        # Skip macOS metadata files.
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue

        (source_sample_rate, source_wav_data) = wav.read(source_input_dir + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(target_input_dir + target_fname)

        # appendEnergy is False because we want to keep the 0th coefficient.
        source_mfcc_features = np.array(mfcc(source_wav_data,
                                             samplerate=source_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step,
                                             appendEnergy=False))
        target_mfcc_features = np.array(mfcc(target_wav_data,
                                             samplerate=target_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step,
                                             appendEnergy=False))

        # Aligns the MFCC feature matrices using FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pads the MFCC feature matrices (rows) to length config.max_num_frames.
        source_padded_frames, _ = pad_sequence(source_mfcc_features,
                                               self.config.max_num_frames)
        target_padded_frames, _ = pad_sequence(target_mfcc_features,
                                               self.config.max_num_frames)

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
def preprocess_data(self, config):
    """Processes the training data and returns aligned, padded feature pairs.

    Args:
        config: the Config object with various parameters specified.

    Returns:
        inputs: list of padded source-accent feature matrices (at most 500).
        labels: list of padded target-accent feature matrices, index-aligned
            with inputs.
    """
    inputs = []
    labels = []

    TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
    SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'

    index = 0
    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR), os.listdir(TARGET_DIR)):
        if index >= 500:
            break
        # Skip macOS metadata files WITHOUT consuming the 500-example budget
        # (previously index was incremented before this check, so .DS_Store
        # entries counted against the cap).
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue
        index += 1

        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        # Aligns the two sequences using FastDTW.
        # NOTE(review): raw waveform samples are passed here, not MFCC
        # matrices — confirm that get_dtw_series is meant to receive wav data
        # in this variant.
        source_features, target_features = get_dtw_series(
            source_wav_data, target_wav_data)

        # Pads the feature matrices (rows) to length config.max_num_frames.
        source_padded_frames = self.pad_sequence(source_features)
        target_padded_frames = self.pad_sequence(target_features)

        inputs.append(source_padded_frames)
        labels.append(target_padded_frames)

    return inputs, labels
def preprocess_data(self, config):
    """Processes the training data and returns MFCC vectors for all of them.

    Denoising-autoencoder variant: builds clean AND corrupted source MFCC
    inputs plus target MFCC labels, all DTW-aligned and padded, with masks.

    Args:
        config: the Config object with various parameters specified.

    NOTE(review): this block appears truncated — no return statement, and no
    labels/input_masks/label_masks appends are visible after the
    inputs_corrupted.append call.  Confirm the remainder against the
    original file before relying on this function.
    """
    inputs = []
    inputs_corrupted = []
    labels = []
    input_masks = []
    label_masks = []

    SOURCE_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'
    TARGET_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'

    # Iterate over the source/target wav directories pairwise; pairing relies
    # on os.listdir returning the two directories in corresponding order.
    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR), os.listdir(TARGET_DIR)):
        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data, samplerate=source_sample_rate, numcep=self.config.num_mfcc_coeffs))
        target_mfcc_features = np.array(mfcc(target_wav_data, samplerate=target_sample_rate, numcep=self.config.num_mfcc_coeffs))

        # Aligns the MFCC features matrices using FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(source_mfcc_features, target_mfcc_features)

        # Corrupt the input (source) MFCC features; corruption amount/type
        # come from the config (denoising-autoencoder training input).
        source_mfcc_features_corrupted = corrupt_input(source_mfcc_features, corr_frac=self.config.corr_frac, corr_type=self.config.corr_type)

        # Pads the MFCC feature matrices (rows) to length config.max_num_frames
        source_padded_frames, source_mask = pad_sequence(source_mfcc_features, config.max_num_frames)
        source_padded_frames_corrupted, _ = pad_sequence(source_mfcc_features_corrupted, config.max_num_frames)
        target_padded_frames, target_mask = pad_sequence(target_mfcc_features, config.max_num_frames)

        inputs.append(source_padded_frames)
        inputs_corrupted.append(source_padded_frames_corrupted)
def preprocess_data(self, config, SOURCE_DIR, TARGET_DIR=None):
    """Builds padded MFCC examples with binary accent labels for a classifier.

    Args:
        config: the Config object with various parameters specified.
        SOURCE_DIR: directory (with trailing slash) of source-accent wavs.
        TARGET_DIR: optional directory of target-accent wavs.  When given,
            source examples are labeled 0 and DTW-aligned target examples 1;
            when None, only SOURCE_DIR is read and every example is labeled 1.

    Returns:
        inputs: list of padded MFCC matrices.
        labels: list of int class labels, index-aligned with inputs.
    """
    inputs = []
    labels = []

    if TARGET_DIR:
        for source_fname, target_fname in zip(os.listdir(SOURCE_DIR), os.listdir(TARGET_DIR)):
            # Skip macOS metadata files.
            if source_fname == '.DS_Store' or target_fname == '.DS_Store':
                continue

            (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
            (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

            source_mfcc_features = np.array(mfcc(source_wav_data,
                                                 samplerate=source_sample_rate,
                                                 numcep=self.config.num_mfcc_coeffs,
                                                 nfilt=self.config.num_filters,
                                                 winlen=self.config.window_len,
                                                 winstep=self.config.window_step))
            target_mfcc_features = np.array(mfcc(target_wav_data,
                                                 samplerate=target_sample_rate,
                                                 numcep=self.config.num_mfcc_coeffs,
                                                 nfilt=self.config.num_filters,
                                                 winlen=self.config.window_len,
                                                 winstep=self.config.window_step))

            # Aligns the MFCC feature matrices using FastDTW.
            source_mfcc_features, target_mfcc_features = get_dtw_series(
                source_mfcc_features, target_mfcc_features)

            # Pads the MFCC feature matrices (rows) to length config.max_num_frames.
            source_padded_frames = self.pad_sequence(source_mfcc_features)
            target_padded_frames = self.pad_sequence(target_mfcc_features)

            # Class 0 = source accent, class 1 = target accent.
            inputs.append(source_padded_frames)
            inputs.append(target_padded_frames)
            labels.append(0)
            labels.append(1)
    else:
        for source_fname in os.listdir(SOURCE_DIR):
            # Skip macOS metadata files.
            if source_fname == '.DS_Store':
                continue

            (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)

            source_mfcc_features = np.array(mfcc(source_wav_data,
                                                 samplerate=source_sample_rate,
                                                 numcep=self.config.num_mfcc_coeffs,
                                                 nfilt=self.config.num_filters,
                                                 winlen=self.config.window_len,
                                                 winstep=self.config.window_step))

            # No DTW alignment in the single-directory case (nothing to align
            # against).  Pads the MFCC feature matrices (rows) to length
            # config.max_num_frames.
            source_padded_frames = self.pad_sequence(source_mfcc_features)

            # NOTE(review): single-directory examples are labeled 1 (the
            # "target" class) — confirm this is the intended label.
            inputs.append(source_padded_frames)
            labels.append(1)

    return inputs, labels
def preprocess_data(self, config):
    """Processes the training data and returns MFCC vectors for all of them.

    Reads paired wav files from the source/target speaker directories,
    extracts MFCC features, aligns the pairs with FastDTW, and pads each
    feature matrix to config.max_num_frames.

    Args:
        config: the Config object with various parameters specified.

    Returns:
        inputs: list of padded source-accent MFCC matrices (at most 20).
        input_masks: list of masks for inputs, index-aligned.
        labels: list of padded target-accent MFCC matrices, index-aligned.
        label_masks: list of masks for labels, index-aligned.
    """
    inputs = []
    labels = []
    input_masks = []
    label_masks = []

    SOURCE_DIR = '../data/cmu_arctic/scottish-english-male-awb/wav/'
    TARGET_DIR = '../data/cmu_arctic/us-english-male-bdl/wav/'

    index = 0
    for source_fname, target_fname in zip(os.listdir(SOURCE_DIR), os.listdir(TARGET_DIR)):
        if index >= 20:
            break
        # Skip macOS metadata files WITHOUT consuming the 20-example budget
        # (previously index was incremented before this check, so .DS_Store
        # entries counted against the cap).
        if source_fname == '.DS_Store' or target_fname == '.DS_Store':
            continue
        index += 1

        (source_sample_rate, source_wav_data) = wav.read(SOURCE_DIR + source_fname)
        (target_sample_rate, target_wav_data) = wav.read(TARGET_DIR + target_fname)

        source_mfcc_features = np.array(mfcc(source_wav_data,
                                             samplerate=source_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step))
        target_mfcc_features = np.array(mfcc(target_wav_data,
                                             samplerate=target_sample_rate,
                                             numcep=self.config.num_mfcc_coeffs,
                                             nfilt=self.config.num_filters,
                                             winlen=self.config.window_len,
                                             winstep=self.config.window_step))

        # Aligns the MFCC feature matrices using FastDTW.
        source_mfcc_features, target_mfcc_features = get_dtw_series(
            source_mfcc_features, target_mfcc_features)

        # Pads the MFCC feature matrices (rows) to length config.max_num_frames.
        source_padded_frames, source_mask = pad_sequence(
            source_mfcc_features, config.max_num_frames)
        target_padded_frames, target_mask = pad_sequence(
            target_mfcc_features, config.max_num_frames)

        inputs.append(source_padded_frames)
        input_masks.append(source_mask)
        labels.append(target_padded_frames)
        label_masks.append(target_mask)

    return inputs, input_masks, labels, label_masks