def run_atom_synth(self, file_id_list, synth_output, hparams): """ Reconstruct lf0, get mgc and bap data, and store all in files in self.synth_dir. """ # Get mgc, vuv and bap data either through a trained acoustic model or from data extracted from the audio. if hparams.synth_acoustic_model_path is None: full_output = self.load_extracted_audio_features( synth_output, hparams) else: self.logger.warning("This method is untested.") full_output = self.generate_audio_features(file_id_list, hparams) # Reconstruct lf0 from generated atoms and write it to synth output. recon_dict = self.get_recon_from_synth_output(synth_output, hparams) for id_name, lf0 in recon_dict.items(): full_sample = full_output[id_name] len_diff = len(full_sample) - len(lf0) full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, int(len_diff / 2), reverse=True) full_sample = WorldFeatLabelGen.trim_end_sample( full_sample, len_diff - int(len_diff / 2)) vuv = np.ones(lf0.shape) vuv[lf0 <= math.log(WorldFeatLabelGen.f0_silence_threshold)] = 0.0 full_sample[:, hparams.num_coded_sps] = lf0 full_sample[:, hparams.num_coded_sps + 1] = vuv return full_output
def synthesize(self, id_list, synth_output, hparams):
    """Add the phrase curve to the predicted lf0, overwrite the lf0/vuv
    columns of the extracted WORLD features with it, and run the WORLD
    synthesiser on the result."""
    # Reconstruct lf0 from generated atoms and write it to synth output.
    # recon_dict = self.get_recon_from_synth_output(synth_output)
    features_by_id = dict()

    for id_name, labels in synth_output.items():
        # Take lf0 and vuv from network output.
        lf0 = labels[:, 0]
        vuv = labels[:, 1]

        # Add the utterance's phrase curve back onto the predicted lf0.
        phrase_curve = self.OutputGen.get_phrase_curve(id_name)
        lf0 = lf0 + phrase_curve[:len(lf0)].squeeze()

        # Harden the voicing decision in place at a 0.5 threshold.
        vuv[vuv < 0.5] = 0.0
        vuv[vuv >= 0.5] = 1.0

        # Only synthesis from features extracted from the audio is supported.
        if hparams.synth_acoustic_model_path is not None:
            raise NotImplementedError()

        if hasattr(hparams, "world_dir") and hparams.world_dir is not None:
            world_dir = hparams.world_dir
        else:
            world_dir = os.path.join(self.OutputGen.dir_labels,
                                     self.dir_extracted_acoustic_features)
        # Load extracted data.
        sample: np.ndarray = WorldFeatLabelGen.load_sample(
            id_name, world_dir, add_deltas=False,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap)

        # Trim the extracted features so their length matches the prediction:
        # half of the surplus from the front, the rest from the end.
        surplus = len(sample) - len(lf0)
        front = surplus // 2
        sample = WorldFeatLabelGen.trim_end_sample(sample, surplus - front)
        sample = WorldFeatLabelGen.trim_end_sample(sample, front, reverse=True)

        # Overwrite lf0 and vuv by network output.
        sample[:, hparams.num_coded_sps] = lf0
        sample[:, hparams.num_coded_sps + 1] = vuv

        # Fill a dictionary with the samples.
        features_by_id[id_name + "_E2E"] = sample

    # Run the merlin synthesizer.
    Synthesiser.run_world_synth(features_by_id, hparams)
def synthesize(self, id_list, synth_output, hparams):
    """ Synthesise LF0 from atoms. The run_atom_synth function either loads the original acoustic features or uses an acoustic model to predict them. """
    full_output = self.run_atom_synth(id_list, synth_output, hparams)

    for id_name, labels in full_output.items():
        # Interpolate the reconstructed lf0 over unvoiced regions.
        lf0 = labels[:, -3]
        lf0, _ = interpolate_lin(lf0)
        # NOTE(review): the interpolated lf0 is never written back into
        # labels — confirm whether this is intentional.
        # Voicing flag predicted by the network (second output channel).
        vuv = synth_output[id_name][:, 0, 1]
        # Align the feature length to the vuv length by trimming front + end.
        len_diff = len(labels) - len(vuv)
        labels = WorldFeatLabelGen.trim_end_sample(labels,
                                                   int(len_diff / 2),
                                                   reverse=True)
        labels = WorldFeatLabelGen.trim_end_sample(labels,
                                                   len_diff - int(len_diff / 2))
        # NOTE(review): `labels` is rebound by the trims and never stored back
        # into full_output; this assignment only affects the returned dict if
        # trim_end_sample yields views into the original arrays — confirm.
        labels[:, -2] = vuv

    # Run the vocoder.
    ModelTrainer.synthesize(self, id_list, full_output, hparams)
def synthesize(self, id_list, synth_output, hparams):
    """Save output of model to .lf0 and (.vuv) files and call Merlin synth which reads those files."""
    # Reconstruct lf0 from generated atoms and write it to synth output.
    # recon_dict = self.get_recon_from_synth_output(synth_output)
    vocoder_input = dict()

    for id_name, labels in synth_output.items():
        # Take lf0 and vuv from network output.
        lf0 = labels[:, 0]
        vuv = labels[:, 1]

        # Harden the voicing decision in place at a 0.5 threshold.
        vuv[vuv < 0.5] = 0.0
        vuv[vuv >= 0.5] = 1.0

        # Only synthesis from features extracted from the audio is supported.
        if hparams.synth_acoustic_model_path is not None:
            raise NotImplementedError()

        if hasattr(hparams, "world_dir") and hparams.world_dir is not None:
            world_dir = hparams.world_dir
        else:
            world_dir = os.path.realpath(
                os.path.join(hparams.out_dir,
                             self.dir_extracted_acoustic_features))
        # Load extracted data.
        sample: np.ndarray = WorldFeatLabelGen.load_sample(
            id_name, world_dir, add_deltas=False,
            num_coded_sps=hparams.num_coded_sps)

        # Trim the extracted features so their length matches the prediction:
        # half of the surplus from the front, the rest from the end.
        surplus = len(sample) - len(lf0)
        front = surplus // 2
        sample = WorldFeatLabelGen.trim_end_sample(sample, surplus - front)
        sample = WorldFeatLabelGen.trim_end_sample(sample, front, reverse=True)

        # Overwrite lf0 and vuv by network output.
        sample[:, hparams.num_coded_sps] = lf0
        sample[:, hparams.num_coded_sps + 1] = vuv

        # Fill a dictionary with the samples.
        vocoder_input[id_name + "_E2E_Phrase"] = sample

    # Run the vocoder.
    ModelTrainer.synthesize(self, id_list, vocoder_input, hparams)
def synthesize(self, id_list, synth_output, hparams):
    """ Depending on hparams override the network output with the extracted features, then continue with normal synthesis pipeline. """
    # Only touch the network output when at least one original stream is requested.
    if hparams.synth_load_org_sp\
            or hparams.synth_load_org_lf0\
            or hparams.synth_load_org_vuv\
            or hparams.synth_load_org_bap:
        for id_name in id_list:
            # Directory with the features extracted from the original audio.
            world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
                else os.path.join(self.OutputGen.dir_labels, self.dir_extracted_acoustic_features)
            labels = WorldFeatLabelGen.load_sample(
                id_name, world_dir, num_coded_sps=hparams.num_coded_sps)

            # If the extracted features are longer than the network output,
            # trim half of the surplus from the front and the rest from the end.
            len_diff = len(labels) - len(synth_output[id_name])
            if len_diff > 0:
                labels = WorldFeatLabelGen.trim_end_sample(labels,
                                                           int(len_diff / 2),
                                                           reverse=True)
                labels = WorldFeatLabelGen.trim_end_sample(
                    labels, len_diff - int(len_diff / 2))

            # Copy the requested original streams over the network output.
            # Layout (from the indexing below): leading num_coded_sps columns
            # hold the spectrum, the last three columns are lf0, vuv, bap.
            if hparams.synth_load_org_sp:
                synth_output[
                    id_name][:len(labels), :self.OutputGen.
                             num_coded_sps] = labels[:, :self.OutputGen.
                                                     num_coded_sps]
            if hparams.synth_load_org_lf0:
                synth_output[id_name][:len(labels), -3] = labels[:, -3]
            if hparams.synth_load_org_vuv:
                synth_output[id_name][:len(labels), -2] = labels[:, -2]
            if hparams.synth_load_org_bap:
                synth_output[id_name][:len(labels), -1] = labels[:, -1]

    # Run the vocoder.
    ModelTrainer.synthesize(self, id_list, synth_output, hparams)
def gen_figure_from_output(self, id_name, labels, hidden, hparams):
    """Plot the raw network output, the post-processed peaks, the target
    atoms, and the reconstructed lf0 curves for one utterance, then save the
    figure next to the model output.

    :param id_name:  Utterance id; used to load reference features and to
                     name the output file.
    :param labels:   Raw network output, one column per theta.
    :param hidden:   Unused here; kept for the common gen_figure interface.
    :param hparams:  Hyper-parameter container (thetas, k, frame_size_ms,
                     min_atom_amp, model_name, out_dir, gen_figure_ext, ...).
    """
    if labels.ndim < 2:
        labels = np.expand_dims(labels, axis=1)
    labels_post = self.OutputGen.postprocess_sample(labels,
                                                    identify_peaks=True,
                                                    peak_range=100)
    lf0 = self.OutputGen.labels_to_lf0(labels_post, hparams.k)
    lf0, vuv = interpolate_lin(lf0)
    # np.bool was removed in NumPy 1.24; the builtin bool is the documented
    # replacement and behaves identically here.
    vuv = vuv.astype(bool)

    # Load original lf0 and vuv.
    world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
        else os.path.join(self.OutputGen.dir_labels, self.dir_extracted_acoustic_features)
    org_labels = WorldFeatLabelGen.load_sample(
        id_name, world_dir, num_coded_sps=hparams.num_coded_sps)
    _, original_lf0, original_vuv, _ = WorldFeatLabelGen.convert_to_world_features(
        org_labels, num_coded_sps=hparams.num_coded_sps)
    original_lf0, _ = interpolate_lin(original_lf0)
    original_vuv = original_vuv.astype(bool)

    # Subtract the phrase curve so the plot shows only the atom contribution.
    phrase_curve = np.fromfile(os.path.join(
        self.OutputGen.dir_labels, id_name + self.OutputGen.ext_phrase),
        dtype=np.float32).reshape(-1, 1)
    original_lf0 -= phrase_curve

    # NOTE(review): these two trims remove len_diff + 1 frames in total
    # (int(len_diff / 2.0) from the end, int(len_diff / 2.0) + 1 from the
    # front) — confirm the off-by-one is intended.
    len_diff = len(original_lf0) - len(lf0)
    original_lf0 = WorldFeatLabelGen.trim_end_sample(
        original_lf0, int(len_diff / 2.0))
    original_lf0 = WorldFeatLabelGen.trim_end_sample(original_lf0,
                                                     int(len_diff / 2.0) + 1,
                                                     reverse=True)

    # Load and align the target atom labels, then convert them to atoms.
    org_labels = self.OutputGen.load_sample(id_name,
                                            self.OutputGen.dir_labels,
                                            len(hparams.thetas))
    org_labels = self.OutputGen.trim_end_sample(org_labels,
                                                int(len_diff / 2.0))
    org_labels = self.OutputGen.trim_end_sample(org_labels,
                                                int(len_diff / 2.0) + 1,
                                                reverse=True)
    org_atoms = self.OutputGen.labels_to_atoms(
        org_labels, k=hparams.k, frame_size=hparams.frame_size_ms)

    # Get a data plotter.
    net_name = os.path.basename(hparams.model_name)
    filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
    plotter = DataPlotter()
    plotter.set_title(id_name + " - " + net_name)

    # Grid 0: raw network output, one curve per theta.
    graphs_output = list()
    grid_idx = 0
    for idx in reversed(range(labels.shape[1])):
        graphs_output.append(
            (labels[:, idx],
             r'$\theta$=' + "{0:.3f}".format(hparams.thetas[idx])))
    plotter.set_label(grid_idx=grid_idx,
                      xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                      ylabel='NN output')
    plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_output)
    # plotter.set_lim(grid_idx=0, ymin=-1.8, ymax=1.8)
    grid_idx += 1

    # Grid 1: post-processed peaks with unvoiced regions shaded.
    graphs_peaks = list()
    for idx in reversed(range(labels_post.shape[1])):
        graphs_peaks.append((labels_post[:, idx, 0], ))
    plotter.set_label(grid_idx=grid_idx,
                      xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                      ylabel='NN post-processed')
    plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_peaks)
    plotter.set_area_list(grid_idx=grid_idx,
                          area_list=[(np.invert(vuv), '0.8', 1.0)])
    plotter.set_lim(grid_idx=grid_idx, ymin=-1.8, ymax=1.8)
    grid_idx += 1

    # Grid 2: target atom labels with reference unvoiced regions shaded.
    graphs_target = list()
    for idx in reversed(range(org_labels.shape[1])):
        graphs_target.append((org_labels[:, idx, 0], ))
    plotter.set_label(grid_idx=grid_idx,
                      xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                      ylabel='target')
    plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_target)
    plotter.set_area_list(grid_idx=grid_idx,
                          area_list=[(np.invert(original_vuv), '0.8', 1.0)])
    plotter.set_lim(grid_idx=grid_idx, ymin=-1.8, ymax=1.8)
    grid_idx += 1

    # Grid 3: lf0 reconstructed from wcad atoms, the original lf0, and the
    # lf0 reconstructed from the predicted atoms.
    output_atoms = AtomLabelGen.labels_to_atoms(
        labels_post, hparams.k, hparams.frame_size_ms,
        amp_threshold=hparams.min_atom_amp)
    wcad_lf0 = AtomLabelGen.atoms_to_lf0(org_atoms, len(labels))
    output_lf0 = AtomLabelGen.atoms_to_lf0(output_atoms, len(labels))
    graphs_lf0 = list()
    graphs_lf0.append((wcad_lf0, "wcad lf0"))
    graphs_lf0.append((original_lf0, "org lf0"))
    graphs_lf0.append((output_lf0, "predicted lf0"))
    plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_lf0)
    plotter.set_area_list(grid_idx=grid_idx,
                          area_list=[(np.invert(original_vuv), '0.8', 1.0)])
    plotter.set_label(grid_idx=grid_idx,
                      xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                      ylabel='lf0')
    # Symmetric y-limits with 10% headroom around the larger reconstruction.
    amp_lim = max(np.max(np.abs(wcad_lf0)),
                  np.max(np.abs(output_lf0))) * 1.1
    plotter.set_lim(grid_idx=grid_idx, ymin=-amp_lim, ymax=amp_lim)
    plotter.set_linestyles(grid_idx=grid_idx, linestyles=[':', '--', '-'])

    # plotter.set_lim(xmin=300, xmax=1100)
    plotter.gen_plot()
    plotter.save_to_file(filename + ".BASE" + hparams.gen_figure_ext)
def gen_figure_phrase(self, hparams, ids_input):
    """Plot the reference lf0 against the predicted phrase curve for every
    utterance in ids_input and save one figure per utterance; also logs the
    F0 RMSE between reference and phrase curve over voiced frames.

    :param hparams:    Hyper-parameter container (batch_size_gen_figure,
                       num_coded_sps, num_bap, model_name, out_dir, ...).
    :param ids_input:  Anything ModelTrainer._input_to_str_list accepts.
    """
    id_list = ModelTrainer._input_to_str_list(ids_input)
    model_output, model_output_post = self._forward_batched(
        hparams,
        id_list,
        hparams.batch_size_gen_figure,
        synth=False,
        benchmark=False,
        gen_figure=False)

    for id_name, outputs_post in model_output_post.items():
        if outputs_post.ndim < 2:
            outputs_post = np.expand_dims(outputs_post, axis=1)

        # Column 0 is lf0, column 1 the voicing flag.
        lf0 = outputs_post[:, 0]
        output_lf0, _ = interpolate_lin(lf0)
        output_vuv = outputs_post[:, 1]
        output_vuv[output_vuv < 0.5] = 0.0
        output_vuv[output_vuv >= 0.5] = 1.0
        # np.bool was removed in NumPy 1.24; the builtin bool is the
        # documented replacement and behaves identically here.
        output_vuv = output_vuv.astype(bool)

        # Load original lf0 and vuv.
        world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
            else os.path.join(hparams.out_dir, self.dir_extracted_acoustic_features)
        org_labels = WorldFeatLabelGen.load_sample(
            id_name,
            world_dir,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap)[:len(output_lf0)]
        _, original_lf0, original_vuv, _ = WorldFeatLabelGen.convert_to_world_features(
            org_labels,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap)
        original_lf0, _ = interpolate_lin(original_lf0)
        original_vuv = original_vuv.astype(bool)

        phrase_curve = np.fromfile(os.path.join(
            self.flat_trainer.atom_trainer.OutputGen.dir_labels,
            id_name + self.OutputGen.ext_phrase),
            dtype=np.float32).reshape(-1, 1)[:len(original_lf0)]

        # F0 RMSE between reference and phrase curve over voiced frames only
        # (the vuv mask zeroes unvoiced frames in the sum).
        f0_mse = (np.exp(original_lf0.squeeze(-1))
                  - np.exp(phrase_curve.squeeze(-1)))**2
        f0_rmse = math.sqrt(
            (f0_mse * original_vuv[:len(output_lf0)]).sum()
            / original_vuv[:len(output_lf0)].sum())
        self.logger.info("RMSE of {} phrase curve: {} Hz.".format(
            id_name, f0_rmse))

        # NOTE(review): the two trims remove len_diff + 1 frames in total —
        # confirm the off-by-one is intended.
        len_diff = len(original_lf0) - len(lf0)
        original_lf0 = WorldFeatLabelGen.trim_end_sample(
            original_lf0, int(len_diff / 2.0))
        original_lf0 = WorldFeatLabelGen.trim_end_sample(
            original_lf0, int(len_diff / 2.0) + 1, reverse=True)

        # Get a data plotter.
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter = DataPlotter()
        # plotter.set_title(id_name + " - " + net_name)

        grid_idx = 0
        graphs_lf0 = list()
        graphs_lf0.append((original_lf0, "Original"))
        graphs_lf0.append((phrase_curve, "Predicted"))
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_lf0)
        plotter.set_area_list(grid_idx=grid_idx,
                              area_list=[(np.invert(original_vuv), '0.8',
                                          1.0, 'Reference unvoiced')])
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms)
                          + ' ms]',
                          ylabel='LF0')
        # amp_lim = max(np.max(np.abs(wcad_lf0)), np.max(np.abs(output_lf0))) * 1.1
        # plotter.set_lim(grid_idx=grid_idx, ymin=-amp_lim, ymax=amp_lim)
        plotter.set_lim(grid_idx=grid_idx, ymin=4.2, ymax=5.4)
        # plotter.set_linestyles(grid_idx=grid_idx, linestyles=[':', '--', '-'])

        # plotter.set_lim(xmin=300, xmax=1100)
        plotter.gen_plot()
        plotter.save_to_file(filename + ".PHRASE" + hparams.gen_figure_ext)
def synthesize(self, data, hparams, id_list):
    """ Depending on hparams override the network output with the extracted features, then continue with normal synthesis pipeline.

    :param data:     Dict id_name -> dict of named feature streams; each
                     entry is replaced in place by the concatenation of the
                     selected streams.
    :param hparams:  Hyper-parameter container; uses synth_feature_names,
                     synth_load_org_* flags, world_dir, num_coded_sps,
                     num_bap.
    :param id_list:  Utterance ids forwarded to the waveform generator.
    """
    # Determine which feature streams to concatenate for synthesis.
    if hparams.has_value("synth_feature_names"):
        feature_names = hparams.synth_feature_names
        if type(feature_names) not in [list, tuple]:
            feature_names = (feature_names, )
    else:
        # Fall back to every stream present in the first sample. (A previous
        # hard-coded ("pred_acoustic_features",) default here was dead code:
        # it was immediately overwritten by this lookup.)
        feature_names = list(next(iter(data.values())).keys())
        self.logger.warning(
            "hparams.synth_feature_names is not defined, using {} instead."
            .format(feature_names))
    self.logger.info("Synthesise from {}".format(", ".join(feature_names)))

    # Replace each per-stream dict by the concatenated feature matrix.
    for id_name, features in data.items():
        selected_features = [features[name] for name in feature_names]
        data[id_name] = np.concatenate(selected_features, axis=1)

    load_any_org_features = hparams.synth_load_org_sp\
        or hparams.synth_load_org_lf0\
        or hparams.synth_load_org_vuv\
        or hparams.synth_load_org_bap

    if load_any_org_features:
        for id_name, labels in data.items():
            assert hparams.has_value("world_dir"), \
                "hparams.world_dir must be set for this operation."
            world_dir = hparams.world_dir
            org_labels = WorldFeatLabelGen.load_sample(
                id_name,
                world_dir,
                num_coded_sps=hparams.num_coded_sps,
                num_bap=hparams.num_bap)

            # If the extracted features are longer than the prediction, trim
            # half of the surplus from the front and the rest from the end.
            len_org = len(org_labels)
            len_diff = len_org - len(labels)
            if len_diff > 0:
                org_labels = WorldFeatLabelGen.trim_end_sample(
                    org_labels, int(len_diff / 2), reverse=True)
                org_labels = WorldFeatLabelGen.trim_end_sample(
                    org_labels, len_diff - int(len_diff / 2))

            # Copy the requested original streams over the network output.
            # Layout (from the indexing below): leading num_coded_sps columns
            # are the spectrum; the trailing columns are lf0, vuv, bap(s).
            if hparams.synth_load_org_sp:
                data[id_name][:len_org, :hparams.num_coded_sps] = \
                    org_labels[:, :hparams.num_coded_sps]
            if hparams.synth_load_org_lf0:
                data[id_name][:len_org, -3] = org_labels[:, -3]
            if hparams.synth_load_org_vuv:
                data[id_name][:len_org, -2] = org_labels[:, -2]
            if hparams.synth_load_org_bap:
                if hparams.num_bap == 1:
                    data[id_name][:len_org, -1] = org_labels[:, -1]
                else:
                    data[id_name][:len_org, -hparams.num_bap:] = \
                        org_labels[:, -hparams.num_bap:]

    super().gen_waveform(id_list=id_list, data=data, hparams=hparams)