def synthesize(self, id_list, synth_output, hparams): """This method should be overwritten by sub classes.""" # Create lf0 from atoms of output and get other acoustic features either by loading the original labels or by # generating them with the model at hparams.synth_acoustic_model_path. full_output = self.run_atom_synth(id_list, synth_output, hparams) # Run the WORLD synthesizer. Synthesiser.run_world_synth(full_output, hparams)
def synthesize(self, file_id_list, synth_output, hparams): # Create speaker subdirectories if necessary. for id_name in file_id_list: path_split = os.path.split(id_name) if len(path_split) > 2: makedirs_safe(os.path.join(hparams.synth_dir, *path_split[:-1])) if hparams.synth_vocoder == "WORLD": Synthesiser.run_world_synth(synth_output, hparams) # elif hparams.synth_vocoder == "STRAIGHT": # Add further vocoders here. elif hparams.synth_vocoder == "r9y9wavenet_mulaw_16k_world_feats_English": Synthesiser.run_r9y9wavenet_mulaw_world_feats_synth( synth_output, hparams) elif hparams.synth_vocoder == "raw": # The features in the synth_output dictionary are raw waveforms and can be written directly to the file. Synthesiser.run_raw_synth(synth_output, hparams) elif hparams.synth_vocoder == "80_SSRN_English_GL": # Use a pre-trained spectrogram super resolution network for English and Griffin-Lim. # The features in the synth_output should be mfbanks. raise NotImplementedError() # TODO elif hparams.synth_vocoder == "r9y9wavenet": # Synthesise with a pre-trained r9y9 WaveNet. The hyper-parameters have to match the model. Synthesiser.run_wavenet_vocoder(synth_output, hparams)
def synthesize(self, id_list, synth_output, hparams): # Reconstruct lf0 from generated atoms and write it to synth output. # recon_dict = self.get_recon_from_synth_output(synth_output) full_output = dict() for id_name, labels in synth_output.items(): # Take lf0 and vuv from network output. lf0 = labels[:, 0] vuv = labels[:, 1] phrase_curve = self.OutputGen.get_phrase_curve(id_name) lf0 = lf0 + phrase_curve[:len(lf0)].squeeze() vuv[vuv < 0.5] = 0.0 vuv[vuv >= 0.5] = 1.0 # Get mgc, vuv and bap data either through a trained acoustic model or from data extracted from the audio. if hparams.synth_acoustic_model_path is None: world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None \ else os.path.join(self.OutputGen.dir_labels, self.dir_extracted_acoustic_features) full_sample: np.ndarray = WorldFeatLabelGen.load_sample( id_name, world_dir, add_deltas=False, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap) # Load extracted data. len_diff = len(full_sample) - len(lf0) trim_front = len_diff // 2 trim_end = len_diff - trim_front full_sample = WorldFeatLabelGen.trim_end_sample( full_sample, trim_end) full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, trim_front, reverse=True) else: raise NotImplementedError() # Overwrite lf0 and vuv by network output. full_sample[:, hparams.num_coded_sps] = lf0 full_sample[:, hparams.num_coded_sps + 1] = vuv # Fill a dictionary with the samples. full_output[id_name + "_E2E"] = full_sample # Run the merlin synthesizer Synthesiser.run_world_synth(full_output, hparams)
def synth_ref_wcad(self, file_id_list, hparams): synth_output = dict() # Load extracted atoms. for id_name in file_id_list: synth_output[id_name] = AtomLabelGen.load_sample( id_name, self.OutputGen.dir_labels, len(hparams.thetas)) full_output = self.run_atom_synth(file_id_list, synth_output, hparams) # Add identifier to suffix. old_synth_file_suffix = hparams.synth_file_suffix hparams.synth_file_suffix += "_wcad_ref" # Run the WORLD synthesizer. Synthesiser.run_world_synth(full_output, hparams) # Restore identifier. hparams.synth_file_suffix = old_synth_file_suffix
def main(): from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams() hparams.use_gpu = False hparams.voice = "English" hparams.model_name = "WarpingLayerTest.nn" hparams.add_deltas = True hparams.num_coded_sps = 30 # hparams.num_questions = 505 hparams.num_questions = 425 hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/" hparams.data_dir = os.path.realpath("database") hparams.model_name = "warping_layer_test" hparams.synth_dir = hparams.out_dir batch_size = 2 dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD") from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen gen_in = WorldFeatLabelGen(dir_world_labels, add_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps) gen_in.get_normalisation_params(gen_in.dir_labels) from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer trainer = AcousticModelTrainer( "experiments/" + hparams.voice + "/WORLD", "experiments/" + hparams.voice + "/questions", "ignored", hparams.num_questions, hparams) sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps * (3 if hparams.add_deltas else 1)] sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps * (3 if hparams.add_deltas else 1)] wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ), hparams) wl.set_norm_params(sp_mean, sp_std_dev) # id_list = ["dorian/doriangray_16_00199"] id_list = ["p225/p225_051"] hparams.num_speakers = 1 t_benchmark = 0 for id_name in id_list: for idx, alpha in enumerate(np.arange(-0.15, 0.2, 0.05)): out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha) makedirs_safe(out_dir) sample = WorldFeatLabelGen.load_sample( id_name, os.path.join("experiments", hparams.voice, "WORLD"), add_deltas=True, num_coded_sps=hparams.num_coded_sps) sample_pre = gen_in.preprocess_sample(sample) coded_sps = sample_pre[:, :hparams.num_coded_sps * (3 if hparams.add_deltas else 1)] alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat( batch_size, 1) # Copy data in batch dimension. alpha_vec = alpha_vec[:, None, None].repeat( batch_size, 1) # Copy data in batch dimension. t_start = timer() mfcc_warped, (_, nn_alpha) = wl(torch.from_numpy(coded_sps), None, (len(coded_sps), ), (len(coded_sps), ), alphas=torch.from_numpy(alpha_vec)) mfcc_warped.sum().backward() t_benchmark += timer() - t_start assert ((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all() ) # Compare results for cloned coded_sps within batch. if alpha == 0: assert ((mfcc_warped == coded_sps).all() ) # Compare results for no warping. sample_pre[:len(mfcc_warped), :hparams.num_coded_sps * ( 3 if hparams.add_deltas else 1)] = mfcc_warped[:, 0].detach() sample_post = gen_in.postprocess_sample(sample_pre) # Manually create samples without normalisation but with deltas. sample_pre = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32) if np.isnan(sample_pre).any(): raise ValueError( "Detected nan values in output features for {}.".format( id_name)) # Save warped features. makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name))) sample_pre.tofile( os.path.join(out_dir, id_name + WorldFeatLabelGen.ext_deltas)) hparams.synth_dir = out_dir Synthesiser.run_world_synth({id_name: sample_post}, hparams) print("Process time for {} runs: {}".format( len(id_list) * idx, timedelta(seconds=t_benchmark)))
def main(): from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams() hparams.use_gpu = False hparams.voice = "English" hparams.model_name = "AllPassWarpModelTest.nn" hparams.add_deltas = True hparams.num_coded_sps = 30 # hparams.num_questions = 505 hparams.num_questions = 425 hparams.out_dir = os.path.join("experiments", hparams.voice, "VTLNArtificiallyWarped") hparams.data_dir = os.path.realpath("database") hparams.model_name = "all_pass_warp_test" hparams.synth_dir = hparams.out_dir batch_size = 2 dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD") # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps) hparams.alpha_ranges = [ 0.2, ] from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen gen_in = WorldFeatLabelGen(dir_world_labels, add_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap) gen_in.get_normalisation_params(gen_in.dir_labels) from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer trainer = AcousticModelTrainer( "experiments/" + hparams.voice + "/WORLD", "experiments/" + hparams.voice + "/questions", "ignored", hparams.num_questions, hparams) sp_mean = gen_in.norm_params[0][:hparams.num_coded_sps * (3 if hparams.add_deltas else 1)] sp_std_dev = gen_in.norm_params[1][:hparams.num_coded_sps * (3 if hparams.add_deltas else 1)] all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps, ), (hparams.num_coded_sps, ), hparams) all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev) # id_list = ["dorian/doriangray_16_00199"] # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"] id_list = ["p225/p225_051"] t_benchmark = 0 for id_name in id_list: sample = WorldFeatLabelGen.load_sample( id_name, os.path.join("experiments", hparams.voice, "WORLD"), add_deltas=True, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap, sp_type=hparams.sp_type) sample_pre = gen_in.preprocess_sample(sample) coded_sps = sample_pre[:, :hparams.num_coded_sps * (3 if hparams.add_deltas else 1)].copy() coded_sps = coded_sps[:, None, ...].repeat(batch_size, 1) # Copy data in batch dimension. for idx, alpha in enumerate(np.arange(-0.2, 0.21, 0.05)): out_dir = os.path.join(hparams.out_dir, "alpha_{0:0.2f}".format(alpha)) makedirs_safe(out_dir) alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha alpha_vec = alpha_vec[:, None].repeat( batch_size, 1) # Copy data in batch dimension. t_start = timer() sp_warped, (_, nn_alpha) = all_pass_warp_model( torch.from_numpy(coded_sps.copy()), None, (len(coded_sps), ), (len(coded_sps), ), alphas=torch.tensor(alpha_vec, requires_grad=True)) sp_warped.sum().backward() t_benchmark += timer() - t_start # assert((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all()) # Compare results for cloned coded_sps within batch. if np.isclose(alpha, 0): assert np.isclose( sp_warped.detach().cpu().numpy(), coded_sps).all() # Compare no warping results. sample_pre[:len(sp_warped), :hparams.num_coded_sps * ( 3 if hparams.add_deltas else 1)] = sp_warped[:, 0].detach() sample_post = gen_in.postprocess_sample(sample_pre, apply_mlpg=False) # Manually create samples without normalisation but with deltas. sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1] + gen_in.norm_params[0]).astype(np.float32) if np.isnan(sample_pre_with_deltas).any(): raise ValueError( "Detected nan values in output features for {}.".format( id_name)) # Save warped features. makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name))) sample_pre_with_deltas.tofile( os.path.join(out_dir, id_name + "." + WorldFeatLabelGen.ext_deltas)) hparams.synth_dir = out_dir # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap)) Synthesiser.run_world_synth({id_name: sample_post}, hparams) print("Process time for {} runs: {}, average: {}".format( len(id_list) * idx, timedelta(seconds=t_benchmark), timedelta(seconds=t_benchmark) / (len(id_list) * idx)))