def test_init_create(self):
    """Check that a trainer can be constructed and initialised.

    The output directory is redirected to a sub-directory named after
    this test and removed again once initialisation has succeeded.
    """
    hparams = self._get_hparams()
    # Add function name to path.
    hparams.out_dir = os.path.join(hparams.out_dir, "test_init_create")

    # Collect the constructor arguments first, then build the trainer.
    trainer_args = (
        self.dir_world_features,
        self.dir_question_labels,
        self.id_list,
        hparams.num_questions,
        hparams,
    )
    trainer = VTLNSpeakerAdaptionModelTrainer(*trainer_args)
    trainer.init(hparams)

    # Clean up everything the test wrote.
    shutil.rmtree(hparams.out_dir)
def test_save_load_equality(self):
    """Check that a checkpoint is unchanged by a load/save round trip.

    A first trainer is initialised and saved. A second trainer is then
    initialised with ``model_type`` set to ``None``, loads that
    checkpoint and saves it under a new name. Both checkpoint files must
    be equal according to ``equal_checkpoint``.
    """
    hparams = self._get_hparams()
    # Add function name to path.
    hparams.out_dir = os.path.join(hparams.out_dir,
                                   "test_save_load_equality")
    model_path = os.path.join(hparams.out_dir, "nn", "test_model.nn")
    total_epochs = 10

    def _new_initialised_trainer():
        # Reads hparams at call time, so later changes are picked up.
        new_trainer = VTLNSpeakerAdaptionModelTrainer(
            self.dir_world_features,
            self.dir_question_labels,
            self.id_list,
            hparams.num_questions,
            hparams)
        new_trainer.init(hparams)
        return new_trainer

    # Create a new model and save it.
    trainer = _new_initialised_trainer()
    trainer.model_handler.save_checkpoint(model_path, total_epochs)

    # Create a new model and test load.
    hparams.model_type = None
    trainer = _new_initialised_trainer()
    trainer.model_handler.load_checkpoint(model_path, hparams)
    model_copy_path = os.path.join(hparams.out_dir, "test_model_copy.nn")
    trainer.model_handler.save_checkpoint(model_copy_path, total_epochs)

    # A byte-wise filecmp.cmp(model_path, model_copy_path, False) does
    # not work here, so the checkpoints are compared by content.
    checkpoints_match = equal_checkpoint(model_path, model_copy_path)
    self.assertTrue(checkpoints_match,
                    "Loaded and saved models are not the same.")

    shutil.rmtree(hparams.out_dir)
def test_init_load_prenet(self):
    """Check that a trainer initialises when a pre-net model path is set.

    ``hparams.pre_net_model_path`` points to the fixture model before the
    trainer is constructed and initialised. The test output directory is
    removed afterwards.
    """
    hparams = self._get_hparams()
    # Add function name to path.
    hparams.out_dir = os.path.join(hparams.out_dir, "test_init_load_prenet")

    pre_net_path = os.path.join("integration", "fixtures",
                                "test_model_in409_out67.nn")
    hparams.pre_net_model_path = pre_net_path

    trainer = VTLNSpeakerAdaptionModelTrainer(
        self.dir_world_features,
        self.dir_question_labels,
        self.id_list,
        hparams.num_questions,
        hparams,
    )
    trainer.init(hparams)

    shutil.rmtree(hparams.out_dir)
def test_benchmark(self):
    """Check the benchmark scores of a freshly initialised model.

    With ``hparams.seed = 1`` the scores returned by
    ``trainer.benchmark`` must match the stored reference values to
    three decimals.
    """
    hparams = self._get_hparams()
    # Add function name to path.
    hparams.out_dir = os.path.join(hparams.out_dir, "test_benchmark")
    hparams.seed = 1

    trainer = VTLNSpeakerAdaptionModelTrainer(self.dir_world_features,
                                              self.dir_question_labels,
                                              self.id_list,
                                              hparams.num_questions,
                                              hparams)
    trainer.init(hparams)

    scores = trainer.benchmark(hparams)
    expected_scores = (9.401, 78.124, 0.609, 38.964)

    # assert_almost_equal expects (actual, desired, ...). Passing the
    # computed scores first makes a failure report label them correctly.
    numpy.testing.assert_almost_equal(scores, expected_scores, 3,
                                      "Wrong benchmark score.")

    shutil.rmtree(hparams.out_dir)
def test_gen_figure(self):
    """Check that ``gen_figure`` writes one figure file per requested id.

    Figures are generated for the first ``num_test_files`` ids. The
    test then counts the regular files in the output directory whose
    name ends with ``<model_name>.VTLN<gen_figure_ext>``.
    """
    num_test_files = 2
    hparams = self._get_hparams()
    # Add function name to path
    hparams.out_dir = os.path.join(hparams.out_dir, "test_gen_figure")
    hparams.pre_net_model_path = os.path.join(
        "integration", "fixtures", "test_model_in409_out67.nn")

    trainer = VTLNSpeakerAdaptionModelTrainer(
        self.dir_world_features,
        self.dir_question_labels,
        self.id_list,
        hparams.num_questions,
        hparams,
    )
    trainer.init(hparams)

    # Silence matplotlib UserWarnings while the figures are generated.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore",
                                category=UserWarning,
                                module="matplotlib")
        trainer.gen_figure(hparams, self.id_list[:num_test_files])

    # Check number of created files.
    found_files = []
    for name in os.listdir(hparams.out_dir):
        is_file = os.path.isfile(os.path.join(hparams.out_dir, name))
        if is_file and name.endswith(
                hparams.model_name + ".VTLN" + hparams.gen_figure_ext):
            found_files.append(name)

    expected_count = len(self.id_list[:num_test_files])
    failure_msg = "Number of {} files in out_dir directory does not match.".format(
        hparams.gen_figure_ext)
    self.assertEqual(expected_count, len(found_files), msg=failure_msg)

    shutil.rmtree(hparams.out_dir)
def test_train(self):
    """Check that the training loss decreases over the configured epochs.

    The last entry of the training loss history is compared with the
    entry at index 1 when ``hparams.start_with_test`` is set, otherwise
    with the entry at index 0.
    """
    hparams = self._get_hparams()
    # Add function name to path.
    hparams.out_dir = os.path.join(hparams.out_dir, "test_train")
    hparams.seed = 1234
    hparams.use_best_as_final_model = False

    trainer = VTLNSpeakerAdaptionModelTrainer(
        self.dir_world_features,
        self.dir_question_labels,
        self.id_list,
        hparams.num_questions,
        hparams,
    )
    trainer.init(hparams)

    train_result = trainer.train(hparams)
    _, all_loss_train, _ = train_result

    # Training loss decreases?
    if hparams.start_with_test:
        baseline_index = 1
    else:
        baseline_index = 0
    self.assertLess(
        all_loss_train[-1],
        all_loss_train[baseline_index],
        msg="Loss did not decrease over {} epochs".format(hparams.epochs))

    shutil.rmtree(hparams.out_dir)
def _get_hparams(self):
    """Create the hyper-parameters shared by the tests of this class.

    Starts from ``VTLNSpeakerAdaptionModelTrainer.create_hparams()`` and
    overrides the general, training and pre-net settings below. The
    output directory is placed next to this file and named after the
    test class.

    Returns:
        The configured hyper-parameter container.
    """
    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()

    # General parameters
    hparams.num_questions = 409
    hparams.voice = "full"
    hparams.data_dir = os.path.realpath(
        os.path.join("integration", "fixtures", "database"))
    hparams.out_dir = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), type(self).__name__)
    hparams.frame_size_ms = 5
    hparams.num_coded_sps = 20
    hparams.seed = 1

    # Training parameters.
    hparams.epochs = 3
    hparams.use_gpu = False
    hparams.model_type = "VTLN"
    # model_name used to be set to "VTLN.nn" here and then overwritten
    # further down; only the effective value is assigned now.
    hparams.model_name = "test_model.nn"
    hparams.batch_size_train = 2
    hparams.batch_size_val = 50
    hparams.use_saved_learning_rate = True
    hparams.optimiser_args["lr"] = 0.001
    hparams.epochs_per_checkpoint = 2

    # hparams.pass_embs_to_pre_net = False
    hparams.num_speakers = 2
    # One-element tuple holding a function that returns an all-zero
    # (length, num_speakers) array for every id.
    hparams.f_get_emb_index = (lambda id_name, length: numpy.zeros(
        (length, hparams.num_speakers)), )

    hparams.pre_net_model_type = "RNNDYN-1_RELU_32-1_FC_67"
    hparams.pre_net_model_name = "pre-net.nn"

    return hparams
def main():
    """Warp one utterance with a range of constant alphas and synthesise it.

    For every id in the hard-coded id list and every alpha in
    ``np.arange(-0.15, 0.2, 0.05)`` the normalised spectral features are
    passed through a ``WarpingLayer`` with that alpha for all frames.
    The warped features are written to ``<out_dir>alpha_<alpha>/`` and
    synthesised with ``Synthesiser.run_world_synth``. The accumulated
    forward + backward time is printed at the end.

    Raises:
        ValueError: If the de-normalised warped features contain NaN.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer

    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = "experiments/" + hparams.voice + "/VTLNArtificiallyWarped/"
    hparams.data_dir = os.path.realpath("database")
    # An earlier, immediately overwritten assignment of
    # "WarpingLayerTest.nn" was removed as a dead store.
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    # NOTE(review): the trainer is not referenced below; it is still
    # constructed in case its constructor has required side effects.
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions",
        "ignored",
        hparams.num_questions,
        hparams)

    # Number of leading feature columns that hold the spectral features
    # (three times as many when deltas are included).
    num_sp_features = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_features]
    sp_std_dev = gen_in.norm_params[1][:num_sp_features]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    id_list = ["p225/p225_051"]
    hparams.num_speakers = 1

    t_benchmark = 0
    # Count the runs explicitly; the last enumerate index is one less
    # than the number of alphas and would under-report the runs.
    num_runs = 0
    for id_name in id_list:
        for alpha in np.arange(-0.15, 0.2, 0.05):
            out_dir = hparams.out_dir + "alpha_{0:0.2f}/".format(alpha)
            makedirs_safe(out_dir)

            sample = WorldFeatLabelGen.load_sample(
                id_name,
                os.path.join("experiments", hparams.voice, "WORLD"),
                add_deltas=True,
                num_coded_sps=hparams.num_coded_sps)
            sample_pre = gen_in.preprocess_sample(sample)
            coded_sps = sample_pre[:, :num_sp_features]

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha

            # Copy data in batch dimension.
            coded_sps = coded_sps[:len(alpha_vec), None, ...].repeat(
                batch_size, 1)
            alpha_vec = alpha_vec[:, None, None].repeat(batch_size, 1)

            # Time the forward and backward pass only.
            t_start = timer()
            mfcc_warped, _ = wl(torch.from_numpy(coded_sps),
                                None,
                                (len(coded_sps), ),
                                (len(coded_sps), ),
                                alphas=torch.from_numpy(alpha_vec))
            mfcc_warped.sum().backward()
            t_benchmark += timer() - t_start
            num_runs += 1

            # Compare results for cloned coded_sps within batch.
            assert (mfcc_warped[:, 0] == mfcc_warped[:, 1]).all()
            # Compare results for no warping. The alpha values come from
            # floating point arithmetic in np.arange, so an exact
            # ``alpha == 0`` test is not reliable; compare with a
            # tolerance instead.
            if np.isclose(alpha, 0):
                assert np.isclose(mfcc_warped.detach().cpu().numpy(),
                                  coded_sps).all()

            # Write the warped spectral features back into the sample.
            sample_pre[:len(mfcc_warped), :num_sp_features] = \
                mfcc_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre)
            # Manually create samples without normalisation but with deltas.
            sample_pre = (sample_pre * gen_in.norm_params[1]
                          + gen_in.norm_params[0]).astype(np.float32)

            if np.isnan(sample_pre).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))

            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre.tofile(
                os.path.join(out_dir, id_name + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}".format(
        num_runs, timedelta(seconds=t_benchmark)))
def main():
    """Create samples with artificial alpha for each phoneme.

    The voice name is read from ``sys.argv[1]``. A random alpha within
    ``[-alpha_range, alpha_range)`` is drawn per phoneme (seeded with
    42). Every utterance of the selected speaker in the adaptation id
    list is warped frame by frame with the alpha of its current phoneme.
    The warped features are written to
    ``<out_dir>/cmp_mgc<num_coded_sps>/``, the first few utterances are
    synthesised, and the normalisation files are copied next to the
    warped features.

    Raises:
        ValueError: If no id of the selected speaker is found, or if the
            de-normalised warped features contain NaN.
        NotImplementedError: If the question set size is unknown.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer

    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = sys.argv[1]
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    alpha_range = 0.2
    num_phonemes = 70
    num_random_alphas = 7
    # num_random_alphas = 53

    # Randomly pick alphas for each phoneme.
    np.random.seed(42)
    # phonemes_to_alpha_tensor = ((np.random.choice(np.random.rand(num_random_alphas), num_phonemes) - 0.5) * 2 * alpha_range)
    phonemes_to_alpha_tensor = ((np.random.rand(num_phonemes) - 0.5)
                                * 2 * alpha_range)

    # hparams.num_questions = 505
    hparams.num_questions = 609
    # hparams.num_questions = 425

    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "WORLD_artificially_warped")
    hparams.data_dir = os.path.realpath("database")
    # An earlier, immediately overwritten assignment of
    # "WarpingLayerTest.nn" was removed as a dead store.
    hparams.model_name = "warping_layer_test"
    hparams.synth_dir = hparams.out_dir
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")

    print(
        "Create artificially warped MGCs for {} in {} for {} questions, {} random alphas, and an alpha range of {}."
        .format(hparams.voice, hparams.out_dir, hparams.num_questions,
                len(np.unique(phonemes_to_alpha_tensor)), alpha_range))

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    trainer = AcousticModelTrainer(
        os.path.join("experiments", hparams.voice, "WORLD"),
        os.path.join("experiments", hparams.voice, "questions"),
        "ignored",
        hparams.num_questions,
        hparams)

    hparams.num_speakers = 1
    speaker = "p276"
    num_synth_files = 5  # Number of files to synthesise to check warping manually.

    # Number of leading feature columns that hold the spectral features
    # (three times as many when deltas are included).
    num_sp_features = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_features]
    sp_std_dev = gen_in.norm_params[1][:num_sp_features]
    wl = WarpingLayer((hparams.num_coded_sps, ), (hparams.num_coded_sps, ),
                      hparams)
    wl.set_norm_params(sp_mean, sp_std_dev)

    def _question_to_phoneme_index(questions):
        """Helper function to convert questions to their current phoneme index."""
        # The builtin ``int`` replaces the ``np.int`` alias, which no
        # longer exists in current NumPy releases.
        if questions.shape[-1] == 505:  # German question set.
            indices = np.arange(86, 347, 5, dtype=int)
        elif questions.shape[-1] == 425:  # English radio question set.
            indices = np.arange(58, 107, dtype=int)
        elif questions.shape[-1] == 609:  # English unilex question set.
            indices = np.arange(92, 162, dtype=int)
        else:
            raise NotImplementedError(
                "Unknown question set with {} questions.".format(
                    questions.shape[-1]))
        return QuestionLabelGen.questions_to_phoneme_indices(
            questions, indices)

    # with open(os.path.join(hparams.data_dir, "file_id_list_{}_train.txt".format(hparams.voice))) as f:
    id_list_path = os.path.join(
        hparams.data_dir, "file_id_list_{}_adapt.txt".format(hparams.voice))
    with open(id_list_path) as f:
        # Keep only the ids of the selected speaker, without line endings.
        id_list = [line.strip(' \t\n\r') for line in f if speaker in line]

    if not id_list:
        # Without this check the script would only fail at the very end
        # with a ZeroDivisionError when averaging the MCD.
        raise ValueError("No ids of speaker {} found in {}.".format(
            speaker, id_list_path))

    out_dir = hparams.out_dir
    makedirs_safe(out_dir)
    makedirs_safe(os.path.join(out_dir,
                               "cmp_mgc" + str(hparams.num_coded_sps)))

    t_benchmark = 0
    org_to_warped_mcd = 0.0
    for idx, id_name in enumerate(id_list):
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps)
        sample_pre = gen_in.preprocess_sample(sample)
        coded_sps = sample_pre[:, :num_sp_features]

        questions = QuestionLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "questions"),
            num_questions=hparams.num_questions)
        questions = questions[:len(coded_sps)]
        phoneme_indices = _question_to_phoneme_index(questions)
        # Look up the alpha of each frame's phoneme; the modulo keeps
        # the index inside the alpha table.
        alpha_vec = phonemes_to_alpha_tensor[
            phoneme_indices % len(phonemes_to_alpha_tensor), None]

        coded_sps = coded_sps[:len(alpha_vec), None, ...]  # Create a batch dimension.
        alpha_vec = alpha_vec[:, None, None]  # Create a batch and feature dimension.

        # Time the forward pass only.
        t_start = timer()
        mfcc_warped, _ = wl(torch.from_numpy(coded_sps),
                            None,
                            (len(coded_sps), ),
                            (len(coded_sps), ),
                            alphas=torch.from_numpy(alpha_vec))
        t_benchmark += timer() - t_start

        # Write the warped spectral features back into the sample.
        sample_pre[:len(mfcc_warped), :num_sp_features] = \
            mfcc_warped[:, 0].detach()

        sample_post = gen_in.postprocess_sample(sample_pre)
        # Manually create samples without normalisation but with deltas.
        sample_pre = (sample_pre * gen_in.norm_params[1]
                      + gen_in.norm_params[0]).astype(np.float32)

        if np.isnan(sample_pre).any():
            raise ValueError(
                "Detected nan values in output features for {}.".format(
                    id_name))

        # Compute error between warped version and original one.
        org_to_warped_mcd += metrics.melcd(
            sample[:, 0:hparams.num_coded_sps],
            sample_pre[:, 0:hparams.num_coded_sps])

        # Save warped features.
        sample_pre.tofile(
            os.path.join(
                out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                os.path.basename(id_name + WorldFeatLabelGen.ext_deltas)))

        hparams.synth_dir = out_dir
        if idx < num_synth_files:  # Only synthesize a few of samples.
            trainer.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} warpings: {}. MCD caused by warping: {:.2f}".
          format(len(id_list), timedelta(seconds=t_benchmark),
                 org_to_warped_mcd / len(id_list)))

    # Copy normalisation files which are necessary for training.
    for feature in ["_bap", "_lf0", "_mgc{}".format(hparams.num_coded_sps)]:
        norm_file_name = (MeanCovarianceExtractor.file_name_appendix
                          + feature + ".bin")
        shutil.copyfile(
            os.path.join(gen_in.dir_labels, gen_in.dir_deltas,
                         norm_file_name),
            os.path.join(out_dir, "cmp_mgc" + str(hparams.num_coded_sps),
                         norm_file_name))
def test_compare_to_recursive_matrix(self):
    """
    Compare the element-wise computed gradient matrix with the recursively
    generated matrix for alphas in
    range(-alpha_range, alpha_range, precision).

    For every alpha both the matrix values and the gradient with respect
    to alpha are compared against the recursive reference.
    """
    precision = 0.05  # Precision used for steps in that range.
    delta = 0.05  # Allowed delta of error.

    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    # Add function name to path.
    hparams.out_dir = os.path.join(self.out_dir,
                                   "test_compare_to_recursive_matrix")
    hparams.num_speakers = 1
    wl = WarpingLayer(10, 4, hparams)
    alpha_range = wl.alpha_range  # Range of alpha to test in.

    # Precision must fit in range. A unittest assertion is used so the
    # check is not stripped when Python runs with -O.
    self.assertLess(precision, 2 * alpha_range,
                    msg="Precision must fit in range.")

    for alpha_value in numpy.arange(-alpha_range, alpha_range + precision,
                                    precision):
        # Alpha value which receives the final gradient.
        alpha = torch.tensor(alpha_value,
                             requires_grad=True).type(wl.computation_dtype)
        alpha_eps = alpha.repeat([100, 1])  # Test in batch mode.

        # Compute the warp matrix for each alpha.
        warp_matrix = wl.get_warp_matrix(alpha_eps)

        # Create the reference matrix recursively for the given alpha.
        ref_matrix = wl.gen_warp_matrix_recursively(alpha)

        # Compute the error. Every batch entry uses the same alpha, so
        # one entry (index 10) is compared with the reference.
        numpy.testing.assert_almost_equal(warp_matrix[10].detach().numpy(),
                                          ref_matrix.detach().numpy(), 3)
        dist = (warp_matrix[10] - ref_matrix).abs()
        # Relative error; the small constant avoids a division by zero.
        max_error = (dist / (ref_matrix.abs() + 1e-6)).max()
        # error = dist.sum()
        self.assertLess(
            max_error,
            delta,
            msg="Max error between w_matrix_3d and recursive reference is"
            " {:.5f}% for alpha={:.2f}.".format(max_error * 100,
                                                alpha_value))

        # Compute the gradient ratio error.
        ref_matrix.sum().backward()
        # Copy the reference gradient before it is reset. clone().detach()
        # is the recommended way to copy a tensor; torch.tensor(tensor)
        # emits a UserWarning.
        real_grad = alpha.grad.clone().detach()
        alpha.grad.zero_()
        warp_matrix.sum().backward()
        # The batch holds len(alpha_eps) copies of alpha whose gradients
        # are summed, so divide to get the per-copy gradient.
        approx_grad = alpha.grad / len(alpha_eps)
        dist_grad = (real_grad - approx_grad).abs()
        error_ratio = (dist_grad / real_grad.abs())

        self.assertLess(
            error_ratio,
            delta,
            msg="Gradient error between w_matrix_3d and recursive reference is "
            "{:.5f}% for alpha={:.2f}.".format(error_ratio * 100.,
                                               alpha_value))

    shutil.rmtree(hparams.out_dir, ignore_errors=True)
def main():
    """Warp one utterance with an ``AllPassWarpModel`` for a range of alphas.

    For every id in the hard-coded id list the normalised spectral
    features are loaded once and then warped with each alpha in
    ``np.arange(-0.2, 0.21, 0.05)``. The warped features are written to
    ``<out_dir>/alpha_<alpha>/`` and synthesised with
    ``Synthesiser.run_world_synth``. The total and average forward +
    backward time is printed at the end.

    Raises:
        ValueError: If the de-normalised warped features contain NaN.
    """
    from idiaptts.src.model_trainers.vtln.VTLNSpeakerAdaptionModelTrainer import VTLNSpeakerAdaptionModelTrainer

    hparams = VTLNSpeakerAdaptionModelTrainer.create_hparams()
    hparams.use_gpu = False
    hparams.voice = "English"
    hparams.add_deltas = True
    hparams.num_coded_sps = 30
    # hparams.num_questions = 505
    hparams.num_questions = 425
    hparams.out_dir = os.path.join("experiments", hparams.voice,
                                   "VTLNArtificiallyWarped")
    hparams.data_dir = os.path.realpath("database")
    # An earlier, immediately overwritten assignment of
    # "AllPassWarpModelTest.nn" was removed as a dead store.
    hparams.model_name = "all_pass_warp_test"
    hparams.synth_dir = hparams.out_dir
    batch_size = 2
    dir_world_labels = os.path.join("experiments", hparams.voice, "WORLD")
    # hparams.add_hparam("warp_matrix_size", hparams.num_coded_sps)
    hparams.alpha_ranges = [
        0.2,
    ]

    from idiaptts.src.data_preparation.world.WorldFeatLabelGen import WorldFeatLabelGen
    gen_in = WorldFeatLabelGen(dir_world_labels,
                               add_deltas=hparams.add_deltas,
                               num_coded_sps=hparams.num_coded_sps,
                               num_bap=hparams.num_bap)
    gen_in.get_normalisation_params(gen_in.dir_labels)

    from idiaptts.src.model_trainers.AcousticModelTrainer import AcousticModelTrainer
    # NOTE(review): the trainer is not referenced below; it is still
    # constructed in case its constructor has required side effects.
    trainer = AcousticModelTrainer(
        "experiments/" + hparams.voice + "/WORLD",
        "experiments/" + hparams.voice + "/questions",
        "ignored",
        hparams.num_questions,
        hparams)

    # Number of leading feature columns that hold the spectral features
    # (three times as many when deltas are included).
    num_sp_features = hparams.num_coded_sps * (3 if hparams.add_deltas else 1)
    sp_mean = gen_in.norm_params[0][:num_sp_features]
    sp_std_dev = gen_in.norm_params[1][:num_sp_features]
    all_pass_warp_model = AllPassWarpModel((hparams.num_coded_sps, ),
                                           (hparams.num_coded_sps, ),
                                           hparams)
    all_pass_warp_model.set_norm_params(sp_mean, sp_std_dev)

    # id_list = ["dorian/doriangray_16_00199"]
    # id_list = ["p225/p225_051", "p277/p277_012", "p278/p278_012", "p279/p279_012"]
    id_list = ["p225/p225_051"]

    t_benchmark = 0
    # Count the runs explicitly; the last enumerate index is one less
    # than the number of alphas and would skew the reported average.
    num_runs = 0
    for id_name in id_list:
        sample = WorldFeatLabelGen.load_sample(
            id_name,
            os.path.join("experiments", hparams.voice, "WORLD"),
            add_deltas=True,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap,
            sp_type=hparams.sp_type)
        sample_pre = gen_in.preprocess_sample(sample)
        # Copy so that later writes into sample_pre do not change the
        # model input of the following alphas.
        coded_sps = sample_pre[:, :num_sp_features].copy()
        coded_sps = coded_sps[:, None, ...].repeat(batch_size, 1)  # Copy data in batch dimension.

        for alpha in np.arange(-0.2, 0.21, 0.05):
            out_dir = os.path.join(hparams.out_dir,
                                   "alpha_{0:0.2f}".format(alpha))
            makedirs_safe(out_dir)

            alpha_vec = np.ones((coded_sps.shape[0], 1)) * alpha
            alpha_vec = alpha_vec[:, None].repeat(batch_size, 1)  # Copy data in batch dimension.

            # Time the forward and backward pass only.
            t_start = timer()
            sp_warped, _ = all_pass_warp_model(
                torch.from_numpy(coded_sps.copy()),
                None,
                (len(coded_sps), ),
                (len(coded_sps), ),
                alphas=torch.tensor(alpha_vec, requires_grad=True))
            sp_warped.sum().backward()
            t_benchmark += timer() - t_start
            num_runs += 1

            # assert((mfcc_warped[:, 0] == mfcc_warped[:, 1]).all())  # Compare results for cloned coded_sps within batch.
            if np.isclose(alpha, 0):
                # Compare no warping results.
                assert np.isclose(sp_warped.detach().cpu().numpy(),
                                  coded_sps).all()

            # Write the warped spectral features back into the sample.
            sample_pre[:len(sp_warped), :num_sp_features] = \
                sp_warped[:, 0].detach()

            sample_post = gen_in.postprocess_sample(sample_pre,
                                                    apply_mlpg=False)
            # Manually create samples without normalisation but with deltas.
            sample_pre_with_deltas = (sample_pre * gen_in.norm_params[1]
                                      + gen_in.norm_params[0]).astype(
                                          np.float32)

            if np.isnan(sample_pre_with_deltas).any():
                raise ValueError(
                    "Detected nan values in output features for {}.".format(
                        id_name))

            # Save warped features.
            makedirs_safe(os.path.dirname(os.path.join(out_dir, id_name)))
            sample_pre_with_deltas.tofile(
                os.path.join(out_dir,
                             id_name + "." + WorldFeatLabelGen.ext_deltas))

            hparams.synth_dir = out_dir
            # sample_no_deltas = WorldFeatLabelGen.convert_from_world_features(*WorldFeatLabelGen.convert_to_world_features(sample, contains_deltas=hparams.add_deltas, num_coded_sps=hparams.num_coded_sps, num_bap=hparams.num_bap))
            Synthesiser.run_world_synth({id_name: sample_post}, hparams)

    print("Process time for {} runs: {}, average: {}".format(
        num_runs,
        timedelta(seconds=t_benchmark),
        timedelta(seconds=t_benchmark) / num_runs))