def synth_phrase(self, file_id_list, hparams):
    """Synthesise audio whose LF0 component is replaced by the stored phrase curve.

    The extracted audio features of every id are loaded, their LF0 column
    (index -3) is overwritten by the phrase curve read from
    ``<OutputGen.dir_labels>/<id><OutputGen.ext_phrase>`` and the result is
    passed to ``ModelTrainer.synthesize``.

    :param file_id_list:  Ids of the utterances to synthesise.
    :param hparams:       Hyper-parameter container. Its ``synth_file_suffix``
                          is extended by ``'_phrase'`` during synthesis and
                          restored afterwards, even when synthesis fails.
    """
    self.logger.info("Synthesise phrase curve for [{0}].".format(", ".join(file_id_list)))

    # Create an empty dictionary which can be filled with extracted audio features.
    synth_output = dict.fromkeys(file_id_list)

    # Fill dictionary with extracted audio features.
    full_output = self.load_extracted_audio_features(synth_output, hparams)

    # Override the lf0 component by the phrase curve.
    for id_name in file_id_list:
        labels = full_output[id_name]
        phrase_file = os.path.join(self.OutputGen.dir_labels,
                                   id_name + self.OutputGen.ext_phrase)
        phrase_curve = np.fromfile(phrase_file, dtype=np.float32)
        # The phrase curve may be longer than the features, cut it to their length.
        labels[:, -3] = phrase_curve[:len(labels)]

    # Add identifier to suffix.
    old_synth_file_suffix = hparams.synth_file_suffix
    hparams.synth_file_suffix += '_phrase'
    try:
        # Run the vocoder.
        ModelTrainer.synthesize(self, file_id_list, full_output, hparams)
    finally:
        # Restore identifier, also when the vocoder raised, so that the shared
        # hparams object is not left in a modified state.
        hparams.synth_file_suffix = old_synth_file_suffix
def test_split_return_values_torch(self):
    """Passing a torch tensor to _split_return_values must raise a TypeError.

    Additionally checks the message of the last call to ModelTrainer.logger.error.
    """
    lengths = numpy.array([10, 5])
    longest = lengths.max()
    torch_output = torch.ones(longest, 2, 4)

    error_patch = unittest.mock.patch.object(ModelTrainer.logger, "error")
    with error_patch as error_mock:
        with self.assertRaises(TypeError):
            ModelTrainer._split_return_values(torch_output, lengths, None, False)
        error_mock.assert_called_with("No best model exists yet. Continue with the current one.")
def test_split_return_values(self):
    """Split a batch of (output, (hidden1, hidden2)) and check each sample.

    Every batch entry is filled with a value unique to its batch index so that
    the split results can be attributed to the entry they came from.
    """
    seq_length_output = numpy.array([10, 6, 8])
    batch_size = 3
    feature_dim = 50
    max_length = seq_length_output.max()

    # Batch index broadcast along the time and feature axes.
    batch_ids = numpy.arange(batch_size)[None, :, None]

    output = numpy.empty((max_length, batch_size, feature_dim))
    output[...] = batch_ids
    hidden1 = numpy.empty((max_length, batch_size, 2))
    hidden1[...] = batch_ids * 10
    hidden2 = numpy.empty((max_length, batch_size, 4))
    hidden2[...] = batch_ids * 100

    batch = (output, (hidden1, hidden2))
    split_batch = ModelTrainer._split_return_values(batch, seq_length_output, None, False)

    for idx in range(batch_size):
        sample = split_batch[idx]
        out, hidden = sample[0], sample[1]
        h1, h2 = hidden[0], hidden[1]

        self.assertTrue(
            (out == idx).all(),
            msg="Output of batch {} is wrong, expected was all values being {}.".format(idx, idx))
        self.assertTrue(
            (h1 == idx * 10).all(),
            msg="Hidden1 of batch {} is wrong, expected was all values being {}.".format(idx, idx * 10))
        self.assertTrue(
            (h2 == idx * 100).all(),
            msg="Hidden2 of batch {} is wrong, expected was all values being {}.".format(idx, idx * 100))
def test_input_to_str_list(self):
    """Check _input_to_str_list for a tuple, a file path, a single id and an invalid input."""
    convert = ModelTrainer._input_to_str_list

    # Tuple input but elements are not strings.
    from_tuple = convert((121, 122))
    self.assertEqual(["121", "122"], from_tuple)

    # Valid path to file id list.
    id_list_path = os.path.join("integration", "fixtures", "file_id_list.txt")
    from_file = convert(id_list_path)
    self.assertEqual(TestModelTrainer._get_id_list(), from_file)

    # Single input id.
    from_single_id = convert("121")
    self.assertEqual(["121"], from_single_id)

    # Wrong input.
    with self.assertRaises(ValueError):
        convert(numpy.array([1, 2]))
def test_embeddings_everywhere(self):
    """Check weight shapes of a model created with the "(-1)" embedding specification."""
    num_emb, emb_dim = 3, 12
    in_dim, out_dim = 42, 12

    hparams = ModelTrainer.create_hparams()
    hparams.add_hparam("f_get_emb_index", [lambda x: 0])
    hparams.model_type = "RNNDYN-{}x{}_EMB_(-1)-3_RELU_128-2_BiLSTM_32-1_FC_12".format(
        num_emb, emb_dim)

    model = ModelFactory.create(hparams.model_type, (in_dim, ), out_dim, hparams)

    # A single embedding group with the requested size.
    self.assertEqual(1, len(model.emb_groups))
    self.assertEqual(torch.Size([num_emb, emb_dim]),
                     model.emb_groups[0].weight.shape)

    # Expected input widths of the checked layers.
    self.assertEqual(torch.Size([128, in_dim - 1 + emb_dim]),
                     model[0].weight.shape)
    self.assertEqual(torch.Size([128, 128 + emb_dim]),
                     model[1].weight.shape)
    self.assertEqual(torch.Size([32 * 4, 128 + emb_dim]),
                     model[3].weight_ih_l0.shape)
    self.assertEqual(torch.Size([32 * 4, 32 * 2 + emb_dim]),
                     model[4].weight_ih_l0_reverse.shape)
def create_hparams(hparams_string=None, verbose=False):
    """Create the hyper-parameter container with the defaults added by this trainer.

    :param hparams_string:  Forwarded to ModelTrainer.create_hparams.
    :param verbose:         If True, log the debug string of the final container.
    :return:                The hyper-parameter container.
    """
    hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

    defaults = {
        "thetas": None,  # One initial theta value per filter.
        "k": 2,  # Order of the impulse response of the atoms.
        # Post-processing removes atoms with an absolute amplitude smaller than this.
        "min_atom_amp": 0.25,
        "complex_poles": True,  # Complex poles possible.
        "phase_init": 0.0,  # Initial phase of the filters.
        "vuv_loss_weight": 1.0,  # Weight of the VUV RMSE.
        "L1_loss_weight": 1.0,  # Weight of the L1 loss on the spiking inputs.
        "weight_unvoiced": 0.5,  # Weight on unvoiced frames.
        "num_questions": None,  # Dimension of the input questions.
        # Size of distribution around spikes when training the AtomModel.
        "dist_window_size": 51,
        # Initial bias of neural filter, should be estimated mean of speaker's LF0.
        "phrase_bias_init": 0.0,
        "atom_model_path": None,  # Path to load a pre-trained atom model from.
        "hparams_atom": None,  # Hyper-parameter container used in the AtomModelTrainer.
        # Path to load a pre-trained atom neural filter model from (without phrase curve).
        "flat_model_path": None,
        # Hyper-parameter container used in the AtomNeuralFilterModelTrainer.
        "hparams_flat": None,
    }
    hparams.add_hparams(**defaults)

    if verbose:
        logging.info(hparams.get_debug_string())

    return hparams
def test_save_load_equality(self):
    """Save a model, load it, save it again and compare both checkpoints.

    The output directory of this test is removed afterwards, also when one of
    the assertions or checkpoint operations fails.
    """
    hparams = ModelTrainer.create_hparams()
    hparams.out_dir = os.path.join(
        self.out_dir, "test_save_load_equality")  # Add function name to path.
    model_path = os.path.join(hparams.out_dir, "test_model.nn")

    try:
        # Create a new model and save it.
        dim_in, dim_out = 10, 4
        total_epochs = 10
        model_handler = ModelHandlerPyTorch()
        model_handler.model = torch.nn.Sequential(
            torch.nn.Linear(dim_in, dim_out))
        model_handler.save_checkpoint(model_path, total_epochs)

        # Create a new model handler and test load save.
        hparams.model_type = None
        model_handler = ModelHandlerPyTorch()
        saved_total_epochs = model_handler.load_checkpoint(model_path, hparams)
        self.assertEqual(total_epochs, saved_total_epochs,
                         msg="Saved and loaded total epochs do not match")

        model_copy_path = os.path.join(hparams.out_dir, "test_model_copy.nn")
        model_handler.save_checkpoint(model_copy_path, total_epochs)

        # A byte-wise file comparison (filecmp.cmp) does not work here, so the
        # checkpoints are compared with equal_checkpoint.
        self.assertTrue(equal_checkpoint(model_path, model_copy_path),
                        "Loaded and saved models are not the same.")
    finally:
        # Clean up on failure as well. ignore_errors prevents a missing
        # directory from masking the actual test failure.
        shutil.rmtree(hparams.out_dir, ignore_errors=True)
def _load_pre_net(self, hparams):
    """Load the model found at ModelTrainer.get_model_path(hparams) into self.pre_net.

    Only the first element returned by ModelHandlerPyTorch.load_model is kept.
    """
    # Imported locally, as in the original implementation.
    from idiaptts.src.neural_networks.pytorch.ModelHandlerPyTorch import ModelHandlerPyTorch
    from idiaptts.src.model_trainers.ModelTrainer import ModelTrainer

    pre_net_path = ModelTrainer.get_model_path(hparams)
    pre_net, *_remaining = ModelHandlerPyTorch.load_model(pre_net_path,
                                                          hparams,
                                                          verbose=True)
    self.pre_net = pre_net
def decollate_network_output(output, _, seq_lengths=None, permutation=None, batch_first=True):
    """
    Split output into LF0, V/UV and command signals. Return command signals as hidden state.

    The first two entries of the last axis are treated as LF0 and V/UV, all
    remaining entries as command signals. The second positional argument is
    ignored.

    :return:  Tuple of (split LF0 and V/UV, split command signals).
    """
    # Split pre-net output (command signals).
    command_signals = output[:, :, 2:]
    intern_amps, _no_hidden = ModelTrainer.split_batch(command_signals,
                                                       None,
                                                       seq_lengths,
                                                       permutation,
                                                       batch_first)

    # Split final LF0, V/UV.
    lf0_vuv = output[:, :, :2]
    split_lf0_vuv, _no_hidden = ModelTrainer.split_batch(lf0_vuv,
                                                         None,
                                                         seq_lengths,
                                                         permutation,
                                                         batch_first)

    return split_lf0_vuv, intern_amps
def _get_trainer(self, hparams):
    """Create a ModelTrainer on the integration fixtures.

    Question labels are used as input, WORLD features with deltas as output.
    The loss is an element-wise (non-reduced) MSE.

    :param hparams:  Hyper-parameter container, num_questions and num_coded_sps are read.
    :return:         The configured trainer.
    """
    dir_world_features = "integration/fixtures/WORLD"
    dir_question_labels = "integration/fixtures/questions"

    trainer = ModelTrainer(self.id_list, hparams)

    # Input generator with its normalisation parameters.
    input_gen = QuestionLabelGen(dir_question_labels, hparams.num_questions)
    trainer.InputGen = input_gen
    input_gen.get_normalisation_params(dir_question_labels)

    # Output generator with its normalisation parameters.
    output_gen = WorldFeatLabelGen(dir_world_features,
                                   num_coded_sps=hparams.num_coded_sps,
                                   add_deltas=True)
    trainer.OutputGen = output_gen
    output_gen.get_normalisation_params(dir_world_features)

    # Create datasets to work on.
    def build_dataset(ids):
        return LabelGensDataset(ids,
                                trainer.InputGen,
                                trainer.OutputGen,
                                hparams,
                                match_lengths=True)

    trainer.dataset_train = build_dataset(trainer.id_list_train)
    trainer.dataset_val = build_dataset(trainer.id_list_val)

    trainer.loss_function = torch.nn.MSELoss(reduction='none')

    return trainer
def synthesize(self, id_list, synth_output, hparams):
    """
    Synthesise LF0 from atoms. The run_atom_synth function either loads the original acoustic features or uses an
    acoustic model to predict them.

    For every sample returned by run_atom_synth, the V/UV flag is taken from
    synth_output[id][:, 0, 1] and written to column -2 of the length-matched
    labels, afterwards the vocoder is run through ModelTrainer.synthesize.

    :param id_list:       Ids forwarded to run_atom_synth and ModelTrainer.synthesize.
    :param synth_output:  Dictionary of network outputs by id, indexed as [:, 0, 1] to read V/UV.
    :param hparams:       Hyper-parameter container forwarded to the called functions.
    """
    full_output = self.run_atom_synth(id_list, synth_output, hparams)

    for id_name, labels in full_output.items():
        lf0 = labels[:, -3]
        # NOTE(review): the interpolated lf0 is never used afterwards. Unless
        # interpolate_lin works in-place these two lines have no effect - confirm.
        lf0, _ = interpolate_lin(lf0)
        vuv = synth_output[id_name][:, 0, 1]
        # Remove the length difference, one half at the front and the rest at the end.
        len_diff = len(labels) - len(vuv)
        labels = WorldFeatLabelGen.trim_end_sample(labels, int(len_diff / 2), reverse=True)
        labels = WorldFeatLabelGen.trim_end_sample(labels, len_diff - int(len_diff / 2))
        # NOTE(review): labels is only rebound locally, full_output[id_name] is not
        # reassigned. The V/UV written here reaches full_output only if
        # trim_end_sample returns a view on the original array - confirm.
        labels[:, -2] = vuv

    # Run the vocoder.
    ModelTrainer.synthesize(self, id_list, full_output, hparams)
def synthesize(self, id_list, synth_output, hparams):
    """
    Depending on hparams override the network output with the extracted features,
    then continue with normal synthesis pipeline.

    The flags synth_load_org_sp, synth_load_org_lf0, synth_load_org_vuv and
    synth_load_org_bap select which parts are replaced: the first
    OutputGen.num_coded_sps columns, column -3, column -2 and column -1,
    respectively. synth_output is modified in-place.
    """
    load_any_original = (hparams.synth_load_org_sp
                         or hparams.synth_load_org_lf0
                         or hparams.synth_load_org_vuv
                         or hparams.synth_load_org_bap)

    if load_any_original:
        for id_name in id_list:
            # Prefer an explicitly configured directory of WORLD features.
            world_dir = getattr(hparams, "world_dir", None)
            if world_dir is None:
                world_dir = os.path.join(self.OutputGen.dir_labels,
                                         self.dir_extracted_acoustic_features)

            original = WorldFeatLabelGen.load_sample(
                id_name, world_dir, num_coded_sps=hparams.num_coded_sps)

            # Original features longer than the prediction are trimmed, one
            # half of the surplus at the front and the rest at the end.
            surplus = len(original) - len(synth_output[id_name])
            if surplus > 0:
                front = int(surplus / 2)
                original = WorldFeatLabelGen.trim_end_sample(original, front, reverse=True)
                original = WorldFeatLabelGen.trim_end_sample(original, surplus - front)

            num_frames = len(original)
            predicted = synth_output[id_name]

            if hparams.synth_load_org_sp:
                num_sp = self.OutputGen.num_coded_sps
                predicted[:num_frames, :num_sp] = original[:, :num_sp]

            if hparams.synth_load_org_lf0:
                predicted[:num_frames, -3] = original[:, -3]

            if hparams.synth_load_org_vuv:
                predicted[:num_frames, -2] = original[:, -2]

            if hparams.synth_load_org_bap:
                predicted[:num_frames, -1] = original[:, -1]

    # Run the vocoder.
    ModelTrainer.synthesize(self, id_list, synth_output, hparams)
def synthesize(self, id_list, synth_output, hparams):
    """Merge predicted LF0 and V/UV into extracted WORLD features and run the vocoder.

    Column 0 of each network output is used as LF0, column 1 as V/UV. The V/UV
    values are binarised in-place with a threshold of 0.5. All other features
    are loaded from hparams.world_dir or, if that is not set, from
    hparams.out_dir/self.dir_extracted_acoustic_features. The merged samples are
    stored under the id extended by "_E2E_Phrase".

    :raises NotImplementedError:  If hparams.synth_acoustic_model_path is set.
    """
    full_output = {}

    for id_name, net_output in synth_output.items():
        # Take lf0 and vuv from network output.
        lf0 = net_output[:, 0]
        vuv = net_output[:, 1]
        vuv[vuv < 0.5] = 0.0
        vuv[vuv >= 0.5] = 1.0

        # Getting mgc and bap through a trained acoustic model is not supported,
        # they are taken from data extracted from the audio.
        if hparams.synth_acoustic_model_path is not None:
            raise NotImplementedError()

        world_dir = getattr(hparams, "world_dir", None)
        if world_dir is None:
            world_dir = os.path.realpath(
                os.path.join(hparams.out_dir, self.dir_extracted_acoustic_features))

        # Load extracted data.
        full_sample = WorldFeatLabelGen.load_sample(id_name,
                                                    world_dir,
                                                    add_deltas=False,
                                                    num_coded_sps=hparams.num_coded_sps,
                                                    num_bap=hparams.num_bap)

        # Match the length of the network output, trim the end first, then the front.
        len_diff = len(full_sample) - len(lf0)
        trim_front = len_diff // 2
        trim_end = len_diff - trim_front
        full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, trim_end)
        full_sample = WorldFeatLabelGen.trim_end_sample(full_sample, trim_front, reverse=True)

        # Overwrite lf0 and vuv by network output.
        lf0_index = hparams.num_coded_sps
        full_sample[:, lf0_index] = lf0
        full_sample[:, lf0_index + 1] = vuv

        # Fill a dictionary with the samples.
        full_output[id_name + "_E2E_Phrase"] = full_sample

    # Run the vocoder.
    ModelTrainer.synthesize(self, id_list, full_output, hparams)
def create_hparams(hparams_string=None, verbose=False):
    """Create the hyper-parameter container extended by thetas, k, min_atom_amp and num_questions.

    :param hparams_string:  Forwarded to ModelTrainer.create_hparams.
    :param verbose:         If True, log the debug string of the final container.
    :return:                The hyper-parameter container.
    """
    hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

    atom_defaults = {
        "thetas": None,
        "k": None,
        "min_atom_amp": 0.3,
        "num_questions": None,
    }
    hparams.add_hparams(**atom_defaults)

    if verbose:
        logging.info(hparams.get_debug_string())

    return hparams
def create_hparams(hparams_string=None, verbose=False):
    """Create the hyper-parameter container extended by the phoneme label settings.

    :param hparams_string:  Forwarded to ModelTrainer.create_hparams.
    :param verbose:         If True, log the debug string of the final container.
    :return:                The hyper-parameter container.
    """
    hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

    phoneme_defaults = {
        "min_phoneme_length": 50000,
        # Specifies the format in which the .lab files are stored.
        # Refer to PhonemeLabelGen.load_sample for a list of possible types.
        "phoneme_label_type": "HTK full",
    }
    hparams.add_hparams(**phoneme_defaults)

    if verbose:
        logging.info(hparams.get_debug_string())

    return hparams
def test_get_item(self):
    """Indexing the model must return the same layers as indexing its layer groups."""
    num_emb, emb_dim = 3, 12
    in_dim, out_dim = 42, 12

    hparams = ModelTrainer.create_hparams()
    hparams.add_hparam("f_get_emb_index", [lambda x: 0])
    hparams.model_type = "RNNDYN-{}x{}_EMB_(0, 3, 5, 7)-5_RELU_128-3_BiLSTM_32-1_FC_12".format(
        num_emb, emb_dim)

    model = ModelFactory.create(hparams.model_type, (in_dim, ), out_dim, hparams)
    groups = model.layer_groups

    self.assertEqual(groups[0][1], model[1])
    self.assertEqual(groups[1][0], model[3])
    self.assertEqual(groups[2][0], model[6])
def decollate_network_output(output, hidden, seq_lengths=None, permutation=None, batch_first=True):
    """Move the channel axis of the network output to the end and split the batch.

    Output of r9y9 Wavenet has batch first, thus output is B x C x T. It is
    rearranged to B x T x C if batch_first is set and to T x B x C otherwise,
    before being given to ModelTrainer.split_batch.

    :return:  The return value of ModelTrainer.split_batch.
    """
    if batch_first:
        # B x C x T --> B x T x C
        axes = (0, 2, 1)
    else:
        # B x C x T --> T x B x C
        axes = (2, 0, 1)
    rearranged = np.transpose(output, axes)

    return ModelTrainer.split_batch(rearranged,
                                    hidden,
                                    seq_length_output=seq_lengths,
                                    permutation=permutation,
                                    batch_first=batch_first)
def _get_hparams(self):
    """Create the hyper-parameter container used by the tests of this class.

    The output directory is a folder named after the test class, located next
    to this file.
    """
    hparams = ModelTrainer.create_hparams()

    # General parameters
    hparams.add_hparam("num_questions", 409)
    hparams.test_set_perc = 0.05
    hparams.val_set_perc = 0.05
    hparams.optimiser_args["lr"] = 0.02
    hparams.seed = None  # Remove the default seed.
    hparams.num_coded_sps = 20

    this_dir = os.path.dirname(os.path.realpath(__file__))
    class_name = type(self).__name__
    hparams.out_dir = os.path.join(this_dir, class_name)

    # Training parameters.
    hparams.epochs = 0
    hparams.model_name = "test_model.nn"

    return hparams
def test_embeddings(self):
    """Check weight shapes and output shapes of a model with embeddings in selected layers.

    The forward pass is run once with a batch of two sequences of different
    length and once with a single sequence.
    """
    num_emb, emb_dim = 3, 12
    in_dim = 42  # Contains the embedding index.
    out_dim = 12

    hparams = ModelTrainer.create_hparams()
    hparams.variable_sequence_length_train = True
    hparams.add_hparam("f_get_emb_index", [lambda x: 0])
    hparams.model_type = "RNNDYN-{}x{}_EMB_(0, 3, 5, 7)-5_RELU_128-3_BiLSTM_32-1_FC_12".format(
        num_emb, emb_dim)

    model = ModelFactory.create(hparams.model_type, (in_dim, ), out_dim, hparams)

    # A single embedding group with the requested size.
    self.assertEqual(1, len(model.emb_groups))
    self.assertEqual(torch.Size([num_emb, emb_dim]),
                     model.emb_groups[0].weight.shape)

    # Expected input widths of the checked layers.
    self.assertEqual(torch.Size([128, in_dim - 1 + emb_dim]),
                     model[0].weight.shape)
    self.assertEqual(torch.Size([128, 128]),
                     model[2].weight.shape)
    self.assertEqual(torch.Size([128, 128 + emb_dim]),
                     model[3].weight.shape)
    self.assertEqual(torch.Size([32 * 4, 128 + emb_dim]),
                     model[5].weight_ih_l0.shape)
    self.assertEqual(torch.Size([32 * 4, 32 * 2 + emb_dim]),
                     model[7].weight_ih_l0_reverse.shape)

    # Forward pass with two sequences and with a single sequence.
    for lengths in ((100, 75), (100, )):
        seq_length = torch.tensor(lengths, dtype=torch.long)
        batch_size = len(lengths)
        test_input = torch.ones([seq_length[0], batch_size, in_dim])

        model.init_hidden(batch_size)
        output = model(test_input, None, seq_length, seq_length[0])

        self.assertEqual(torch.Size([seq_length[0], batch_size, out_dim]),
                         output[0].shape)
def create_hparams(hparams_string=None, verbose=False):
    """Create model hyper parameter container. Parse non default from given string.

    :param hparams_string:  Forwarded to ModelTrainer.create_hparams.
    :param verbose:         If True, log the debug string of the final container.
    :return:                The hyper-parameter container.
    """
    hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)

    acoustic_defaults = {
        "num_questions": None,
        "question_file": None,  # Used to add labels in plot.
        "num_coded_sps": 60,
        "sp_type": "mcep",
        "add_deltas": True,
        "synth_load_org_sp": False,
        "synth_load_org_lf0": False,
        "synth_load_org_vuv": False,
        "synth_load_org_bap": False,
    }
    hparams.add_hparams(**acoustic_defaults)

    if verbose:
        logging.info(hparams.get_debug_string())

    return hparams
def gen_figure_phrase(self, hparams, ids_input):
    """Plot the stored phrase curve of each id against the original LF0.

    For every id the original LF0 and V/UV are loaded from the WORLD features,
    the phrase curve is read from the label directory of the atom trainer, the
    F0 RMSE between both on voiced frames is logged, and a figure is saved to
    ``<hparams.out_dir>/<id>.<model name>.PHRASE<hparams.gen_figure_ext>``.

    :param hparams:    Hyper-parameter container.
    :param ids_input:  Ids in any format accepted by ModelTrainer._input_to_str_list.
    """
    id_list = ModelTrainer._input_to_str_list(ids_input)

    _, model_output_post = self._forward_batched(hparams,
                                                 id_list,
                                                 hparams.batch_size_gen_figure,
                                                 synth=False,
                                                 benchmark=False,
                                                 gen_figure=False)

    for id_name, outputs_post in model_output_post.items():

        if outputs_post.ndim < 2:
            outputs_post = np.expand_dims(outputs_post, axis=1)

        lf0 = outputs_post[:, 0]
        # Only the length of the interpolated network LF0 is used below.
        output_lf0, _ = interpolate_lin(lf0)
        # Binarise the predicted V/UV in-place. The boolean copy made here
        # before was never read and has been removed.
        output_vuv = outputs_post[:, 1]
        output_vuv[output_vuv < 0.5] = 0.0
        output_vuv[output_vuv >= 0.5] = 1.0

        # Load original lf0 and vuv.
        world_dir = hparams.world_dir if hasattr(hparams, "world_dir") and hparams.world_dir is not None\
            else os.path.join(hparams.out_dir, self.dir_extracted_acoustic_features)
        org_labels = WorldFeatLabelGen.load_sample(
            id_name,
            world_dir,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap)[:len(output_lf0)]
        _, original_lf0, original_vuv, _ = WorldFeatLabelGen.convert_to_world_features(
            org_labels,
            num_coded_sps=hparams.num_coded_sps,
            num_bap=hparams.num_bap)
        original_lf0, _ = interpolate_lin(original_lf0)
        # The np.bool alias was removed in NumPy 1.24, the builtin bool is equivalent.
        original_vuv = original_vuv.astype(bool)

        phrase_curve = np.fromfile(
            os.path.join(self.flat_trainer.atom_trainer.OutputGen.dir_labels,
                         id_name + self.OutputGen.ext_phrase),
            dtype=np.float32).reshape(-1, 1)[:len(original_lf0)]

        # RMSE in Hz computed on the voiced frames of the original only.
        f0_mse = (np.exp(original_lf0.squeeze(-1)) - np.exp(phrase_curve.squeeze(-1)))**2
        voiced = original_vuv[:len(output_lf0)]
        f0_rmse = math.sqrt((f0_mse * voiced).sum() / voiced.sum())
        self.logger.info("RMSE of {} phrase curve: {} Hz.".format(id_name, f0_rmse))

        # NOTE(review): the second trim always removes one frame more than the
        # first one, kept as in the original implementation - confirm intent.
        len_diff = len(original_lf0) - len(lf0)
        original_lf0 = WorldFeatLabelGen.trim_end_sample(
            original_lf0, int(len_diff / 2.0))
        original_lf0 = WorldFeatLabelGen.trim_end_sample(
            original_lf0, int(len_diff / 2.0) + 1, reverse=True)

        # Get a data plotter.
        net_name = os.path.basename(hparams.model_name)
        filename = str(os.path.join(hparams.out_dir, id_name + '.' + net_name))
        plotter = DataPlotter()

        grid_idx = 0
        graphs_lf0 = [(original_lf0, "Original"), (phrase_curve, "Predicted")]
        plotter.set_data_list(grid_idx=grid_idx, data_list=graphs_lf0)
        # Mark the unvoiced frames of the reference.
        plotter.set_area_list(grid_idx=grid_idx,
                              area_list=[(np.invert(original_vuv), '0.8', 1.0, 'Reference unvoiced')])
        plotter.set_label(grid_idx=grid_idx,
                          xlabel='frames [' + str(hparams.frame_size_ms) + ' ms]',
                          ylabel='LF0')
        plotter.set_lim(grid_idx=grid_idx, ymin=4.2, ymax=5.4)

        plotter.gen_plot()
        plotter.save_to_file(filename + ".PHRASE" + hparams.gen_figure_ext)
def create_hparams(hparams_string=None, verbose=False):
    """Create model hyper-parameters. Parse non-default from given string.

    :param hparams_string:  Forwarded to ModelTrainer.create_hparams.
    :param verbose:         If True, log the debug string of the final container.
    :return:                The hyper-parameter container.
    """
    hparams = ModelTrainer.create_hparams(hparams_string, verbose=False)
    hparams.synth_vocoder = "raw"

    audio_defaults = {
        "batch_first": True,
        "frame_rate_output_Hz": 16000,
        "mu": 255,
        "bit_depth": 16,
        # Beginning and end of audio below the threshold are trimmed.
        "silence_threshold_quantized": None,
        "teacher_forcing_in_test": True,
        "ema_decay": 0.9999,
        # Model parameters.
        "input_type": "mulaw-quantize",
        "hinge_regularizer": True,  # Only used in MoL prediction (input_type="raw").
        # Only used for mixture of logistic distributions.
        "log_scale_min": float(np.log(1e-14)),
        "quantize_channels": 256,  # 256 for input type mulaw-quantize, otherwise 65536
    }
    hparams.add_hparams(**audio_defaults)

    if hparams.input_type == "mulaw-quantize":
        out_channels = hparams.quantize_channels
    else:
        out_channels = 10 * 3  # num_mixtures * 3 (pi, mean, log_scale)
    hparams.add_hparam("out_channels", out_channels)

    network_defaults = {
        "layers": 24,  # 20
        "stacks": 4,  # 2
        "residual_channels": 512,
        "gate_channels": 512,
        "skip_out_channels": 256,
        "dropout": 1 - 0.95,
        "kernel_size": 3,
        "weight_normalization": True,
        "use_cond": True,  # Determines if conditioning is used.
        "cin_channels": 63,
        "upsample_conditional_features": False,
        "upsample_scales": [5, 4, 2],
    }
    hparams.add_hparams(**network_defaults)

    # With upsampling the multiplier is the product of all upsampling scales.
    hparams.len_in_out_multiplier = (reduce(mul, hparams.upsample_scales, 1)
                                     if hparams.upsample_conditional_features
                                     else 1)

    feature_defaults = {
        "freq_axis_kernel_size": 3,
        "gin_channels": -1,
        "n_speakers": 1,
        "use_speaker_embedding": False,
        "sp_type": "mcep",
        "load_sp": True,
        "load_lf0": True,
        "load_vuv": True,
        "load_bap": True,
    }
    hparams.add_hparams(**feature_defaults)

    if verbose:
        logging.info(hparams.get_debug_string())

    return hparams