def create_NMs(tacotron2_params, decoder_infer=False):
    """Instantiate the Tacotron 2 neural modules from a parsed hyperparameter dict.

    When ``decoder_infer`` is True, the inference-time decoder (Tacotron2DecoderInfer) is built
    in place of the training decoder.
    """
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **tacotron2_params["AudioToMelSpectrogramPreprocessor"]
    )
    text_embedding = nemo_tts.TextEmbedding(
        len(tacotron2_params["labels"]) + 3, **tacotron2_params["TextEmbedding"],  # + 3 special chars
    )
    t2_enc = nemo_tts.Tacotron2Encoder(**tacotron2_params["Tacotron2Encoder"])
    if decoder_infer:
        t2_dec = nemo_tts.Tacotron2DecoderInfer(**tacotron2_params["Tacotron2Decoder"])
    else:
        t2_dec = nemo_tts.Tacotron2Decoder(**tacotron2_params["Tacotron2Decoder"])
    t2_postnet = nemo_tts.Tacotron2Postnet(**tacotron2_params["Tacotron2Postnet"])
    t2_loss = nemo_tts.Tacotron2Loss(**tacotron2_params["Tacotron2Loss"])
    makegatetarget = nemo_tts.MakeGate()

    total_weights = text_embedding.num_weights + t2_enc.num_weights + t2_dec.num_weights + t2_postnet.num_weights

    logging.info('================================')
    logging.info(f"Total number of parameters: {total_weights}")
    logging.info('================================')

    return (
        data_preprocessor,
        text_embedding,
        t2_enc,
        t2_dec,
        t2_postnet,
        t2_loss,
        makegatetarget,
    )
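
# Usage sketch (assumption; not part of the file above): create_NMs expects the hyperparameter
# dictionary that the Tacotron 2 example script reads from its YAML config. The config path and the
# load_tacotron2_params helper below are illustrative only; create_NMs itself is the function defined above.
from ruamel.yaml import YAML


def load_tacotron2_params(config_path="configs/tacotron2.yaml"):
    # Hypothetical helper: parse the YAML hyperparameters into the dict that create_NMs consumes.
    yaml = YAML(typ="safe")
    with open(config_path) as f:
        return yaml.load(f)


# Build training-mode modules; pass decoder_infer=True to build Tacotron2DecoderInfer instead.
(data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, t2_loss, makegatetarget) = create_NMs(
    load_tacotron2_params()
)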

def test_tacotron2_training(self):
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4,
    )
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
    )
    text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
    t2_enc = nemo_tts.Tacotron2Encoder(
        encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256,
    )
    t2_dec = nemo_tts.Tacotron2Decoder(
        n_mel_channels=64,
        n_frames_per_step=1,
        encoder_embedding_dim=256,
        gate_threshold=0.5,
        prenet_dim=128,
        max_decoder_steps=1000,
        decoder_rnn_dim=512,
        p_decoder_dropout=0.1,
        p_attention_dropout=0.1,
        attention_rnn_dim=512,
        attention_dim=64,
        attention_location_n_filters=16,
        attention_location_kernel_size=15,
    )
    t2_postnet = nemo_tts.Tacotron2Postnet(
        n_mel_channels=64, postnet_embedding_dim=256, postnet_kernel_size=5, postnet_n_convolutions=3,
    )
    t2_loss = nemo_tts.Tacotron2Loss()
    makegatetarget = nemo_tts.MakeGate()

    # DAG
    audio, audio_len, transcript, transcript_len = data_layer()
    spec_target, spec_target_len = preprocessing(input_signal=audio, length=audio_len)

    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(
        char_phone_embeddings=transcript_embedded, embedding_length=transcript_len,
    )
    mel_decoder, gate, _ = t2_dec(
        char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target,
    )
    mel_postnet = t2_postnet(mel_input=mel_decoder)
    gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len)
    loss_t = t2_loss(
        mel_out=mel_decoder,
        mel_out_postnet=mel_postnet,
        gate_out=gate,
        mel_target=spec_target,
        gate_target=gate_target,
        target_len=spec_target_len,
        seq_len=audio_len,
    )

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t], print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
    )
    # Instantiate an optimizer to perform `train` action
    optimizer = nemo.backends.pytorch.actions.PtActions()
    optimizer.train(
        [loss_t],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 10, "lr": 0.0003},
    )

def test_tacotron2_training(self):
    """Integration test that instantiates a smaller Tacotron2 model and tests training with the
    sample asr data.
    Training is run for 3 forward and backward steps and asserts that loss after 3 steps is smaller
    than the loss at the first step.
    """
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4
    )
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
        log_zero_guard_type="clamp",
        log_zero_guard_value=1e-05,
    )
    text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
    t2_enc = nemo_tts.Tacotron2Encoder(encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256)
    t2_dec = nemo_tts.Tacotron2Decoder(
        n_mel_channels=64,
        n_frames_per_step=1,
        encoder_embedding_dim=256,
        gate_threshold=0.5,
        prenet_dim=128,
        max_decoder_steps=1000,
        decoder_rnn_dim=512,
        p_decoder_dropout=0.1,
        p_attention_dropout=0.1,
        attention_rnn_dim=512,
        attention_dim=64,
        attention_location_n_filters=16,
        attention_location_kernel_size=15,
    )
    t2_postnet = nemo_tts.Tacotron2Postnet(
        n_mel_channels=64, postnet_embedding_dim=256, postnet_kernel_size=5, postnet_n_convolutions=3
    )
    t2_loss = nemo_tts.Tacotron2Loss()
    makegatetarget = nemo_tts.MakeGate()

    # DAG
    audio, audio_len, transcript, transcript_len = data_layer()
    spec_target, spec_target_len = preprocessing(input_signal=audio, length=audio_len)

    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len)
    mel_decoder, gate, _ = t2_dec(
        char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target
    )
    mel_postnet = t2_postnet(mel_input=mel_decoder)
    gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len)
    loss_t = t2_loss(
        mel_out=mel_decoder,
        mel_out_postnet=mel_postnet,
        gate_out=gate,
        mel_target=spec_target,
        gate_target=gate_target,
        target_len=spec_target_len,
        seq_len=audio_len,
    )

    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )

    # Instantiate an optimizer to perform `train` action
    optimizer = PtActions()
    optimizer.train(
        [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.01}
    )

    # Assert that training loss went down
    assert loss_list[-1] < loss_list[0]
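
# The SimpleLossLoggerCallback above delegates to a print_and_log_loss helper that is not shown in
# this excerpt. The sketch below is an assumption reconstructed from the call site: it must accept
# the callback's evaluated tensor list plus the loss_log_list keyword, log the scalar loss, and
# append it so the final assertion can compare the first and last steps.
@staticmethod
def print_and_log_loss(loss_tensor, loss_log_list):
    # loss_tensor is the list of tensors handed to print_func by SimpleLossLoggerCallback.
    loss_value = loss_tensor[0].item()
    logging.info(f'Train Loss: {str(loss_value)}')
    loss_log_list.append(loss_value)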