def test_waveglow_training(self):
    """Integration test that trains a reduced-size WaveGlow model on the sample asr data.

    Builds the training DAG, runs 3 forward/backward SGD steps, and asserts that the
    loss after step 3 is smaller than the loss at the first step.
    """
    # Audio data layer and mel-spectrogram preprocessing front-end.
    audio_layer = nemo_tts.AudioDataLayer(
        manifest_filepath=self.manifest_filepath, n_segments=4000, batch_size=4, sample_rate=16000
    )
    mel_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
    )
    # Reduced-size WaveGlow flow model and its loss.
    waveglow = nemo_tts.WaveGlowNM(
        n_mel_channels=64,
        n_flows=6,
        n_group=4,
        n_early_every=4,
        n_early_size=2,
        n_wn_layers=4,
        n_wn_channels=256,
        wn_kernel_size=3,
        sample_rate=16000,
    )
    waveglow_loss = nemo_tts.WaveGlowLoss(sample_rate=16000)

    # DAG
    audio, audio_len = audio_layer()
    spec_target, _ = mel_preprocessor(input_signal=audio, length=audio_len)
    z, log_s_list, log_det_W_list = waveglow(mel_spectrogram=spec_target, audio=audio)
    loss_t = waveglow_loss(z=z, log_s_list=log_s_list, log_det_W_list=log_det_W_list)

    # Collect per-step losses so the final assertion can compare first vs. last.
    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )
    # Instantiate an optimizer to perform the `train` action.
    optimizer = PtActions()
    optimizer.train(
        [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.01}
    )
    # Assert that training loss went down.
    assert loss_list[-1] < loss_list[0]
# Build the training graph: CIFAR100 data -> frozen VGG16 encoder -> FFN classifier -> NLL loss.
with NeuralGraph(operation_mode=OperationMode.training) as training_graph:
    _, img, _, _, fine_target, _ = cifar100_dl()
    feat_map = image_encoder(inputs=img)
    res_img = reshaper(inputs=feat_map)
    logits = ffn(inputs=res_img)
    pred = nl(inputs=logits)
    loss = nll_loss(predictions=pred, targets=fine_target)
    # Set output - that output will be used for training.
    training_graph.outputs["loss"] = loss
    # Freeze the pretrained encoder.
    training_graph.freeze(["vgg16"])

logging.info(training_graph.summary())

# SimpleLossLoggerCallback will print loss values to console.
callback = SimpleLossLoggerCallback(
    tensors=[loss], print_func=lambda x: logging.info(f'Training Loss: {str(x[0].item())}')
)

# Invoke the "train" action.
nf.train(
    training_graph=training_graph,
    callbacks=[callback],
    optimization_params={"num_epochs": 10, "lr": 0.001},
    optimizer="adam",
)
data.noncategorical_slot_status, ] steps_per_epoch = math.ceil( len(datalayer) / (args.train_batch_size * args.num_gpus)) return steps_per_epoch, tensors steps_per_epoch, train_tensors = create_pipeline(dataset_split='train') logging.info(f'Steps per epoch: {steps_per_epoch}') # Create trainer and execute training action train_callback = SimpleLossLoggerCallback( tensors=train_tensors, print_func=lambda x: logging.info("Loss: {:.8f}".format(x[0].item())), get_tb_values=lambda x: [["loss", x[0]]], tb_writer=nf.tb_writer, step_freq=args.loss_log_freq if args.loss_log_freq > 0 else steps_per_epoch, ) def get_eval_callback(eval_dataset): _, eval_tensors = create_pipeline(dataset_split=eval_dataset) eval_callback = EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback( x, y, schema_preprocessor, eval_dataset), user_epochs_done_callback=lambda x: eval_epochs_done_callback( x, args.task_name, eval_dataset,
is_training=True, num_gpus=args.num_gpus, ) eval_tensors, _, _, eval_data_layer = create_pipeline( num_samples=args.num_eval_samples, batch_size=args.batch_size, data_prefix=args.eval_file_prefix, is_training=False, num_gpus=args.num_gpus, ) # Create callbacks for train and eval modes train_callback = SimpleLossLoggerCallback( tensors=train_tensors, print_func=lambda x: logging.info(str(round(x[0].item(), 3))), tb_writer=nf.tb_writer, get_tb_values=lambda x: [["loss", x[0]]], step_freq=train_steps_per_epoch, ) eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback( x, intents_label_ids=data_desc.intents_label_ids, slots_label_ids=data_desc.slots_label_ids, graph_fold=f'{nf.work_dir}/graphs', normalize_cm=True, ), tb_writer=nf.tb_writer,
def test_stft_conv_training(self):
    """Integration test that trains a small Jasper model on the sample asr data.

    test_stft_conv_training tests the torch_stft path while test_jasper_training tests
    the torch.stft path inside of AudioToMelSpectrogramPreprocessor. Runs 3
    forward/backward steps and asserts that the loss at step 3 is below the first-step loss.

    Note: Training is done with batch gradient descent as opposed to stochastic
    gradient descent due to CTC loss.
    """
    # Load the reduced Jasper architecture definition shipped with the test data.
    config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
    with open(config_path) as file:
        jasper_model_definition = self.yaml.load(file)

    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=30
    )
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
        'stft_conv': True,  # selects the conv-based (torch_stft) implementation under test
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Record the loss at every step so the trend can be asserted afterwards.
    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 3, "lr": 0.001},
    )
    self.nf.reset_trainer()
    # Assert that training loss went down.
    assert loss_list[-1] < loss_list[0]
def test_contextnet_ctc_training(self):
    """Integration test that trains a small ContextNet model on the sample asr data.

    Runs 3 forward/backward steps and asserts that the loss after step 3 is smaller
    than the loss at the first step.

    Note: Training is done with batch gradient descent as opposed to stochastic
    gradient descent due to CTC loss.

    Checks SE-block with fixed context size and global context,
    residual_mode='stride_add' and 'stride_last' flags.
    """
    # Load the reduced ContextNet architecture definition shipped with the test data.
    config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/contextnet_32.yaml"))
    with open(config_path) as f:
        contextnet_model_definition = self.yaml.load(f)

    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=30
    )
    pre_process_params = {
        'frame_splicing': 1,
        'features': 80,
        'window_size': 0.025,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    spec_aug = nemo_asr.SpectrogramAugmentation(**contextnet_model_definition['SpectrogramAugmentation'])
    contextnet_encoder = nemo_asr.ContextNetEncoder(
        feat_in=contextnet_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **contextnet_model_definition['ContextNetEncoder'],
    )
    contextnet_decoder = nemo_asr.ContextNetDecoderForCTC(
        feat_in=32, hidden_size=16, num_classes=len(self.labels)
    )
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    processed_signal = spec_aug(input_spec=processed_signal)
    encoded, encoded_len = contextnet_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = contextnet_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Record the loss at every step so the trend can be asserted afterwards.
    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 3, "lr": 0.001},
    )
    self.nf.reset_trainer()
    # Assert that training loss went down.
    assert loss_list[-1] < loss_list[0]
mode="train") # evaluation pipelines eval_tensors = create_pipeline(eval_examples, args.eval_batch_size, mode="eval") def print_loss(x): loss = x[0].item() logging.info("Training loss: {:.4f}".format(loss)) # callbacks callback_train = SimpleLossLoggerCallback( tensors=[train_tensors[0]], step_freq=100, print_func=print_loss, get_tb_values=lambda x: [["loss", x[0]]], tb_writer=nf.tb_writer, ) callbacks = [callback_train] # for eval_examples in args.eval_file_preprocessed: callback_eval = EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback( x, y, tokenizer), user_epochs_done_callback=eval_epochs_done_callback, eval_step=args.eval_freq, tb_writer=nf.tb_writer, )
def test_tacotron2_training(self):
    """Integration test that trains a smaller Tacotron2 model on the sample asr data.

    Runs 3 forward/backward SGD steps and asserts that the loss after step 3 is
    smaller than the loss at the first step.
    """
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4
    )
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
        log_zero_guard_type="clamp",
        log_zero_guard_value=1e-05,
    )
    # Tacotron2 sub-modules, sized down for a fast test run.
    text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
    t2_enc = nemo_tts.Tacotron2Encoder(
        encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256
    )
    t2_dec = nemo_tts.Tacotron2Decoder(
        n_mel_channels=64,
        n_frames_per_step=1,
        encoder_embedding_dim=256,
        gate_threshold=0.5,
        prenet_dim=128,
        max_decoder_steps=1000,
        decoder_rnn_dim=512,
        p_decoder_dropout=0.1,
        p_attention_dropout=0.1,
        attention_rnn_dim=512,
        attention_dim=64,
        attention_location_n_filters=16,
        attention_location_kernel_size=15,
    )
    t2_postnet = nemo_tts.Tacotron2Postnet(
        n_mel_channels=64, postnet_embedding_dim=256, postnet_kernel_size=5, postnet_n_convolutions=3
    )
    t2_loss = nemo_tts.Tacotron2Loss()
    makegatetarget = nemo_tts.MakeGate()

    # DAG
    audio, audio_len, transcript, transcript_len = data_layer()
    spec_target, spec_target_len = preprocessing(input_signal=audio, length=audio_len)
    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded, embedding_length=transcript_len)
    mel_decoder, gate, _ = t2_dec(
        char_phone_encoded=transcript_encoded, encoded_length=transcript_len, mel_target=spec_target
    )
    mel_postnet = t2_postnet(mel_input=mel_decoder)
    gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len)
    loss_t = t2_loss(
        mel_out=mel_decoder,
        mel_out_postnet=mel_postnet,
        gate_out=gate,
        mel_target=spec_target,
        gate_target=gate_target,
        target_len=spec_target_len,
        seq_len=audio_len,
    )

    # Record the loss at every step so the trend can be asserted afterwards.
    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )
    # Instantiate an optimizer to perform the `train` action.
    optimizer = PtActions()
    optimizer.train(
        [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.01}
    )
    # Assert that training loss went down.
    assert loss_list[-1] < loss_list[0]
def test_fastspeech(self):
    """Integration test that instantiates a FastSpeech model and tests training with the
    sample asr data.

    Note instantiating the FastSpeech model additionally requires creating speech
    durations, which additionally tests NeuralModuleFactory.infer(). Training is run
    for 3 forward and backward steps and asserts that loss after 3 steps is smaller
    than the loss at the first step.
    """
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=1,
        shuffle=False,
        sample_rate=16000,
    )
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
        pad_to=0,
        log_zero_guard_type="clamp",
        log_zero_guard_value=1e-05,
    )
    data = data_layer()
    spec, spec_length = data_preprocessor(input_signal=data.audio_signal, length=data.a_sig_length)

    # Creates and saves durations as numpy arrays.
    durs_dir = pathlib.Path('tests/data/asr/durs')
    # parents=True keeps the test robust if intermediate directories are absent.
    durs_dir.mkdir(parents=True, exist_ok=True)
    result = self.nf.infer([data.transcripts, data.transcript_length, spec_length, spec])
    for k, (text, text_len, mel_len, _mel) in enumerate(zip(result[0], result[1], result[2], result[3])):
        text = text.cpu().numpy()[0][: text_len.cpu().numpy()[0]]
        # np.long was deprecated in NumPy 1.20 and removed in 1.24; np.int64 matches
        # what the old alias resolved to for these duration counts.
        dur = np.zeros(text.shape[0], dtype=np.int64)
        dur_sum = mel_len.cpu().numpy()[0] + 1  # TODO: delete `+1`
        # Place all frames on the first symbol except a fixed tail of 4 on the second.
        dur[0] = dur_sum - 4
        dur[1] = 4
        np.save(durs_dir / f'{k}.npy', dur, allow_pickle=False)

    data_layer = nemo_tts.FastSpeechDataLayer(
        manifest_filepath=self.manifest_filepath,
        durs_dir=durs_dir,
        labels=self.labels,
        batch_size=4,
        sample_rate=16000,
    )
    fastspeech = nemo_tts.FastSpeech(
        decoder_output_size=384,
        n_mels=64,
        max_seq_len=2048,
        word_vec_dim=384,
        encoder_n_layer=6,
        encoder_head=2,
        encoder_conv1d_filter_size=1536,
        decoder_n_layer=6,
        decoder_head=2,
        decoder_conv1d_filter_size=1536,
        fft_conv1d_kernel=3,
        fft_conv1d_padding=1,
        encoder_output_size=384,
        duration_predictor_filter_size=256,
        duration_predictor_kernel_size=3,
        dropout=0.1,
        alpha=1.0,
        n_src_vocab=len(self.labels),
        pad_id=0,
    )
    loss = nemo_tts.FastSpeechLoss()

    # DAG: reuse the preprocessor on the FastSpeech data layer's audio.
    data = data_layer()
    mel_true, _ = data_preprocessor(input_signal=data.audio, length=data.audio_len)
    mel_pred, dur_pred = fastspeech(
        text=data.text,
        text_pos=data.text_pos,
        mel_true=mel_true,
        dur_true=data.dur_true,
    )
    loss_t = loss(
        mel_true=mel_true,
        mel_pred=mel_pred,
        dur_true=data.dur_true,
        dur_pred=dur_pred,
        text_pos=data.text_pos,
    )

    # Record the loss at every step so the trend can be asserted afterwards.
    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss_t], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )
    # Instantiate an optimizer to perform the `train` action.
    optimizer = PtActions()
    optimizer.train(
        [loss_t], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.0003}
    )
    # Assert that training loss went down.
    assert loss_list[-1] < loss_list[0]
) eval_tensors, _, _, eval_data_layer = create_pipeline( num_samples=args.num_eval_samples, batch_size=args.batch_size, data_prefix=args.eval_file_prefix, is_training=False, num_gpus=args.num_gpus, ) # Create callbacks for train and eval modes train_callback = SimpleLossLoggerCallback( tensors=train_tensors, print_func=lambda x: logging.info( f'Total Loss:{str(round(x[0].item(), 3))}, ' f'Intent Loss:{str(round(x[1].item(), 3))}, ' f'Slot Tagging Loss:{str(round(x[2].item(), 3))}'), tb_writer=nf.tb_writer, get_tb_values=lambda x: [["total_loss", x[0]], ["intent_loss", x[1]], ["slot_loss", x[2]]], step_freq=train_steps_per_epoch, ) eval_callback = nemo.core.EvaluatorCallback( eval_tensors=eval_tensors, user_iter_callback=lambda x, y: eval_iter_callback(x, y), user_epochs_done_callback=lambda x: eval_epochs_done_callback( x, intents_label_ids=data_desc.intents_label_ids, slots_label_ids=data_desc.slots_label_ids, graph_fold=f'{nf.work_dir}/graphs', normalize_cm=True,