def test_trim_silence(self):
    """Check that silence trimming never makes an utterance longer.

    Builds two data layers over the same manifest — one with
    ``trim_silence=True`` — and asserts, sample by sample, that the
    trimmed audio length is <= the untrimmed length.
    """
    batch_size = 4
    shared_kwargs = dict(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=batch_size,
        # placement=DeviceType.GPU,
        drop_last=True,
        shuffle=False,
    )
    normal_dl = nemo_asr.AudioToTextDataLayer(**shared_kwargs)
    trimmed_dl = nemo_asr.AudioToTextDataLayer(trim_silence=True, **shared_kwargs)
    for norm_batch, trim_batch in zip(normal_dl.data_iterator, trimmed_dl.data_iterator):
        # Index 1 of each batch holds the per-sample audio lengths.
        for sample_idx in range(batch_size):
            self.assertTrue(norm_batch[1][sample_idx].data >= trim_batch[1][sample_idx].data)
def wav_to_text(self, manifest, greedy=True):
    """Transcribe the audio referenced by *manifest*.

    Args:
        manifest: path to a manifest file describing the audio to decode.
        greedy: when True, return the post-processed greedy-decoder
            transcript; otherwise return the top beam-search hypothesis
            (requires ``self.ENABLE_NGRAM``).

    Returns:
        The decoded transcript.
    """
    data_layer = nemo_asr.AudioToTextDataLayer(
        shuffle=False, manifest_filepath=manifest, labels=self.labels, batch_size=1)
    audio_signal, audio_signal_len, transcript, transcript_len = data_layer()
    log_probs, encoded_len = self.asr_model(input_signal=audio_signal, length=audio_signal_len)
    predictions = self.greedy_decoder(log_probs=log_probs)
    eval_tensors = [predictions]
    beam_predictions = None
    if self.ENABLE_NGRAM:
        print('Running with beam search')
        beam_predictions = self.beam_search_with_lm(
            log_probs=log_probs, log_probs_length=encoded_len)
        eval_tensors.append(beam_predictions)
    tensors = self.neural_factory.infer(tensors=eval_tensors)
    if greedy:
        prediction = post_process_predictions(tensors[0], self.labels)
    else:
        # Top beam hypothesis: tensors[0] -> batch -> sample -> (score, text).
        prediction = tensors[0][0][0][0][1]
    # Explicitly drop graph references so repeated calls do not accumulate state.
    del data_layer
    del eval_tensors
    # BUG FIX: `beam_predictions` is only created when ENABLE_NGRAM is set;
    # the original unconditional `del` raised NameError on the greedy-only path.
    if beam_predictions is not None:
        del beam_predictions
    del predictions
    del tensors
    del audio_signal, audio_signal_len, transcript, transcript_len
    del log_probs, encoded_len
    return prediction
def wrong():
    """Build a Jasper DAG whose decoder is wired to the wrong tensor.

    NOTE(review): the decoder consumes `processed_signal` instead of the
    encoder output — given the function name this appears deliberate
    (a negative/failure-path test); confirm before "fixing" the wiring.
    Relies on `self` captured from the enclosing scope.
    """
    with open("tests/data/jasper_smaller.yaml") as file:
        config = self.yaml.load(file)
    vocab = config['labels']
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=vocab, batch_size=4,
    )
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **config['AudioToMelSpectrogramPreprocessor'])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=config['AudioToMelSpectrogramPreprocessor']['features'],
        **config['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(vocab))
    # DAG definition
    audio_signal, audio_signal_len, transcript, transcript_len = dl()
    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)
    spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
    aug_signal = spec_augment(input_spec=processed_signal)
    encoded, encoded_len = jasper_encoder(
        audio_signal=aug_signal, length=processed_signal_len)
    # Intentionally-wrong connection (see docstring).
    log_probs = jasper_decoder(encoder_output=processed_signal)
def test_quartznet_model_training(self):
    """Integration test: instantiate a small ASRConvCTCModel from the
    jasper_an4 config and train on the sample ASR data.

    Runs 3 forward/backward steps and asserts the loss after step 3 is
    smaller than the loss at step 1. Training uses batch gradient descent
    (rather than stochastic GD) because of the CTC loss.
    """
    config_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "../../examples/asr/configs/jasper_an4.yaml"))
    with open(config_path) as file:
        model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=30)
    model = nemo_asr.models.ASRConvCTCModel(
        preprocessor_params=model_definition['AudioToMelSpectrogramPreprocessor'],
        encoder_params=model_definition['JasperEncoder'],
        decoder_params=model_definition['JasperDecoderForCTC'],
    )
    model.train()
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    log_probs, encoded_len = model(input_signal=audio_signal, length=a_sig_length)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )
    loss_list = []
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=partial(self.print_and_log_loss, loss_log_list=loss_list),
        step_freq=1)
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 3, "lr": 0.001},
    )
    self.nf.reset_trainer()
    # Loss after 3 steps must be lower than at the first step.
    assert loss_list[-1] < loss_list[0]
def create_dag(args, cfg, num_gpus):
    """Assemble training and evaluation DAGs for decoder-RNN LM training.

    Returns:
        ((train_loss, evals), updated cfg, [tf_callback, saver_callback]).

    NOTE(review): `num_gpus` is currently unused — confirm with callers
    before removing it.
    """
    # Data layers and decoder module.
    data = nemo_asr.TranscriptDataLayer(
        path=args.train_dataset,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        pad_id=cfg['target']['pad_id'],
        batch_size=cfg['optimization']['batch_size'],
        drop_last=True,
    )
    data_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=cfg['target']['labels'],
        eos_id=cfg['target']['eos_id'],
        batch_size=cfg['inference']['batch_size'],
        load_audio=False,
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN'],
    )
    # Schedule bookkeeping.
    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = int(num_data / batch_size)
    total_steps = num_epochs * steps_per_epoch
    # Constant teacher-forcing policy over the whole run.
    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder,
        'teacher_forcing',
        policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0)],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef'],
    )
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[decoder],
        folder=args.checkpoint_dir,
        step_freq=args.checkpoint_save_freq,
    )
    # Train DAG.
    texts, _ = data()
    log_probs, _ = decoder(targets=texts)
    train_loss = seq_loss(log_probs=log_probs, targets=texts)
    # Eval DAG (single eval dataset).
    evals = []
    _, _, texts, _ = data_eval()
    log_probs, _ = decoder(targets=texts)
    eval_loss = seq_loss(log_probs=log_probs, targets=texts)
    evals.append((args.eval_datasets, (eval_loss, log_probs, texts)))
    # Record derived values back into the config.
    cfg['num_params'] = {'decoder': decoder.num_weights}
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps
    return (train_loss, evals), cfg, [tf_callback, saver_callback]
def recognize_speech():
    """Transcribe `args.wav_path` with a pretrained CTC model plus LM beam
    search, writing every beam candidate to `args.output_path`, one per line.
    """
    if torch.cuda.is_available():
        neural_factory = nemo.core.NeuralModuleFactory(
            placement=nemo.core.DeviceType.GPU,
            optimization_level=nemo.core.Optimization.mxprO1)
    else:
        neural_factory = nemo.core.NeuralModuleFactory(
            placement=nemo.core.DeviceType.CPU)
    # noinspection PyTypeChecker
    asr_model: ASRConvCTCModel = nemo_asr.models.ASRConvCTCModel.from_pretrained(
        model_info=args.am_path)
    asr_model.eval()
    beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
        vocab=asr_model.vocabulary,
        beam_width=args.beam_size,
        alpha=args.alpha,
        beta=args.beta,
        lm_path=args.lm_path,
        # BUG FIX: os.cpu_count() may return None, and max(None, 1) raises
        # TypeError on Python 3 — fall back to 1 before taking the max.
        num_cpus=max(os.cpu_count() or 1, 1))
    # Create a dummy single-entry manifest for the data layer.
    manifest_path = "manifest.transcription"
    with open(manifest_path, 'w') as f:
        f.write(
            json.dumps({
                "audio_filepath": args.wav_path,
                "duration": 18000,
                "text": "todo"
            }))
    data_layer = nemo_asr.AudioToTextDataLayer(
        shuffle=False,
        manifest_filepath=manifest_path,
        labels=asr_model.vocabulary,
        batch_size=args.batch_size)
    # Inference DAG.
    audio_signal, audio_signal_len, _, _ = data_layer()
    log_probs, encoded_len = asr_model(input_signal=audio_signal, length=audio_signal_len)
    beam_predictions = beam_search_with_lm(log_probs=log_probs, log_probs_length=encoded_len)
    eval_tensors = [beam_predictions]
    tensors = neural_factory.infer(
        tensors=eval_tensors, use_cache=False, cache=False, offload_to_cpu=True)
    batch = tensors[-1][0]
    prediction = batch[0]
    # Each hypothesis is a (score, text) pair; keep only the text.
    candidates = [candidate[1] for candidate in prediction]
    with open(args.output_path, 'w') as f:
        for candidate in candidates:
            f.write(candidate + "\n")
def test_freeze_unfreeze_TrainableNM(self):
    """Freeze the encoder, selectively unfreeze one conv weight, train for a
    few steps, then verify the still-frozen weight is unchanged while the
    unfrozen weight has moved."""
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
    with open(path) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    pre_process_params = {
        #'int_values': False,
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
    # Freeze everything, then re-enable gradients for one conv weight only.
    jasper_encoder.freeze()
    jasper_encoder.unfreeze(set(['encoder.4.mconv.0.conv.weight']))
    # Snapshot a frozen weight (block 1) and the unfrozen weight (block 4).
    frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.detach().cpu().numpy()
    unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.detach().cpu().numpy()
    # jasper_decoder.unfreeze()
    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    # logging.info(jasper_encoder)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
    )
    optimizer = self.nf.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 5, "lr": 0.0003},
    )
    new_frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.data
    new_unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.data
    self.assertTrue(np.array_equal(frozen_weight, new_frozen_weight.detach().cpu().numpy()))
    self.assertFalse(np.array_equal(unfrozen_weight, new_unfrozen_weight.detach().cpu().numpy()))
def test_freeze_unfreeze_TrainableNM(self):
    """Freeze the full encoder, unfreeze one conv weight plus the decoder,
    then run a short training loop over the assembled DAG."""
    with open("tests/data/jasper_smaller.yaml") as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    pre_process_params = {
        'int_values': False,
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
    # Freeze the encoder, then re-enable one weight and the whole decoder.
    jasper_encoder.freeze()
    jasper_encoder.unfreeze(set(['encoder.4.conv.1.weight']))
    jasper_decoder.unfreeze()
    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    # print(jasper_encoder)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
    )
    # Instantiate an optimizer to perform the `train` action.
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=None,
        create_tb_writer=False,
    )
    optimizer = neural_factory.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 2, "lr": 0.0003},
    )
def test_audio_preprocessors(self):
    """Smoke-test the spectrogram/MFCC/mel preprocessors on real batches.

    The torchaudio-backed preprocessors must raise ModuleNotFoundError when
    torchaudio is absent; when present, output feature dimensions are
    checked (mel=50, spec=n_fft//2+1=201, mfcc=15).
    """
    batch_size = 5
    dl = nemo_asr.AudioToTextDataLayer(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=batch_size,
        # placement=DeviceType.GPU,
        drop_last=True,
        shuffle=False,
    )
    installed_torchaudio = True
    try:
        import torchaudio
    except ModuleNotFoundError:
        installed_torchaudio = False
        # Without torchaudio, constructing these preprocessors must fail.
        with self.assertRaises(ModuleNotFoundError):
            to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        with self.assertRaises(ModuleNotFoundError):
            to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)
    if installed_torchaudio:
        to_spectrogram = nemo_asr.AudioToSpectrogramPreprocessor(n_fft=400, window=None)
        to_mfcc = nemo_asr.AudioToMFCCPreprocessor(n_mfcc=15)
    to_melspec = nemo_asr.AudioToMelSpectrogramPreprocessor(features=50)
    for batch in dl.data_iterator:
        input_signals, seq_lengths, _, _ = batch
        input_signals = input_signals.to(to_melspec._device)
        seq_lengths = seq_lengths.to(to_melspec._device)
        melspec = to_melspec.forward(input_signals, seq_lengths)
        if installed_torchaudio:
            spec = to_spectrogram.forward(input_signals, seq_lengths)
            mfcc = to_mfcc.forward(input_signals, seq_lengths)
        # Check that the number of features matches expectations.
        self.assertTrue(melspec[0].shape[1] == 50)
        if installed_torchaudio:
            self.assertTrue(spec[0].shape[1] == 201)  # n_fft // 2 + 1 bins
            self.assertTrue(mfcc[0].shape[1] == 15)
def wav_to_text(manifest, greedy=True):
    """Run the module-level inference DAG over *manifest* and return the
    transcript — post-processed greedy output or the top LM beam hypothesis.

    NOTE(review): relies on module-level globals (MODEL_YAML, ENABLE_NGRAM,
    data_preprocessor, jasper_encoder, jasper_decoder, greedy_decoder,
    beam_search_with_lm, neural_factory). When neither ENABLE_NGRAM nor
    `greedy` is set, `eval_tensors` is unbound — confirm callers never hit
    that combination.
    """
    from ruamel.yaml import YAML

    yaml = YAML(typ="safe")
    with open(MODEL_YAML) as f:
        jasper_model_definition = yaml.load(f)
    labels = jasper_model_definition['labels']
    # Instantiate necessary neural modules.
    data_layer = nemo_asr.AudioToTextDataLayer(
        shuffle=False, manifest_filepath=manifest, labels=labels, batch_size=1)
    # Define the inference DAG.
    audio_signal, audio_signal_len, _, _ = data_layer()
    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)
    encoded, encoded_len = jasper_encoder(
        audio_signal=processed_signal, length=processed_signal_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    if ENABLE_NGRAM:
        logging.info('Running with beam search')
        beam_predictions = beam_search_with_lm(
            log_probs=log_probs, log_probs_length=encoded_len)
        eval_tensors = [beam_predictions]
    if greedy:
        eval_tensors = [predictions]
    tensors = neural_factory.infer(tensors=eval_tensors)
    if greedy:
        from nemo.collections.asr.helpers import post_process_predictions

        prediction = post_process_predictions(tensors[0], labels)
    else:
        # Top beam hypothesis: batch -> sample -> (score, text).
        prediction = tensors[0][0][0][0][1]
    return prediction
def test_dataloader(self):
    """Check that the data layer yields full batches of size `batch_size`.

    With the num_workers update, tensors are produced on CPU; moving to GPU
    is handled by AudioPreprocessor, so device placement is no longer
    asserted here.
    """
    batch_size = 4
    dl = nemo_asr.AudioToTextDataLayer(
        # featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=batch_size,
        # placement=DeviceType.GPU,
        drop_last=True,
    )
    # FIX: dropped the unused `ind` from enumerate and collapsed four
    # identical assertions into a loop over the batch's four tensors.
    for data in dl.data_iterator:
        # The first dimension of every tensor in the batch is the batch size.
        for tensor in data[:4]:
            self.assertTrue(tensor.size(0) == batch_size)
def test_clas(self):
    """Integration test: build and briefly train a small attention-based
    encoder-decoder (CLAS-style) model — Jasper encoder + RNN decoder."""
    with open('examples/asr/experimental/configs/garnet_an4.yaml') as file:
        cfg = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
        'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    encoder = nemo_asr.JasperEncoder(
        jasper=cfg['encoder']['jasper'],
        activation=cfg['encoder']['activation'],
        feat_in=cfg['input']['train']['features'],
    )
    # Bridges the conv encoder's channel count to the RNN decoder's hidden size.
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['encoder']['jasper'][-1]['filters'],
        out_channels=cfg['decoder']['hidden_size'],
    )
    decoder = nemo.backends.pytorch.common.DecoderRNN(
        voc_size=len(self.labels),
        bos_id=0,
        hidden_size=cfg['decoder']['hidden_size'],
        attention_method=cfg['decoder']['attention_method'],
        attention_type=cfg['decoder']['attention_type'],
        in_dropout=cfg['decoder']['in_dropout'],
        gru_dropout=cfg['decoder']['gru_dropout'],
        attn_dropout=cfg['decoder']['attn_dropout'],
        teacher_forcing=cfg['decoder']['teacher_forcing'],
        curriculum_learning=cfg['decoder']['curriculum_learning'],
        rnn_type=cfg['decoder']['rnn_type'],
        n_layers=cfg['decoder']['n_layers'],
        tie_emb_out_weights=cfg['decoder']['tie_emb_out_weights'],
    )
    # FIX: the original did `loss = loss(...)`, rebinding the SequenceLoss
    # neural module's name to its output tensor; use distinct names.
    seq_loss_nm = nemo.backends.pytorch.common.SequenceLoss()
    # DAG
    audio_signal, a_sig_length, transcripts, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = encoder(audio_signal=processed_signal, length=p_length)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
    loss = seq_loss_nm(log_probs=log_probs, targets=transcripts)
    # Train
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(str(x[0].item())))
    # Instantiate an optimizer to perform the `train` action.
    optimizer = self.nf.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 10, "lr": 0.0003},
    )
def test_stft_conv(self):
    """Train a small Jasper model using the convolution-based STFT frontend
    (AudioToMelSpectrogramPreprocessor with ``stft_conv=True``)."""
    config_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
    with open(config_path) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
        'stft_conv': True,  # exercise the conv-based STFT code path
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    # logging.info(jasper_encoder)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(str(x[0].item())))
    # Instantiate an optimizer to perform the `train` action.
    optimizer = self.nf.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 10, "lr": 0.0003},
    )
def test_tacotron2_training(self):
    """Integration test: instantiate a smaller Tacotron2 model, train on the
    sample ASR data for 3 forward/backward steps, and assert the loss after
    step 3 is smaller than at step 1."""
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4
    )
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
        log_zero_guard_type="clamp",
        log_zero_guard_value=1e-05,
    )
    text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
    t2_enc = nemo_tts.Tacotron2Encoder(
        encoder_n_convolutions=2, encoder_kernel_size=5, encoder_embedding_dim=256)
    t2_dec = nemo_tts.Tacotron2Decoder(
        n_mel_channels=64,
        n_frames_per_step=1,
        encoder_embedding_dim=256,
        gate_threshold=0.5,
        prenet_dim=128,
        max_decoder_steps=1000,
        decoder_rnn_dim=512,
        p_decoder_dropout=0.1,
        p_attention_dropout=0.1,
        attention_rnn_dim=512,
        attention_dim=64,
        attention_location_n_filters=16,
        attention_location_kernel_size=15,
    )
    t2_postnet = nemo_tts.Tacotron2Postnet(
        n_mel_channels=64,
        postnet_embedding_dim=256,
        postnet_kernel_size=5,
        postnet_n_convolutions=3
    )
    t2_loss = nemo_tts.Tacotron2Loss()
    makegatetarget = nemo_tts.MakeGate()
    # DAG
    audio, audio_len, transcript, transcript_len = data_layer()
    spec_target, spec_target_len = preprocessing(input_signal=audio, length=audio_len)
    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(
        char_phone_embeddings=transcript_embedded, embedding_length=transcript_len)
    mel_decoder, gate, _ = t2_dec(
        char_phone_encoded=transcript_encoded,
        encoded_length=transcript_len,
        mel_target=spec_target
    )
    mel_postnet = t2_postnet(mel_input=mel_decoder)
    gate_target = makegatetarget(mel_target=spec_target, target_len=spec_target_len)
    loss_t = t2_loss(
        mel_out=mel_decoder,
        mel_out_postnet=mel_postnet,
        gate_out=gate,
        mel_target=spec_target,
        gate_target=gate_target,
        target_len=spec_target_len,
        seq_len=audio_len,
    )
    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss_t],
        print_func=partial(self.print_and_log_loss, loss_log_list=loss_list),
        step_freq=1
    )
    # Instantiate an optimizer to perform the `train` action.
    optimizer = PtActions()
    optimizer.train(
        [loss_t],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 3, "lr": 0.01}
    )
    # Assert that training loss went down.
    assert loss_list[-1] < loss_list[0]
def create_all_dags(args, neural_factory):
    '''Create train and eval DAGs as well as their callbacks.

    Returns:
        (train loss tensor, callbacks list, steps_per_epoch).
    '''
    # Parse the model config.
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)
    try:
        vocab = quartz_params['labels']
        sample_rate = quartz_params['sample_rate']
    except KeyError:
        logging.error("Please make sure you are using older config format (the ones with -old suffix)")
        exit(1)

    # Calculate num_workers for the dataloaders.
    # BUG FIX: os.cpu_count() may return None (documented behavior); fall
    # back to 1 so the division below cannot raise TypeError.
    total_cpus = os.cpu_count() or 1
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # Data layer for training.
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]
    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )
    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # Separate data layers for eval: one DAG per eval dataset, with all
    # other modules shared between DAGs.
    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    # Shared modules (QuartzNet uses the Jasper baseline encoder and decoder).
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **quartz_params["AudioToMelSpectrogramPreprocessor"],
    )
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **quartz_params["JasperEncoder"],
    )
    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Augmentation modules (train-only), created when their configs exist.
    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)
    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # Assemble the train DAG.
    (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer_train()
    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t, length=a_sig_length_t)
    if multiply_batch_config:
        (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch(
            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t,
            in_y_len=transcript_len_t,
        )
    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)
    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t, length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # Train callbacks.
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer,
    )
    callbacks = [train_callback]
    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.load_dir,
            step_freq=args.checkpoint_save_freq,
        )
        callbacks.append(chpt_callback)

    # Assemble the eval DAGs.
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )
        # Corresponding eval callback, tagged by the dataset filename.
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)
    return loss_t, callbacks, steps_per_epoch
def test_contextnet_ctc_training(self):
    """Integration test: instantiate a small ContextNet model and train on the
    sample ASR data for 3 forward/backward steps, asserting the loss
    decreased.

    Covers SE-blocks with fixed context size and global context, plus the
    residual_mode='stride_add' and 'stride_last' flags. Batch gradient
    descent is used (rather than stochastic GD) because of the CTC loss.
    """
    config_path = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "../data/contextnet_32.yaml"))
    with open(config_path) as f:
        contextnet_model_definition = self.yaml.load(f)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=30)
    pre_process_params = {
        'frame_splicing': 1,
        'features': 80,
        'window_size': 0.025,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    spec_aug = nemo_asr.SpectrogramAugmentation(
        **contextnet_model_definition['SpectrogramAugmentation'])
    contextnet_encoder = nemo_asr.ContextNetEncoder(
        feat_in=contextnet_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **contextnet_model_definition['ContextNetEncoder'],
    )
    contextnet_decoder = nemo_asr.ContextNetDecoderForCTC(
        feat_in=32, hidden_size=16, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    processed_signal = spec_aug(input_spec=processed_signal)
    encoded, encoded_len = contextnet_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = contextnet_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )
    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=partial(self.print_and_log_loss, loss_log_list=loss_list),
        step_freq=1)
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 3, "lr": 0.001},
    )
    self.nf.reset_trainer()
    # Assert that training loss went down.
    assert loss_list[-1] < loss_list[0]
def main():
    """Evaluate a Jasper CTC model, optionally tuning an n-gram LM decoder.

    Loads the model from --load_dir, runs greedy decoding over
    --eval_datasets and reports WER. Optionally dumps the per-utterance
    logits (--save_logprob) and, when --lm_path is given, grid-searches
    (alpha, beta) for beam search with a language model, reporting the best
    pair found.
    """
    parser = argparse.ArgumentParser(description='Jasper')
    # model params
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    # run params
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=64, type=int)
    parser.add_argument("--amp_opt_level", default="O1", type=str)
    # store results
    parser.add_argument("--save_logprob", default=None, type=str)
    # lm inference parameters
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument('--alpha',
                        default=2.0,
                        type=float,
                        help='value of LM weight',
                        required=False)
    parser.add_argument(
        '--alpha_max',
        type=float,
        help='maximum value of LM weight (for a grid search in \'eval\' mode)',
        required=False,
    )
    parser.add_argument('--alpha_step',
                        type=float,
                        help='step for LM weight\'s tuning in \'eval\' mode',
                        required=False,
                        default=0.1)
    parser.add_argument('--beta',
                        default=1.5,
                        type=float,
                        help='value of word count weight',
                        required=False)
    parser.add_argument(
        '--beta_max',
        type=float,
        # FIX: help string previously had an unbalanced '(' and a stray
        # line-continuation; closed and cleaned up.
        help='maximum value of word count weight (for a grid search in '
        '\'eval\' mode)',
        required=False,
    )
    parser.add_argument(
        '--beta_step',
        type=float,
        help='step for word count weight\'s tuning in \'eval\' mode',
        required=False,
        default=0.1,
    )
    parser.add_argument("--beam_width", default=128, type=int)

    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=args.amp_opt_level,
        placement=device,
    )
    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']
    eval_datasets = args.eval_datasets

    # Flatten the data-layer config: take the eval overrides, then drop the
    # per-split sub-dicts so the remainder can be splatted as kwargs.
    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )

    N = len(data_layer)
    logging.info('Evaluating {0} examples'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Define inference DAG
    audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1 = data_layer(
    )
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1, predictions_e1, transcript_e1, transcript_len_e1,
        encoded_len_e1
    ]

    # inference
    evaluated_tensors = neural_factory.infer(tensors=eval_tensors,
                                             checkpoint_dir=load_dir)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    wer = word_error_rate(hypotheses=greedy_hypotheses, references=references)
    logging.info("Greedy WER {:.2f}%".format(wer * 100))

    # Convert logits to list of numpy arrays, trimmed to the true encoded
    # length of each utterance.
    logprob = []
    for i, batch in enumerate(evaluated_tensors[0]):
        for j in range(batch.shape[0]):
            logprob.append(
                batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
    if args.save_logprob:
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)

    # language model
    if args.lm_path:
        if args.alpha_max is None:
            args.alpha_max = args.alpha
        # include alpha_max in tuning range
        args.alpha_max += args.alpha_step / 10.0
        if args.beta_max is None:
            args.beta_max = args.beta
        # include beta_max in tuning range
        args.beta_max += args.beta_step / 10.0

        beam_wers = []
        # The beam decoder expects probabilities, not log-probabilities.
        logprobexp = [np.exp(p) for p in logprob]
        for alpha in np.arange(args.alpha, args.alpha_max, args.alpha_step):
            for beta in np.arange(args.beta, args.beta_max, args.beta_step):
                logging.info('================================')
                logging.info(f'Infering with (alpha, beta): ({alpha}, {beta})')
                beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
                    vocab=vocab,
                    beam_width=args.beam_width,
                    alpha=alpha,
                    beta=beta,
                    lm_path=args.lm_path,
                    # BUG FIX: os.cpu_count() may return None, and
                    # max(None, 1) raises TypeError on Python 3.
                    num_cpus=os.cpu_count() or 1,
                    input_tensor=False,
                )
                beam_predictions = beam_search_with_lm(
                    log_probs=logprobexp, log_probs_length=None, force_pt=True)
                # Keep only the top hypothesis string of each beam.
                beam_predictions = [b[0][1] for b in beam_predictions[0]]
                lm_wer = word_error_rate(hypotheses=beam_predictions,
                                         references=references)
                logging.info("Beam WER {:.2f}%".format(lm_wer * 100))
                beam_wers.append(((alpha, beta), lm_wer * 100))

        logging.info('Beam WER for (alpha, beta)')
        logging.info('================================')
        logging.info('\n' + '\n'.join([str(e) for e in beam_wers]))
        logging.info('================================')
        best_beam_wer = min(beam_wers, key=lambda x: x[1])
        logging.info('Best (alpha, beta): '
                     f'{best_beam_wer[0]}, '
                     f'WER: {best_beam_wer[1]:.2f}%')
def test_clas(self): with open('examples/asr/experimental/configs/garnet_an4.yaml') as file: cfg = self.yaml.load(file) dl = nemo_asr.AudioToTextDataLayer( featurizer_config=self.featurizer_config, manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4, ) pre_process_params = { 'int_values': False, 'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512, 'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000, 'normalize': 'per_feature', 'window_stride': 0.01, 'stft_conv': True, } preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor( **pre_process_params) encoder = nemo_asr.JasperEncoder( jasper=cfg['encoder']['jasper'], activation=cfg['encoder']['activation'], feat_in=cfg['input']['train']['features'], ) connector = nemo_asr.JasperRNNConnector( in_channels=cfg['encoder']['jasper'][-1]['filters'], out_channels=cfg['decoder']['hidden_size'], ) decoder = nemo.backends.pytorch.common.DecoderRNN( voc_size=len(self.labels), bos_id=0, **cfg['decoder'] # fictive ) loss = nemo.backends.pytorch.common.SequenceLoss() # DAG audio_signal, a_sig_length, transcripts, transcript_len = dl() processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length) encoded, encoded_len = encoder(audio_signal=processed_signal, length=p_length) encoded = connector(tensor=encoded) log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded) loss = loss(log_probs=log_probs, targets=transcripts) # Train callback = nemo.core.SimpleLossLoggerCallback( tensors=[loss], print_func=lambda x: print(str(x[0].item()))) # Instantiate an optimizer to perform `train` action neural_factory = nemo.core.NeuralModuleFactory( backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False, ) optimizer = neural_factory.get_trainer() optimizer.train( [loss], callbacks=[callback], optimizer="sgd", optimization_params={ "num_epochs": 10, "lr": 0.0003 }, )
def test_tacotron2_training(self):
    """Smoke-test Tacotron2 training.

    Text embedding -> encoder -> teacher-forced decoder -> postnet, with
    mel-spectrogram targets produced from the audio on the fly; trains for
    a few epochs with SGD on the sample data.
    """
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    # Mel extraction configured for TTS-style targets: fixed window/hop in
    # samples, no normalization/preemphasis/dither, pad value -11.52.
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
    )
    text_embedding = nemo_tts.TextEmbedding(len(self.labels), 256)
    t2_enc = nemo_tts.Tacotron2Encoder(
        encoder_n_convolutions=2,
        encoder_kernel_size=5,
        encoder_embedding_dim=256,
    )
    t2_dec = nemo_tts.Tacotron2Decoder(
        n_mel_channels=64,
        n_frames_per_step=1,
        encoder_embedding_dim=256,
        gate_threshold=0.5,
        prenet_dim=128,
        max_decoder_steps=1000,
        decoder_rnn_dim=512,
        p_decoder_dropout=0.1,
        p_attention_dropout=0.1,
        attention_rnn_dim=512,
        attention_dim=64,
        attention_location_n_filters=16,
        attention_location_kernel_size=15,
    )
    t2_postnet = nemo_tts.Tacotron2Postnet(
        n_mel_channels=64,
        postnet_embedding_dim=256,
        postnet_kernel_size=5,
        postnet_n_convolutions=3,
    )
    t2_loss = nemo_tts.Tacotron2Loss()
    makegatetarget = nemo_tts.MakeGate()

    # DAG
    audio, audio_len, transcript, transcript_len = data_layer()
    spec_target, spec_target_len = preprocessing(input_signal=audio,
                                                 length=audio_len)
    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(
        char_phone_embeddings=transcript_embedded,
        embedding_length=transcript_len,
    )
    # Teacher-forced decoding against the ground-truth mel target.
    mel_decoder, gate, _ = t2_dec(
        char_phone_encoded=transcript_encoded,
        encoded_length=transcript_len,
        mel_target=spec_target,
    )
    mel_postnet = t2_postnet(mel_input=mel_decoder)
    gate_target = makegatetarget(mel_target=spec_target,
                                 target_len=spec_target_len)
    loss_t = t2_loss(
        mel_out=mel_decoder,
        mel_out_postnet=mel_postnet,
        gate_out=gate,
        mel_target=spec_target,
        gate_target=gate_target,
        target_len=spec_target_len,
        seq_len=audio_len,
    )
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t],
        print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'
                                          ),
    )
    # Instantiate an optimizer to perform `train` action
    optimizer = nemo.backends.pytorch.actions.PtActions()
    optimizer.train(
        [loss_t],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={
            "num_epochs": 10,
            "lr": 0.0003
        },
    )
def main():
    """Train (or fine-tune) a convolutional CTC speech-to-text model.

    Parses command-line arguments, instantiates the model either from a
    .yaml config (train from scratch) or a pre-trained checkpoint
    (fine-tune), builds the training DAG, attaches logging / checkpoint /
    W&B / evaluation callbacks as requested, and trains with a
    cosine-annealing LR schedule.
    """
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help=
        "Pass: '******', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En' to train from pre-trained models. To train from scratch pass path to modelfile ending with .yaml.",
    )
    parser.add_argument(
        "--amp_opt_level",
        default="O0",
        type=str,
        choices=["O0", "O1", "O2", "O3"],
        help="See: https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--train_dataset",
                        type=str,
                        required=True,
                        default=None,
                        help="training dataset path")
    parser.add_argument("--eval_datasets",
                        type=str,
                        nargs="*",
                        help="evaluation datasets paths")
    parser.add_argument("--eval_freq",
                        default=1000,
                        type=int,
                        help="Evaluation frequency")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=8,
                        help="batch size to use for evaluation")
    parser.add_argument("--local_rank",
                        default=None,
                        type=int,
                        help="node rank for distributed training")
    parser.add_argument("--stats_freq",
                        default=25,
                        type=int,
                        help="frequency with which to update train stats")
    parser.add_argument("--checkpoint_dir",
                        default=None,
                        type=str,
                        help="Folder where to save checkpoints")
    parser.add_argument("--checkpoint_save_freq",
                        required=False,
                        type=int,
                        help="how often to checkpoint")
    parser.add_argument("--optimizer", default="novograd", type=str)
    parser.add_argument("--warmup_ratio",
                        default=0.02,
                        type=float,
                        help="learning rate warmup ratio")
    parser.add_argument("--batch_size",
                        required=True,
                        type=int,
                        help="train batch size per GPU")
    parser.add_argument("--num_epochs",
                        default=5,
                        type=int,
                        help="number of epochs to train")
    parser.add_argument("--lr", default=0.01, type=float)
    parser.add_argument("--beta1", default=0.95, type=float)
    parser.add_argument("--beta2", default=0.5, type=float)
    parser.add_argument("--weight_decay", default=0.001, type=float)
    parser.add_argument("--iter_per_step",
                        default=1,
                        type=int,
                        help="number of grad accumulations per batch")
    parser.add_argument("--wandb_exp_name", default=None, type=str)
    parser.add_argument("--wandb_project", default=None, type=str)
    parser.add_argument("--max_train_audio_len",
                        default=16.7,
                        type=float,
                        help="max audio length")
    # store_false: the flag's *absence* yields True, so the value can be
    # passed straight through as trim_silence / normalize_transcripts below.
    parser.add_argument("--do_not_trim_silence",
                        action="store_false",
                        help="Add this flag to disable silence trimming")
    parser.add_argument(
        "--do_not_normalize_text",
        action="store_false",
        help="Add this flag to set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory(
        local_rank=args.
        local_rank,  # This is necessary for distributed training
        optimization_level=args.
        amp_opt_level,  # This is necessary for mixed precision optimization
        cudnn_benchmark=True,
    )

    # Instantiate the model which we'll train
    if args.asr_model.endswith('.yaml'):
        logging.info(
            f"Speech2Text: Will train from scratch using config from {args.asr_model}"
        )
        asr_model = nemo_asr.models.ASRConvCTCModel.import_from_config(
            args.asr_model)
    else:
        logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
        asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
            model_info=args.asr_model, local_rank=args.local_rank)

    # Chinese ('-Zh') models are scored by character error rate, not WER.
    if args.asr_model.strip().endswith('-Zh'):
        logging.info('USING CER')
        eval_metric = 'CER'
    else:
        eval_metric = 'WER'

    logging.info("\n\n")
    logging.info(f"Speech2Text: Training on {nf.world_size} GPUs.")
    logging.info(f"Training {type(asr_model)} model.")
    logging.info(f"Training CTC model with alphabet {asr_model.vocabulary}.")
    logging.info(
        f"Training CTC model with {asr_model.num_weights} weights.\n\n")

    train_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=asr_model.vocabulary,
        batch_size=args.batch_size,
        trim_silence=args.do_not_trim_silence,
        max_duration=args.max_train_audio_len,
        shuffle=True,
        normalize_transcripts=args.do_not_normalize_text,
    )
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(asr_model.vocabulary))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training DAG: data -> model -> greedy decode + CTC loss.
    audio_signal, audio_signal_len, transcript, transcript_len = train_data_layer(
    )
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs,
                    targets=transcript,
                    input_length=encoded_len,
                    target_length=transcript_len)

    # Callbacks which we'll be using:
    callbacks = []

    # SimpleLossLogger prints basic training stats (e.g. loss) to console
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        step_freq=args.stats_freq,
        print_func=partial(monitor_asr_train_progress,
                           labels=asr_model.vocabulary,
                           eval_metric=eval_metric),
    )
    callbacks.append(train_callback)

    if args.checkpoint_dir is not None and args.checkpoint_save_freq is not None:
        # Checkpoint callback saves checkpoints periodically
        checkpointer_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir, step_freq=args.checkpoint_save_freq)
        callbacks.append(checkpointer_callback)

    if args.wandb_exp_name is not None and args.wandb_project is not None:
        # WandbCallback saves stats to Weights&Biases
        wandb_callback = nemo.core.WandBLogger(
            step_freq=args.stats_freq,
            wandb_name=args.wandb_exp_name,
            wandb_project=args.wandb_project,
            args=args)
        callbacks.append(wandb_callback)

    # Evaluation: one EvaluatorCallback per eval dataset, reusing the same
    # model/decoder/loss modules as the training DAG.
    if args.eval_datasets is not None and args.eval_freq is not None:
        asr_model.eval()  # switch model to evaluation mode
        logging.info(f"Will perform evaluation every {args.eval_freq} steps.")
        for ind, eval_dataset in enumerate(args.eval_datasets):
            eval_data_layer = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                labels=asr_model.vocabulary,
                batch_size=args.eval_batch_size,
                normalize_transcripts=args.do_not_normalize_text,
            )
            audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer(
            )
            log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                               length=audio_signal_len)
            eval_predictions = greedy_decoder(log_probs=log_probs)
            eval_loss = ctc_loss(log_probs=log_probs,
                                 targets=transcript,
                                 input_length=encoded_len,
                                 target_length=transcript_len)
            # Tag each eval stream by its manifest's basename.
            tag_name = os.path.basename(eval_dataset).split(".")[0]
            eval_callback = nemo.core.EvaluatorCallback(
                eval_tensors=[
                    eval_loss, eval_predictions, transcript, transcript_len
                ],
                user_iter_callback=partial(process_evaluation_batch,
                                           labels=asr_model.vocabulary),
                user_epochs_done_callback=partial(process_evaluation_epoch,
                                                  tag=tag_name,
                                                  eval_metric=eval_metric),
                eval_step=args.eval_freq,
                wandb_name=args.wandb_exp_name,
                wandb_project=args.wandb_project,
            )
            callbacks.append(eval_callback)

    steps_in_epoch = len(train_data_layer) / (
        args.batch_size * args.iter_per_step * nf.world_size)
    lr_policy = CosineAnnealing(total_steps=args.num_epochs * steps_in_epoch,
                                warmup_ratio=args.warmup_ratio)
    nf.train(
        tensors_to_optimize=[loss],
        callbacks=callbacks,
        optimizer=args.optimizer,
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "betas": (args.beta1, args.beta2),
            "weight_decay": args.weight_decay,
        },
        batches_per_step=args.iter_per_step,
        lr_policy=lr_policy,
    )
def create_dags(jasper_params, args, nf):
    """Build the training and evaluation DAGs for a Jasper CTC model.

    Args:
        jasper_params: parsed model config (labels, data layer, preprocessor,
            encoder and decoder sections).
        args: parsed command-line arguments (dataset paths, batch sizes,
            step frequencies, etc.).
        nf: NeuralModuleFactory providing world_size, tb_writer and
            checkpoint_dir.

    Returns:
        Tuple (train loss tensor, eval tensors, callbacks, total_steps,
        vocab, eval log-probs tensor, eval encoded-length tensor).
    """
    vocab = jasper_params['labels']

    # build train and eval model: take the split-specific overrides, then
    # drop the per-split sub-dicts so the remainder can be splatted.
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params,
    )

    num_samples = len(data_layer)
    steps_per_epoch = math.ceil(
        num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
    total_steps = steps_per_epoch * args.num_epochs
    # BUG FIX: logging.info() is not print(); the original print-style call
    # passed the values as unused %-format args so they were never rendered.
    logging.info("Train samples=%d num_steps=%d", num_samples, total_steps)

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_params["AudioToMelSpectrogramPreprocessor"])
    # data_augmentation = nemo_asr.SpectrogramAugmentation(
    #     **jasper_params['SpectrogramAugmentation']
    # )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params,
    )

    num_samples = len(data_layer_eval)
    logging.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        num_classes=len(vocab), **jasper_params["JasperDecoderForCTC"])
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio,
                                                 length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed,
                                          length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Evaluation model (same modules, separate data layer/tensors)
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e,
                                                     length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e,
                                              length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(
        log_probs=log_probs_e,
        targets=transcript_e,
        input_length=encoded_len_e,
        target_length=transcript_len_e,
    )
    logging.info("Num of params in encoder: {0}".format(
        jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=nf.tb_writer,
    )
    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)
    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer,
    )
    callbacks = [train_callback, checkpointer_callback, eval_callback]
    return (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        vocab,
        log_probs_e,
        encoded_len_e,
    )
def main():
    """Evaluate a pre-trained convolutional CTC ASR model on a dataset.

    Reports greedy WER (CER for '-Zh' Chinese models) and, when
    --wer_target is given, raises ValueError if the measured value exceeds
    target * tolerance.
    """
    # Usage and Command line arguments
    parser = ArgumentParser()
    parser.add_argument(
        "--asr_model",
        type=str,
        default="QuartzNet15x5-En",
        required=True,
        help=
        "Pass: '******', 'QuartzNet15x5-Zh', or 'JasperNet10x5-En'",
    )
    parser.add_argument("--dataset",
                        type=str,
                        required=True,
                        help="path to evaluation data")
    parser.add_argument("--eval_batch_size",
                        type=int,
                        default=1,
                        help="batch size to use for evaluation")
    parser.add_argument("--wer_target",
                        type=float,
                        default=None,
                        help="used by test")
    parser.add_argument("--wer_tolerance",
                        type=float,
                        default=1.0,
                        help="used by test")
    parser.add_argument("--trim_silence",
                        default=True,
                        type=bool,
                        help="trim audio from silence or not")
    parser.add_argument(
        "--normalize_text",
        default=True,
        type=bool,
        help="Normalize transcripts or not. Set to False for non-English.")
    args = parser.parse_args()

    # Setup NeuralModuleFactory to control training
    # instantiate Neural Factory with supported backend
    nf = nemo.core.NeuralModuleFactory()

    # Instantiate the model which we'll evaluate
    logging.info(f"Speech2Text: Will fine-tune from {args.asr_model}")
    asr_model = nemo_asr.models.ASRConvCTCModel.from_pretrained(
        model_info=args.asr_model)
    asr_model.eval()

    logging.info("\n\n")
    logging.info(f"Evaluation using {type(asr_model)} model.")
    logging.info(f"Evaluation using alphabet {asr_model.vocabulary}.")
    logging.info(f"The model has {asr_model.num_weights} weights.\n\n")

    eval_data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.dataset,
        labels=asr_model.vocabulary,
        batch_size=args.eval_batch_size,
        trim_silence=args.trim_silence,
        shuffle=False,
        normalize_transcripts=args.normalize_text,
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Inference DAG: data -> model -> greedy decode
    audio_signal, audio_signal_len, transcript, transcript_len = eval_data_layer(
    )
    log_probs, encoded_len = asr_model(input_signal=audio_signal,
                                       length=audio_signal_len)
    predictions = greedy_decoder(log_probs=log_probs)

    # inference
    eval_tensors = [
        log_probs, predictions, transcript, transcript_len, encoded_len
    ]
    evaluated_tensors = nf.infer(tensors=eval_tensors)

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1],
                                                 asr_model.vocabulary)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3],
                                          asr_model.vocabulary)
    # Chinese ('-Zh') models are scored by character error rate.
    if args.asr_model.strip().endswith('-Zh'):
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=True)
        metric = 'CER'
    else:
        val = word_error_rate(hypotheses=greedy_hypotheses,
                              references=references,
                              use_cer=False)
        metric = 'WER'
    logging.info(f"Greedy {metric} = {val}")

    if args.wer_target is not None:
        # BUG FIX: the original compared against an undefined name `wer`,
        # which raised NameError; the computed metric is `val`.
        if args.wer_target * args.wer_tolerance < val:
            raise ValueError(
                f"Resulting WER {val} is higher than the target {args.wer_target}"
            )
def test_stft_conv_training(self):
    """Integration test that instantiates a small Jasper model and tests
    training with the sample asr data.

    test_stft_conv_training tests the torch_stft path while
    test_jasper_training tests the torch.stft path inside of
    AudioToMelSpectrogramPreprocessor.

    Training is run for 3 forward and backward steps and asserts that loss
    after 3 steps is smaller than the loss at the first step.

    Note: Training is done with batch gradient descent as opposed to
    stochastic gradient descent due to CTC loss
    """
    with open(
            os.path.abspath(
                os.path.join(os.path.dirname(__file__),
                             "../data/jasper_smaller.yaml"))) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=30)
    # 'stft_conv': True selects the convolution-based STFT implementation,
    # which is the code path this test is meant to exercise.
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
        'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition[
            'AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024,
                                                  num_classes=len(
                                                      self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                               length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=p_length)
    # logging.info(jasper_encoder)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )
    loss_list = []
    # step_freq=1 so loss_list captures every step for the final assertion.
    callback = SimpleLossLoggerCallback(tensors=[loss],
                                        print_func=partial(
                                            self.print_and_log_loss,
                                            loss_log_list=loss_list),
                                        step_freq=1)
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={
            "max_steps": 3,
            "lr": 0.001
        },
    )
    self.nf.reset_trainer()
    # Assert that training loss went down
    assert loss_list[-1] < loss_list[0]
def test_jasper_eval(self): with open( os.path.abspath( os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))) as file: jasper_model_definition = self.yaml.load(file) dl = nemo_asr.AudioToTextDataLayer( manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4, ) pre_process_params = { 'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512, 'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000, 'normalize': 'per_feature', 'window_stride': 0.01, } preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor( **pre_process_params) jasper_encoder = nemo_asr.JasperEncoder( feat_in=jasper_model_definition[ 'AudioToMelSpectrogramPreprocessor']['features'], **jasper_model_definition['JasperEncoder'], ) jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len( self.labels)) ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels)) greedy_decoder = nemo_asr.GreedyCTCDecoder() # DAG audio_signal, a_sig_length, transcript, transcript_len = dl() processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length) encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length) # logging.info(jasper_encoder) log_probs = jasper_decoder(encoder_output=encoded) loss = ctc_loss( log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len, ) predictions = greedy_decoder(log_probs=log_probs) from nemo.collections.asr.helpers import ( process_evaluation_batch, process_evaluation_epoch, ) eval_callback = nemo.core.EvaluatorCallback( eval_tensors=[loss, predictions, transcript, transcript_len], user_iter_callback=lambda x, y: process_evaluation_batch( x, y, labels=self.labels), user_epochs_done_callback=process_evaluation_epoch, ) # Instantiate an optimizer to perform `train` action self.nf.eval(callbacks=[eval_callback])
def create_dag(args, cfg, logger, num_gpus):
    """Build training and evaluation DAGs for the Garnet-style
    encoder-decoder ASR model (Jasper encoder + attention RNN decoder),
    together with the callbacks that drive training.

    Args:
        args: parsed command-line arguments (dataset paths, checkpoints,
            eval frequency, local rank).
        cfg: parsed YAML config; also mutated in place with derived values
            (num_params, steps_per_epoch, total_steps).
        logger: logger used for progress messages.
        num_gpus: number of GPUs, used to derive steps per epoch.

    Returns:
        ((train_loss, evals), cfg, callbacks) where evals pairs each eval
        dataset path with its output tensors.
    """
    # Defining nodes
    data = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=cfg['target']['labels'],
        batch_size=cfg['optimization']['batch_size'],
        eos_id=cfg['target']['eos_id'],
        **cfg['AudioToTextDataLayer']['train'],
    )
    data_evals = []
    if args.eval_datasets:
        for val_path in args.eval_datasets:
            data_evals.append(
                nemo_asr.AudioToTextDataLayer(
                    manifest_filepath=val_path,
                    labels=cfg['target']['labels'],
                    batch_size=cfg['inference']['batch_size'],
                    eos_id=cfg['target']['eos_id'],
                    **cfg['AudioToTextDataLayer']['eval'],
                ))
    else:
        logger.info("There were no val datasets passed")
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **cfg['AudioToMelSpectrogramPreprocessor'])
    data_augmentation = nemo_asr.SpectrogramAugmentation(
        **cfg['SpectrogramAugmentation'])
    encoder = nemo_asr.JasperEncoder(
        feat_in=cfg["AudioToMelSpectrogramPreprocessor"]["features"],
        **cfg['JasperEncoder'],
    )
    # Optionally warm-start and/or freeze the encoder from a checkpoint.
    if args.encoder_checkpoint is not None and os.path.exists(
            args.encoder_checkpoint):
        if cfg['JasperEncoder']['load']:
            encoder.restore_from(args.encoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for encoder'
                        f' from {args.encoder_checkpoint}')
        if cfg['JasperEncoder']['freeze']:
            encoder.freeze()
            logger.info(f'Freeze encoder weights')
    # Bridges the conv encoder's channel count to the decoder hidden size.
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['JasperEncoder']['jasper'][-1]['filters'],
        out_channels=cfg['DecoderRNN']['hidden_size'],
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN'],
    )
    # Optionally warm-start and/or freeze the decoder; when frozen, the
    # attention parameters can be selectively unfrozen via config.
    if args.decoder_checkpoint is not None and os.path.exists(
            args.decoder_checkpoint):
        if cfg['DecoderRNN']['load']:
            decoder.restore_from(args.decoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for decoder'
                        f' from {args.decoder_checkpoint}')
        if cfg['DecoderRNN']['freeze']:
            decoder.freeze()
            logger.info(f'Freeze decoder weights')
            if cfg['decoder']['unfreeze_attn']:
                for name, param in decoder.attention.named_parameters():
                    param.requires_grad = True
                logger.info(f'Unfreeze decoder attn weights')

    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = int(num_data / (batch_size * num_gpus))
    total_steps = num_epochs * steps_per_epoch
    vsc = ValueSetterCallback
    # Teacher forcing ratio held constant at 1.0 for the whole run.
    tf_callback = ValueSetterCallback(
        decoder,
        'teacher_forcing',
        policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0)],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef'],
        sample_wise=cfg['optimization']['sample_wise'],
    )
    # Smoothing coefficient likewise held constant over training.
    se_callback = ValueSetterCallback(
        seq_loss,
        'smoothing_coef',
        policies=[
            vsc.Policy(vsc.Method.Const(seq_loss.smoothing_coef),
                       start=0.0,
                       end=1.0),
        ],
        total_steps=total_steps,
    )
    beam_search = nemo.backends.pytorch.BeamSearch(
        decoder=decoder,
        pad_id=cfg['target']['pad_id'],
        bos_id=cfg['target']['bos_id'],
        eos_id=cfg['target']['eos_id'],
        max_len=cfg['target']['max_len'],
        beam_size=cfg['inference']['beam_size'],
    )
    uf_callback = UnfreezeCallback(
        [encoder, decoder],
        start_epoch=cfg['optimization']['start_unfreeze'])
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[encoder, connector, decoder],
        folder=args.checkpoint_dir,
        step_freq=args.eval_freq,
    )

    # Creating DAG
    audios, audio_lens, transcripts, _ = data()
    processed_audios, processed_audio_lens = data_preprocessor(
        input_signal=audios, length=audio_lens)
    augmented_spec = data_augmentation(input_spec=processed_audios)
    encoded, _ = encoder(audio_signal=augmented_spec,
                         length=processed_audio_lens)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
    train_loss = seq_loss(log_probs=log_probs, targets=transcripts)

    # Per-dataset eval DAGs: no spectrogram augmentation, and beam-search
    # predictions are added alongside the teacher-forced loss.
    evals = []
    for i, data_eval in enumerate(data_evals):
        audios, audio_lens, transcripts, _ = data_eval()
        processed_audios, processed_audio_lens = data_preprocessor(
            input_signal=audios, length=audio_lens)
        encoded, _ = encoder(audio_signal=processed_audios,
                             length=processed_audio_lens)
        encoded = connector(tensor=encoded)
        log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
        loss = seq_loss(log_probs=log_probs, targets=transcripts)
        predictions, aw = beam_search(encoder_outputs=encoded)
        evals.append((
            args.eval_datasets[i],
            (loss, log_probs, transcripts, predictions, aw),
        ))

    # Update config
    cfg['num_params'] = {
        'encoder': encoder.num_weights,
        'connector': connector.num_weights,
        'decoder': decoder.num_weights,
    }
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (
        (train_loss, evals),
        cfg,
        [tf_callback, se_callback, uf_callback, saver_callback],
    )
def main():
    """Evaluate a Jasper ASR checkpoint on a dataset, reporting greedy CER
    (and optionally beam-search-with-LM CER), with optional log-prob dumping.

    Command-line driven; see the argparse definitions below for all options.
    Side effects: logs metrics, and writes a pickle of per-utterance log-prob
    arrays when --save_logprob is given.
    """
    parser = argparse.ArgumentParser(description='Jasper')
    parser.add_argument("--local_rank", default=None, type=int)
    parser.add_argument("--batch_size", default=32, type=int)
    parser.add_argument("--model_config", type=str, required=True)
    parser.add_argument("--eval_datasets", type=str, required=True)
    parser.add_argument("--load_dir", type=str, required=True)
    parser.add_argument("--vocab_file", type=str, required=True)
    parser.add_argument("--save_logprob", default=None, type=str)
    parser.add_argument("--lm_path", default=None, type=str)
    parser.add_argument("--beam_width", default=50, type=int)
    parser.add_argument("--alpha", default=2.0, type=float)
    parser.add_argument("--beta", default=1.0, type=float)
    parser.add_argument("--cutoff_prob", default=0.99, type=float)
    parser.add_argument("--cutoff_top_n", default=40, type=int)
    args = parser.parse_args()
    batch_size = args.batch_size
    load_dir = args.load_dir

    # Multi-GPU placement is selected by the presence of --local_rank; the
    # LM beam-search decoder only supports single-GPU evaluation.
    if args.local_rank is not None:
        if args.lm_path:
            raise NotImplementedError(
                "Beam search decoder with LM does not currently support evaluation on multi-gpu."
            )
        device = nemo.core.DeviceType.AllGpu
    else:
        device = nemo.core.DeviceType.GPU

    # Instantiate Neural Factory with supported backend
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        optimization_level=nemo.core.Optimization.mxprO1,
        placement=device,
    )
    if args.local_rank is not None:
        logging.info('Doing ALL GPU')

    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']
    eval_datasets = args.eval_datasets

    # Build eval data-layer kwargs from the shared config section plus its
    # "eval" overrides; the nested train/eval sub-dicts must be stripped
    # before being splatted into the constructor.
    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=eval_datasets,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=batch_size,
        **eval_dl_params,
    )
    n = len(data_layer)
    logging.info('Evaluating {0} examples'.format(n))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    if args.lm_path:
        beam_width = args.beam_width
        alpha = args.alpha
        beta = args.beta
        cutoff_prob = args.cutoff_prob
        cutoff_top_n = args.cutoff_top_n
        beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=vocab,
            beam_width=beam_width,
            alpha=alpha,
            beta=beta,
            cutoff_prob=cutoff_prob,
            cutoff_top_n=cutoff_top_n,
            lm_path=args.lm_path,
            # os.cpu_count() may return None (fix: `or 1` guards the
            # TypeError that max(None, 1) would raise).
            num_cpus=max(os.cpu_count() or 1, 1),
        )

    logging.info('================================')
    logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(f"Total number of parameters in model: "
                 f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # Evaluation DAG: audio -> mel features -> encoder -> CTC log-probs.
    (
        audio_signal_e1,
        a_sig_length_e1,
        transcript_e1,
        transcript_len_e1,
    ) = data_layer()
    processed_signal_e1, p_length_e1 = data_preprocessor(
        input_signal=audio_signal_e1, length=a_sig_length_e1)
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)
    eval_tensors = [
        log_probs_e1,
        predictions_e1,
        transcript_e1,
        transcript_len_e1,
        encoded_len_e1,
    ]
    if args.lm_path:
        beam_predictions_e1 = beam_search_with_lm(
            log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
        eval_tensors.append(beam_predictions_e1)

    evaluated_tensors = neural_factory.infer(
        tensors=eval_tensors,
        checkpoint_dir=load_dir,
    )

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    cer = word_error_rate(hypotheses=greedy_hypotheses,
                          references=references,
                          use_cer=True)
    logging.info("Greedy CER {:.2f}%".format(cer * 100))

    if args.lm_path:
        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples; each entry is (score, text), so take the text of
            # the top beam.
            for j in i:
                beam_hypotheses.append(j[0][1])
        cer = word_error_rate(hypotheses=beam_hypotheses,
                              references=references,
                              use_cer=True)
        # Fix: added the missing '%' so the message matches the greedy-CER
        # log line (the value is already scaled by 100).
        logging.info("Beam CER {:.2f}%".format(cer * 100))

    if args.save_logprob:
        # Convert logits to list of numpy arrays, trimming each utterance to
        # its true encoded length before dumping.
        logprob = []
        for i, batch in enumerate(evaluated_tensors[0]):
            for j in range(batch.shape[0]):
                logprob.append(
                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
def test_fastspeech(self):
    """End-to-end smoke test for FastSpeech training.

    Phase 1: run a Jasper-style data layer + mel preprocessor over the test
    manifest and synthesize per-utterance duration arrays, saved as .npy
    files under tests/data/asr/durs.
    Phase 2: feed those durations back through FastSpeechDataLayer and train
    FastSpeech for a few SGD epochs, logging the loss.
    """
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=None,
        create_tb_writer=False,
    )
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=1,
        shuffle=False,
        sample_rate=16000,
    )
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        window_size=None,
        window_stride=None,
        n_window_size=512,
        n_window_stride=128,
        normalize=None,
        preemph=None,
        dither=0,
        mag_power=1.0,
        pad_value=-11.52,
        pad_to=0,
    )
    data = data_layer()
    spec, spec_length = data_preprocessor(input_signal=data.audio_signal,
                                          length=data.a_sig_length)

    # Creates and saves durations as numpy arrays.
    durs_dir = pathlib.Path('tests/data/asr/durs')
    durs_dir.mkdir(exist_ok=True)
    result = neural_factory.infer(
        [data.transcripts, data.transcript_length, spec_length, spec])
    k = -1
    for text, text_len, mel_len, mel in zip(result[0], result[1], result[2],
                                            result[3]):
        text = text.cpu().numpy()[0][:text_len.cpu().numpy()[0]]
        # Fix: np.long was deprecated in NumPy 1.20 and removed in 1.24;
        # np.int64 keeps the saved dtype explicit and platform-independent.
        dur = np.zeros(text.shape[0], dtype=np.int64)
        dur_sum = mel_len.cpu().numpy()[0] + 1  # TODO: delete `+1`
        # Dummy durations: put nearly all frames on the first symbol and a
        # fixed 4 frames on the second.
        dur[0] = dur_sum - 4
        dur[1] = 4
        k += 1
        np.save(durs_dir / f'{k}.npy', dur, allow_pickle=False)

    data_layer = nemo_tts.FastSpeechDataLayer(
        manifest_filepath=self.manifest_filepath,
        durs_dir=durs_dir,
        labels=self.labels,
        batch_size=4,
        sample_rate=16000,
    )
    fastspeech = nemo_tts.FastSpeech(
        decoder_output_size=384,
        n_mels=64,
        max_seq_len=2048,
        word_vec_dim=384,
        encoder_n_layer=6,
        encoder_head=2,
        encoder_conv1d_filter_size=1536,
        decoder_n_layer=6,
        decoder_head=2,
        decoder_conv1d_filter_size=1536,
        fft_conv1d_kernel=3,
        fft_conv1d_padding=1,
        encoder_output_size=384,
        duration_predictor_filter_size=256,
        duration_predictor_kernel_size=3,
        dropout=0.1,
        alpha=1.0,
        n_src_vocab=len(self.labels),
        pad_id=0,
    )
    loss = nemo_tts.FastSpeechLoss()

    data = data_layer()
    mel_true, _ = data_preprocessor(input_signal=data.audio,
                                    length=data.audio_len)
    mel_pred, dur_pred = fastspeech(
        text=data.text,
        text_pos=data.text_pos,
        mel_true=mel_true,
        dur_true=data.dur_true,
    )
    loss_t = loss(
        mel_true=mel_true,
        mel_pred=mel_pred,
        dur_true=data.dur_true,
        dur_pred=dur_pred,
        text_pos=data.text_pos,
    )

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t],
        print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
    )
    optimizer = neural_factory.get_trainer()
    optimizer.train(
        [loss_t],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={
            "num_epochs": 3,
            "lr": 0.0003
        },
    )
def create_train_dag(
    neural_factory,
    neural_modules,
    tacotron2_params,
    train_dataset,
    batch_size,
    log_freq,
    checkpoint_save_freq,
    cpu_per_dl=1,
):
    """Assemble the Tacotron2 training DAG and its logging/checkpoint callbacks.

    Args:
        neural_factory: factory providing world_size, tb_writer and
            checkpoint_dir.
        neural_modules: 7-tuple of already-instantiated modules
            (preprocessor, embedding, encoder, decoder, postnet, loss,
            gate-target maker).
        tacotron2_params: parsed model config; must contain 'labels' and the
            "AudioToTextDataLayer" section with a "train" sub-dict.
        train_dataset: manifest path for the training data layer.
        batch_size: per-GPU batch size.
        log_freq: step frequency for image logging to TensorBoard.
        checkpoint_save_freq: step frequency for checkpointing.
        cpu_per_dl: dataloader worker count.

    Returns:
        (loss_tensor, callbacks, steps_per_epoch).
    """
    (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, t2_loss,
     makegatetarget) = neural_modules
    # Merge the shared data-layer config with its "train" overrides, then
    # strip the nested sub-dicts before splatting into the constructor.
    train_dl_params = copy.deepcopy(tacotron2_params["AudioToTextDataLayer"])
    train_dl_params.update(tacotron2_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=train_dataset,
        labels=tacotron2_params['labels'],
        # bos/eos/pad ids are appended after the label vocabulary.
        bos_id=len(tacotron2_params['labels']),
        eos_id=len(tacotron2_params['labels']) + 1,
        pad_id=len(tacotron2_params['labels']) + 2,
        batch_size=batch_size,
        num_workers=cpu_per_dl,
        **train_dl_params,
    )
    N = len(data_layer)
    # Global batch is batch_size * world_size (data-parallel training).
    steps_per_epoch = math.ceil(N / (batch_size * neural_factory.world_size))
    logging.info(f'Have {N} examples to train on.')
    # Train DAG: audio -> mel target; transcript -> embedding -> encoder ->
    # decoder (teacher-forced on the mel target) -> postnet.
    audio, audio_len, transcript, transcript_len = data_layer()
    spec_target, spec_target_len = data_preprocessor(input_signal=audio,
                                                     length=audio_len)
    transcript_embedded = text_embedding(char_phone=transcript)
    transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded,
                                embedding_length=transcript_len)
    mel_decoder, gate, alignments = t2_dec(
        char_phone_encoded=transcript_encoded,
        encoded_length=transcript_len,
        mel_target=spec_target,
    )
    mel_postnet = t2_postnet(mel_input=mel_decoder)
    gate_target = makegatetarget(mel_target=spec_target,
                                 target_len=spec_target_len)
    # Combined mel (pre/post-net) + stop-gate loss.
    loss_t = t2_loss(
        mel_out=mel_decoder,
        mel_out_postnet=mel_postnet,
        gate_out=gate,
        mel_target=spec_target,
        gate_target=gate_target,
        target_len=spec_target_len,
        seq_len=audio_len,
    )
    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[
            loss_t, spec_target, mel_postnet, gate, gate_target, alignments
        ],
        print_func=lambda x: logging.info(f"Loss: {x[0].data}"),
        log_to_tb_func=partial(tacotron2_log_to_tb_func,
                               log_images=True,
                               log_images_freq=log_freq),
        tb_writer=neural_factory.tb_writer,
    )
    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir, step_freq=checkpoint_save_freq)
    callbacks = [train_callback, chpt_callback]
    return loss_t, callbacks, steps_per_epoch
def create_all_dags(args, neural_factory):
    """Build the Jasper CTC training DAG plus one eval DAG per eval dataset.

    Args:
        args: parsed CLI namespace; uses model_config, vocab_file,
            train_dataset, batch_size, num_gpus, eval_datasets,
            eval_batch_size, train_eval_freq, eval_freq,
            checkpoint_save_freq.
        neural_factory: factory providing world_size, checkpoint_dir and
            tb_writer.

    Returns:
        (train_loss_tensor, callbacks, steps_per_epoch).
    """
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    # Merge the shared data-layer config with its "train" overrides, then
    # strip the nested sub-dicts before splatting into the constructor.
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    train_dl_params["normalize_transcripts"] = False

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )
    N = len(data_layer)
    # Global batch is batch_size * num_gpus (data-parallel training).
    steps_per_epoch = int(N / (args.batch_size * args.num_gpus))
    nemo.logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )
    # Optional batch-multiplication / SpecAugment stages, enabled only when
    # their config sections are present.
    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)
    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    # Same merge-and-strip dance for the eval data-layer config.
    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []
    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    nemo.logging.info('================================')
    nemo.logging.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    nemo.logging.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    nemo.logging.info(
        f"Total number of parameters in model: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    nemo.logging.info('================================')

    # Train DAG
    (
        audio_signal_t,
        a_sig_length_t,
        transcript_t,
        transcript_len_t,
    ) = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        (
            processed_signal_t,
            p_length_t,
            transcript_t,
            transcript_len_t,
        ) = multiply_batch(
            in_x=processed_signal_t,
            in_x_len=p_length_t,
            in_y=transcript_t,
            in_y_len=transcript_len_t,
        )
    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t,
                                              length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress,
                           labels=vocab,
                           eval_metric='CER'),
        step_freq=args.train_eval_freq,
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )
    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        step_freq=args.checkpoint_save_freq,
    )
    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (
            audio_signal_e,
            a_sig_length_e,
            transcript_e,
            transcript_len_e,
        ) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback; tag each callback with the
        # dataset's manifest basename so metrics are distinguishable.
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e,
                predictions_e,
                transcript_e,
                transcript_len_e,
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              eval_metric='CER',
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)
    return loss_t, callbacks, steps_per_epoch
def create_eval_dags(
    neural_factory,
    neural_modules,
    tacotron2_params,
    eval_datasets,
    eval_batch_size,
    eval_freq,
    cpu_per_dl=1,
):
    """Assemble one Tacotron2 evaluation DAG + EvaluatorCallback per dataset.

    Args:
        neural_factory: factory providing tb_writer.
        neural_modules: 7-tuple of already-instantiated modules
            (preprocessor, embedding, encoder, decoder, postnet, loss,
            gate-target maker) — same layout as create_train_dag.
        tacotron2_params: parsed model config; must contain 'labels' and the
            "AudioToTextDataLayer" section with an "eval" sub-dict.
        eval_datasets: iterable of manifest paths, one DAG built per entry.
        eval_batch_size: batch size for each eval data layer.
        eval_freq: step frequency at which evaluation runs.
        cpu_per_dl: dataloader worker count.

    Returns:
        List of EvaluatorCallback instances, one per dataset.
    """
    (data_preprocessor, text_embedding, t2_enc, t2_dec, t2_postnet, t2_loss,
     makegatetarget) = neural_modules
    # Merge the shared data-layer config with its "eval" overrides, then
    # strip the nested sub-dicts before splatting into the constructor.
    eval_dl_params = copy.deepcopy(tacotron2_params["AudioToTextDataLayer"])
    eval_dl_params.update(tacotron2_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    callbacks = []
    # assemble eval DAGs
    for eval_dataset in eval_datasets:
        data_layer_eval = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=eval_dataset,
            labels=tacotron2_params['labels'],
            # bos/eos/pad ids are appended after the label vocabulary,
            # mirroring the training data layer.
            bos_id=len(tacotron2_params['labels']),
            eos_id=len(tacotron2_params['labels']) + 1,
            pad_id=len(tacotron2_params['labels']) + 2,
            batch_size=eval_batch_size,
            num_workers=cpu_per_dl,
            **eval_dl_params,
        )
        # Eval DAG: identical wiring to the training DAG (teacher-forced
        # decoder against the mel target).
        audio, audio_len, transcript, transcript_len = data_layer_eval()
        spec_target, spec_target_len = data_preprocessor(input_signal=audio,
                                                         length=audio_len)
        transcript_embedded = text_embedding(char_phone=transcript)
        transcript_encoded = t2_enc(char_phone_embeddings=transcript_embedded,
                                    embedding_length=transcript_len)
        mel_decoder, gate, alignments = t2_dec(
            char_phone_encoded=transcript_encoded,
            encoded_length=transcript_len,
            mel_target=spec_target,
        )
        mel_postnet = t2_postnet(mel_input=mel_decoder)
        gate_target = makegatetarget(mel_target=spec_target,
                                     target_len=spec_target_len)
        loss = t2_loss(
            mel_out=mel_decoder,
            mel_out_postnet=mel_postnet,
            gate_out=gate,
            mel_target=spec_target,
            gate_target=gate_target,
            target_len=spec_target_len,
            seq_len=audio_len,
        )

        # create corresponding eval callback; tag each callback with the
        # dataset's manifest basename so metrics are distinguishable.
        tagname = os.path.basename(eval_dataset).split(".")[0]
        eval_tensors = [
            loss,
            spec_target,
            mel_postnet,
            gate,
            gate_target,
            alignments,
        ]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=eval_tensors,
            user_iter_callback=tacotron2_process_eval_batch,
            user_epochs_done_callback=partial(tacotron2_process_final_eval,
                                              tag=tagname),
            tb_writer_func=partial(tacotron2_eval_log_to_tb_func,
                                   tag=tagname),
            eval_step=eval_freq,
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)
    return callbacks