import copy
import os
from functools import partial

import nemo
import nemo_asr
from ruamel.yaml import YAML


def create_all_dags(args, neural_factory):
    logger = neural_factory.logger
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params['labels']
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer)
    steps_per_epoch = int(
        N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logger.info(f'Have {N} examples to train on.')

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        neural_factory.logger.info("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"])

    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
        factory=neural_factory)

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    logger.info('================================')
    logger.info(
        f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logger.info(
        f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logger.info(
        f"Total number of parameters in encoder and decoder: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logger.info('================================')

    # Train DAG
    audio_signal_t, a_sig_length_t, \
        transcript_t, transcript_len_t = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t,
        length=a_sig_length_t)

    if multiply_batch_config:
        processed_signal_t, p_length_t, transcript_t, transcript_len_t = \
            multiply_batch(
                in_x=processed_signal_t, in_x_len=p_length_t,
                in_y=transcript_t, in_y_len=transcript_len_t)

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(
        audio_signal=processed_signal_t,
        length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(
            monitor_asr_train_progress,
            labels=vocab,
            logger=logger),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )

    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        step_freq=args.checkpoint_save_freq)

    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = \
            eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e,
            length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e,
            length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e,
                          transcript_e, transcript_len_e],
            user_iter_callback=partial(
                process_evaluation_batch,
                labels=vocab),
            user_epochs_done_callback=partial(
                process_evaluation_epoch,
                tag=tagname,
                logger=logger),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer)

        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch
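
# A minimal sketch of how create_all_dags() is typically driven from a main()
# in the NeMo 0.x examples. The parse_args() helper and the argparse fields and
# hyperparameter values below are illustrative assumptions, not part of the
# function above.
def main():
    args = parse_args()  # hypothetical argparse helper
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch,
        local_rank=args.local_rank,
        log_dir=args.work_dir,
        create_tb_writer=True)

    train_loss, callbacks, steps_per_epoch = create_all_dags(
        args, neural_factory)

    neural_factory.train(
        tensors_to_optimize=[train_loss],
        callbacks=callbacks,
        optimizer="novograd",
        optimization_params={
            "num_epochs": args.num_epochs,
            "lr": args.lr,
            "weight_decay": args.weight_decay,
        })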
yaml = YAML(typ="safe") with open("./NeMo/examples/asr/configs/jasper12x1SEP.yaml") as f: jasper_model_definition = yaml.load(f) labels = jasper_model_definition['labels'] # Instantiate neural modules data_layer = nemo_asr.AudioToTextDataLayer(manifest_filepath=train_dataset, labels=labels, batch_size=32) data_layer_val = nemo_asr.AudioToTextDataLayer(manifest_filepath=eval_datasets, labels=labels, batch_size=32, shuffle=False) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor() spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5) jasper_encoder = nemo_asr.JasperEncoder( feat_in=64, **jasper_model_definition['JasperEncoder']) jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels)) ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels)) greedy_decoder = nemo_asr.GreedyCTCDecoder() # Training DAG (Model) audio_signal, audio_signal_len, transcript, transcript_len = data_layer() processed_signal, processed_signal_len = data_preprocessor( input_signal=audio_signal, length=audio_signal_len) aug_signal = spec_augment(input_spec=processed_signal) encoded, encoded_len = jasper_encoder(audio_signal=aug_signal, length=processed_signal_len)
def create_all_dags(args, neural_factory):
    '''
    creates train and eval dags as well as their callbacks
    returns train loss tensor and callbacks'''

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)

    vocab = quartz_params['labels']
    sample_rate = quartz_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create data layer for training
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
        # normalize_transcripts=False
    )

    N = len(data_layer_train)
    steps_per_epoch = int(
        N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared
    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    # create shared modules
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **quartz_params["AudioToMelSpectrogramPreprocessor"])

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **quartz_params["JasperEncoder"])

    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab))

    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))

    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # create augmentation modules (only used for training) if their configs
    # are present
    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
            **spectr_augment_config)

    # assemble train DAG
    audio_signal_t, a_sig_length_t, \
        transcript_t, transcript_len_t = data_layer_train()

    processed_signal_t, p_length_t = data_preprocessor(
        input_signal=audio_signal_t,
        length=a_sig_length_t)

    if multiply_batch_config:
        processed_signal_t, p_length_t, transcript_t, transcript_len_t = \
            multiply_batch(
                in_x=processed_signal_t, in_x_len=p_length_t,
                in_y=transcript_t, in_y_len=transcript_len_t)

    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(
            input_spec=processed_signal_t)

    encoded_t, encoded_len_t = encoder(
        audio_signal=processed_signal_t,
        length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t)

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer)

    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.load_dir,
            step_freq=args.checkpoint_save_freq)
        callbacks.append(chpt_callback)

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = \
            eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e,
            length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(
            audio_signal=processed_signal_e,
            length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e,
                          transcript_e, transcript_len_e],
            user_iter_callback=partial(
                process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(
                process_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer)
        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch
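
# A minimal sketch of consuming the returned steps_per_epoch to build a
# learning-rate schedule, as the NeMo 0.x ASR examples commonly do.
# CosineAnnealing lives in nemo.utils.lr_policies; the warmup_ratio argument
# and the optimizer settings below are illustrative assumptions.
from nemo.utils.lr_policies import CosineAnnealing

train_loss, callbacks, steps_per_epoch = create_all_dags(args, neural_factory)
neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    lr_policy=CosineAnnealing(
        total_steps=args.num_epochs * steps_per_epoch,
        warmup_ratio=args.warmup_ratio),
    optimizer='novograd',
    optimization_params={
        'num_epochs': args.num_epochs,
        'lr': args.lr,
        'weight_decay': args.weight_decay,
    })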
def test_simple_dags(self):
    # module instantiation
    with open("tests/data/jasper_smaller.yaml") as file:
        jasper_model_definition = self.yaml.load(file)
    labels = jasper_model_definition['labels']

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=labels,
        batch_size=4)
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_model_definition['AudioToMelSpectrogramPreprocessor'])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition[
            'AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=1024, num_classes=len(labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(labels))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # DAG definition
    audio_signal, audio_signal_len, transcript, transcript_len = \
        data_layer()
    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)

    spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
    aug_signal = spec_augment(input_spec=processed_signal)

    encoded, encoded_len = jasper_encoder(
        audio_signal=aug_signal, length=processed_signal_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(log_probs=log_probs, targets=transcript,
                    input_length=encoded_len, target_length=transcript_len)

    def wrong():
        with open("tests/data/jasper_smaller.yaml") as file:
            jasper_config = self.yaml.load(file)
        labels = jasper_config['labels']
        data_layer = nemo_asr.AudioToTextDataLayer(
            manifest_filepath=self.manifest_filepath,
            labels=labels,
            batch_size=4)
        data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
            **jasper_config['AudioToMelSpectrogramPreprocessor'])
        jasper_encoder = nemo_asr.JasperEncoder(
            feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']
            ['features'],
            **jasper_config['JasperEncoder'])
        jasper_decoder = nemo_asr.JasperDecoderForCTC(
            feat_in=1024, num_classes=len(labels))

        # DAG definition
        audio_signal, audio_signal_len, transcript, transcript_len = \
            data_layer()
        processed_signal, processed_signal_len = data_preprocessor(
            input_signal=audio_signal, length=audio_signal_len)

        spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
        aug_signal = spec_augment(input_spec=processed_signal)

        encoded, encoded_len = jasper_encoder(
            audio_signal=aug_signal, length=processed_signal_len)
        # Intentional port type mismatch: the decoder is fed the raw
        # spectrogram instead of the encoder output, which must raise
        # NeuralPortNmTensorMismatchError.
        log_probs = jasper_decoder(encoder_output=processed_signal)

    self.assertRaises(NeuralPortNmTensorMismatchError, wrong)
def create_dag(args, cfg, logger, num_gpus):
    # Defining nodes
    data = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=cfg['target']['labels'],
        batch_size=cfg['optimization']['batch_size'],
        eos_id=cfg['target']['eos_id'],
        **cfg['AudioToTextDataLayer']['train'])
    data_evals = []
    if args.eval_datasets:
        for val_path in args.eval_datasets:
            data_evals.append(nemo_asr.AudioToTextDataLayer(
                manifest_filepath=val_path,
                labels=cfg['target']['labels'],
                batch_size=cfg['inference']['batch_size'],
                eos_id=cfg['target']['eos_id'],
                **cfg['AudioToTextDataLayer']['eval']))
    else:
        logger.info("There were no val datasets passed")
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **cfg['AudioToMelSpectrogramPreprocessor'])
    data_augmentation = nemo_asr.SpectrogramAugmentation(
        **cfg['SpectrogramAugmentation'])
    encoder = nemo_asr.JasperEncoder(
        feat_in=cfg["AudioToMelSpectrogramPreprocessor"]["features"],
        **cfg['JasperEncoder'])
    if args.encoder_checkpoint is not None \
            and os.path.exists(args.encoder_checkpoint):
        if cfg['JasperEncoder']['load']:
            encoder.restore_from(args.encoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for encoder'
                        f' from {args.encoder_checkpoint}')
        if cfg['JasperEncoder']['freeze']:
            encoder.freeze()
            logger.info('Freeze encoder weights')
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['JasperEncoder']['jasper'][-1]['filters'],
        out_channels=cfg['DecoderRNN']['hidden_size'])
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN'])
    if args.decoder_checkpoint is not None \
            and os.path.exists(args.decoder_checkpoint):
        if cfg['DecoderRNN']['load']:
            decoder.restore_from(args.decoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for decoder'
                        f' from {args.decoder_checkpoint}')
        if cfg['DecoderRNN']['freeze']:
            decoder.freeze()
            logger.info('Freeze decoder weights')
            if cfg['decoder']['unfreeze_attn']:
                for _, param in decoder.attention.named_parameters():
                    param.requires_grad = True
                logger.info('Unfreeze decoder attn weights')

    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = int(num_data / (batch_size * num_gpus))
    total_steps = num_epochs * steps_per_epoch

    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder, 'teacher_forcing',
        policies=[
            vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0)
        ],
        total_steps=total_steps)

    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef'],
        sample_wise=cfg['optimization']['sample_wise'])
    se_callback = ValueSetterCallback(
        seq_loss, 'smoothing_coef',
        policies=[
            vsc.Policy(vsc.Method.Const(seq_loss.smoothing_coef),
                       start=0.0, end=1.0),
        ],
        total_steps=total_steps)

    beam_search = nemo.backends.pytorch.BeamSearch(
        decoder=decoder,
        pad_id=cfg['target']['pad_id'],
        bos_id=cfg['target']['bos_id'],
        eos_id=cfg['target']['eos_id'],
        max_len=cfg['target']['max_len'],
        beam_size=cfg['inference']['beam_size'])

    uf_callback = UnfreezeCallback(
        [encoder, decoder],
        start_epoch=cfg['optimization']['start_unfreeze'])

    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[encoder, connector, decoder],
        folder=args.checkpoint_dir,
        step_freq=args.eval_freq)

    # Creating DAG
    audios, audio_lens, transcripts, _ = data()
    processed_audios, processed_audio_lens = data_preprocessor(
        input_signal=audios,
        length=audio_lens)
    augmented_spec = data_augmentation(input_spec=processed_audios)
    encoded, _ = encoder(
        audio_signal=augmented_spec,
        length=processed_audio_lens)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(
        targets=transcripts,
        encoder_outputs=encoded)
    train_loss = seq_loss(
        log_probs=log_probs,
        targets=transcripts)

    evals = []
    for i, data_eval in enumerate(data_evals):
        audios, audio_lens, transcripts, _ = data_eval()
        processed_audios, processed_audio_lens = data_preprocessor(
            input_signal=audios,
            length=audio_lens)
        encoded, _ = encoder(
            audio_signal=processed_audios,
            length=processed_audio_lens)
        encoded = connector(tensor=encoded)
        log_probs, _ = decoder(
            targets=transcripts,
            encoder_outputs=encoded)
        loss = seq_loss(
            log_probs=log_probs,
            targets=transcripts)
        predictions, aw = beam_search(encoder_outputs=encoded)
        evals.append((args.eval_datasets[i],
                      (loss, log_probs, transcripts, predictions, aw)))

    # Update config
    cfg['num_params'] = {
        'encoder': encoder.num_weights,
        'connector': connector.num_weights,
        'decoder': decoder.num_weights
    }
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (train_loss, evals), cfg, [
        tf_callback, se_callback, uf_callback, saver_callback
    ]
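
# A minimal sketch of consuming create_dag()'s return values. The per-dataset
# eval tensors in `evals` would normally be wrapped in EvaluatorCallbacks with
# task-specific aggregation functions, elided here; reading the optimizer name
# and its params out of cfg['optimization'] is an assumption about the config
# layout, not something the function above guarantees.
(train_loss, evals), cfg, callbacks = create_dag(args, cfg, logger, num_gpus)

neural_factory.train(
    tensors_to_optimize=[train_loss],
    callbacks=callbacks,
    optimizer=cfg['optimization']['optimizer'],
    optimization_params=cfg['optimization']['params'])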