def create_good_preprocessor_1(): nemo_asr.AudioToMelSpectrogramPreprocessor(window_size=0.02, window_stride=0.01)
def test_stft_conv(self): with open( os.path.abspath( os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))) as file: jasper_model_definition = self.yaml.load(file) dl = nemo_asr.AudioToTextDataLayer( featurizer_config=self.featurizer_config, manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4, ) pre_process_params = { 'int_values': False, 'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512, 'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000, 'normalize': 'per_feature', 'window_stride': 0.01, 'stft_conv': True, } preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor( **pre_process_params) jasper_encoder = nemo_asr.JasperEncoder( feat_in=jasper_model_definition[ 'AudioToMelSpectrogramPreprocessor']['features'], **jasper_model_definition['JasperEncoder'], ) jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len( self.labels)) ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels)) # DAG audio_signal, a_sig_length, transcript, transcript_len = dl() processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length) encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length) # logging.info(jasper_encoder) log_probs = jasper_decoder(encoder_output=encoded) loss = ctc_loss( log_probs=log_probs, targets=transcript, input_length=encoded_len, target_length=transcript_len, ) callback = nemo.core.SimpleLossLoggerCallback( tensors=[loss], print_func=lambda x: logging.info(str(x[0].item()))) # Instantiate an optimizer to perform `train` action neural_factory = nemo.core.NeuralModuleFactory( backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False, ) optimizer = neural_factory.get_trainer() optimizer.train( [loss], callbacks=[callback], optimizer="sgd", optimization_params={ "num_epochs": 10, "lr": 0.0003 }, )
def create_broken_preprocessor_3(): nemo_asr.AudioToMelSpectrogramPreprocessor(n_window_stride=2)
def create_broken_preprocessor_1(): nemo_asr.AudioToMelSpectrogramPreprocessor(window_size=2, n_window_size=2)
def create_all_dags(args, neural_factory): ''' creates train and eval dags as well as their callbacks returns train loss tensor and callbacks''' # parse the config files yaml = YAML(typ="safe") with open(args.model_config) as f: spkr_params = yaml.load(f) sample_rate = spkr_params['sample_rate'] # Calculate num_workers for dataloader total_cpus = os.cpu_count() cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) # create separate data layers for eval # we need separate eval dags for separate eval datasets # but all other modules in these dags will be shared eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"]) eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"]) del eval_dl_params["train"] del eval_dl_params["eval"] eval_dl_params['shuffle'] = False # To grab the file names without changing data_layer data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer( manifest_filepath=args.eval_datasets[0], labels=None, batch_size=args.batch_size, num_workers=cpu_per_traindl, **eval_dl_params, # normalize_transcripts=False ) # create shared modules data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"], ) # (QuartzNet uses the Jasper baseline encoder and decoder) encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"],) decoder = nemo_asr.JasperDecoderForSpkrClass( feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'], num_classes=254, emb_sizes=spkr_params['JasperDecoderForSpkrClass']['emb_sizes'].split(','), pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'], ) # --- Assemble Validation DAG --- # audio_signal_test, audio_len_test, label_test, _ = data_layer_test() processed_signal_test, processed_len_test = data_preprocessor( input_signal=audio_signal_test, length=audio_len_test ) encoded_test, _ = encoder(audio_signal=processed_signal_test, length=processed_len_test) _, embeddings = decoder(encoder_output=encoded_test) return embeddings, label_test
# Set this to True to enable beam search decoder ENABLE_NGRAM = False # This is only necessary if ENABLE_NGRAM = True. Otherwise, set to empty string LM_PATH = "<PATH_TO_KENLM_BINARY>" # Read model YAML yaml = YAML(typ="safe") with open(MODEL_YAML) as f: jasper_model_definition = yaml.load(f) labels = jasper_model_definition['labels'] # Instantiate necessary Neural Modules # Note that data layer is missing from here neural_factory = nemo.core.NeuralModuleFactory( placement=nemo.core.DeviceType.GPU, backend=nemo.core.Backend.PyTorch) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( factory=neural_factory) jasper_encoder = nemo_asr.JasperEncoder( jasper=jasper_model_definition['JasperEncoder']['jasper'], activation=jasper_model_definition['JasperEncoder']['activation'], feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor'] ['features'], ) jasper_encoder.restore_from(CHECKPOINT_ENCODER, local_rank=0) jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels)) jasper_decoder.restore_from(CHECKPOINT_DECODER, local_rank=0) greedy_decoder = nemo_asr.GreedyCTCDecoder() if ENABLE_NGRAM and os.path.isfile(LM_PATH): beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM( vocab=labels,
def create_all_dags(args, neural_factory): yaml = YAML(typ="safe") with open(args.model_config) as f: jasper_params = yaml.load(f) vocab = jasper_params['labels'] sample_rate = jasper_params['sample_rate'] # Calculate num_workers for dataloader total_cpus = os.cpu_count() cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) # perturb_config = jasper_params.get('perturb', None) train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"]) train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"]) del train_dl_params["train"] del train_dl_params["eval"] # del train_dl_params["normalize_transcripts"] data_layer = nemo_asr.AudioToTextDataLayer( manifest_filepath=args.train_dataset, sample_rate=sample_rate, labels=vocab, batch_size=args.batch_size, num_workers=cpu_per_traindl, **train_dl_params, # normalize_transcripts=False ) N = len(data_layer) steps_per_epoch = math.ceil(N / (args.batch_size * args.iter_per_step * args.num_gpus)) nemo.logging.info('Have {0} examples to train on.'.format(N)) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"], ) multiply_batch_config = jasper_params.get('MultiplyBatch', None) if multiply_batch_config: multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config) spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None) if spectr_augment_config: data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config) eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"]) eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"]) del eval_dl_params["train"] del eval_dl_params["eval"] data_layers_eval = [] if args.eval_datasets: for eval_datasets in args.eval_datasets: data_layer_eval = nemo_asr.AudioToTextDataLayer( manifest_filepath=eval_datasets, sample_rate=sample_rate, labels=vocab, batch_size=args.eval_batch_size, num_workers=cpu_per_traindl, **eval_dl_params, ) data_layers_eval.append(data_layer_eval) else: nemo.logging.warning("There were no val datasets passed") jasper_encoder = nemo_asr.JasperEncoder( feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], **jasper_params["JasperEncoder"], ) jasper_decoder = nemo_asr.JasperDecoderForCTC( feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab) ) ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab)) greedy_decoder = nemo_asr.GreedyCTCDecoder() nemo.logging.info('================================') nemo.logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}") nemo.logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}") nemo.logging.info( f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}" ) nemo.logging.info('================================') # Train DAG (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t,) = data_layer() processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t) if multiply_batch_config: (processed_signal_t, p_length_t, transcript_t, transcript_len_t,) = multiply_batch( in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t, ) if spectr_augment_config: processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t) encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, length=p_length_t) log_probs_t = jasper_decoder(encoder_output=encoded_t) predictions_t = greedy_decoder(log_probs=log_probs_t) loss_t = ctc_loss( log_probs=log_probs_t, targets=transcript_t, input_length=encoded_len_t, target_length=transcript_len_t, ) # Callbacks needed to print info to console and Tensorboard train_callback = nemo.core.SimpleLossLoggerCallback( tensors=[loss_t, predictions_t, transcript_t, transcript_len_t], print_func=partial(monitor_asr_train_progress, labels=vocab), get_tb_values=lambda x: [("loss", x[0])], tb_writer=neural_factory.tb_writer, ) chpt_callback = nemo.core.CheckpointCallback( folder=neural_factory.checkpoint_dir, load_from_folder=args.load_dir, step_freq=args.checkpoint_save_freq, ) callbacks = [train_callback, chpt_callback] # assemble eval DAGs for i, eval_dl in enumerate(data_layers_eval): (audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e,) = eval_dl() processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e) encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_signal_e, length=p_length_e) log_probs_e = jasper_decoder(encoder_output=encoded_e) predictions_e = greedy_decoder(log_probs=log_probs_e) loss_e = ctc_loss( log_probs=log_probs_e, targets=transcript_e, input_length=encoded_len_e, target_length=transcript_len_e, ) # create corresponding eval callback tagname = os.path.basename(args.eval_datasets[i]).split(".")[0] eval_callback = nemo.core.EvaluatorCallback( eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e,], user_iter_callback=partial(process_evaluation_batch, labels=vocab), user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname), eval_step=args.eval_freq, tb_writer=neural_factory.tb_writer, ) callbacks.append(eval_callback) return loss_t, callbacks, steps_per_epoch