def wrong():
    with open("tests/data/jasper_smaller.yaml") as file:
        jasper_config = self.yaml.load(file)
    labels = jasper_config['labels']
    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=labels, batch_size=4,
    )
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_config['AudioToMelSpectrogramPreprocessor'])
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_config['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_config['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(labels))

    # DAG definition
    audio_signal, audio_signal_len, transcript, transcript_len = data_layer()
    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)
    spec_augment = nemo_asr.SpectrogramAugmentation(rect_masks=5)
    aug_signal = spec_augment(input_spec=processed_signal)
    encoded, encoded_len = jasper_encoder(audio_signal=aug_signal, length=processed_signal_len)
    # Intentionally wrong: the decoder is fed the preprocessor output instead of
    # `encoded`, which should trigger a neural type mismatch.
    log_probs = jasper_decoder(encoder_output=processed_signal)

def test_stft_conv(self):
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/quartznet_speech_recognition.yaml"))
    with open(path) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=2,
    )
    pre_process_params = {
        'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512,
        'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000,
        'normalize': 'per_feature', 'window_stride': 0.01, 'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(**jasper_model_definition['JasperEncoder'],)
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]['filters'],
        num_classes=len(self.labels))
    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    logits = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=logits, labels=targets)

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss], print_func=lambda x: logging.info(str(x[0].item())))
    # Instantiate an optimizer to perform `train` action
    optimizer = nemo.backends.pytorch.actions.PtActions()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 10, "lr": 0.0003},
    )

def test_jasper_eval(self):
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/quartznet_speech_recognition.yaml"))
    with open(path) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=2,
    )
    pre_process_params = {
        'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512,
        'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000,
        'normalize': 'per_feature', 'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(**jasper_model_definition['JasperEncoder'],)
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]['filters'],
        num_classes=len(self.labels))
    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    logits = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=logits, labels=targets)

    from nemo.collections.asr.helpers import (
        process_classification_evaluation_batch,
        process_classification_evaluation_epoch,
    )

    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[loss, logits, targets],
        user_iter_callback=lambda x, y: process_classification_evaluation_batch(x, y, top_k=[1]),
        user_epochs_done_callback=process_classification_evaluation_epoch,
    )
    self.nf.eval(callbacks=[eval_callback])

def __init__(
    self,
    n_vocab: int,
    d_char: int,
    pad_id: int,
    jasper_kwargs: Dict[str, Any],
    d_out: int,
    d_speaker_emb: Optional[int] = None,
    d_speaker_x: Optional[int] = None,
    d_speaker_o: Optional[int] = None,
    pad16: bool = False,
    poly_span: bool = False,
    doubling: bool = False,
):
    """Creates TalkNet backbone instance.

    Args:
        n_vocab: Size of input vocabulary.
        d_char: Dimension of char embedding.
        pad_id: Id of padding symbol.
        jasper_kwargs: Kwargs to instantiate QN encoder.
        d_out: Dimension of output.
        d_speaker_emb: Dimension of speaker embedding.
        d_speaker_x: Dimension of pre speaker embedding.
        d_speaker_o: Dimension of post speaker embedding.
        pad16: True if pad tensors to 16.
        poly_span: True if assign polynomial span embeddings for blanks.
        doubling: True if using mel channels doubling trick.
    """
    super().__init__()

    # Embedding for input text
    self.text_emb = nn.Embedding(n_vocab, d_char, padding_idx=pad_id).to(self._device)
    self.text_emb.weight.data.uniform_(-1, 1)

    # PolySpan
    self.ps = PolySpanEmb(self.text_emb)
    self._poly_span = poly_span

    # Embedding for speaker
    if d_speaker_emb is not None:
        self.speaker_in = nn.Linear(d_speaker_emb, d_speaker_x).to(self._device)
        self.speaker_out = nn.Linear(d_speaker_emb, d_speaker_o).to(self._device)
    else:
        self.speaker_in, self.speaker_out = None, None

    jasper_params = jasper_kwargs['jasper']
    d_enc_out = jasper_params[-1]["filters"]
    d_x = d_char + (int(d_speaker_x or 0) if d_speaker_emb else 0)
    self.jasper = nemo_asr.JasperEncoder(feat_in=d_x, **jasper_kwargs).to(self._device)

    d_o = d_enc_out + (int(d_speaker_o or 0) if d_speaker_emb else 0)
    self.out = nn.Conv1d(d_o, d_out * (1 + int(doubling)), kernel_size=1, bias=True).to(self._device)

    self._pad16 = pad16
    self._doubling = doubling

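# A minimal sketch of the dimension bookkeeping in the constructor above, using a
# hypothetical one-entry QuartzNet config (all values here are made up, not from any
# shipped YAML). The QN encoder consumes d_char features, plus d_speaker_x when
# speaker conditioning is on, and the 1x1 output conv doubles its channels when
# `doubling` is set.
jasper_kwargs = {"jasper": [{"filters": 256}]}        # only the field used below
d_char, d_speaker_x, d_out, doubling = 128, 64, 80, True
d_x = d_char + d_speaker_x                            # JasperEncoder feat_in = 192
d_enc_out = jasper_kwargs["jasper"][-1]["filters"]    # encoder output width = 256
d_conv_out = d_out * (1 + int(doubling))              # 160 output mel channels
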
def test_freeze_unfreeze_TrainableNM(self):
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
    with open(path) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4,
    )
    pre_process_params = {
        'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512,
        'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000,
        'normalize': 'per_feature', 'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    # Freeze the whole encoder, then selectively unfreeze a single weight, and
    # snapshot one frozen and one unfrozen weight for comparison after training.
    jasper_encoder.freeze()
    jasper_encoder.unfreeze(set(['encoder.4.mconv.0.conv.weight']))
    frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.detach().cpu().numpy()
    unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.detach().cpu().numpy()

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
    )
    optimizer = self.nf.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 5, "lr": 0.0003},
    )

    # The frozen weight must be unchanged after training; the unfrozen one must not be.
    new_frozen_weight = jasper_encoder.encoder[1].mconv[0].conv.weight.data
    new_unfrozen_weight = jasper_encoder.encoder[4].mconv[0].conv.weight.data
    self.assertTrue(np.array_equal(frozen_weight, new_frozen_weight.detach().cpu().numpy()))
    self.assertFalse(np.array_equal(unfrozen_weight, new_unfrozen_weight.detach().cpu().numpy()))

def test_asr_with_zero_ds(self):
    logging.info("Testing ASR NMs with ZeroDS and without pre-processing")
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
    with open(path) as file:
        jasper_model_definition = self.yaml.load(file)

    dl = nemo.backends.pytorch.common.ZerosDataLayer(
        size=100,
        dtype=torch.FloatTensor,
        batch_size=4,
        output_ports={
            "processed_signal": NeuralType(
                (AxisType(AxisKind.Batch), AxisType(AxisKind.Dimension, 64), AxisType(AxisKind.Time, 64)),
                SpectrogramType(),
            ),
            "processed_length": NeuralType(tuple('B'), LengthsType()),
            "transcript": NeuralType((AxisType(AxisKind.Batch), AxisType(AxisKind.Time, 64)), LabelsType()),
            "transcript_length": NeuralType(tuple('B'), LengthsType()),
        },
    )
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    # DAG
    processed_signal, p_length, transcript, transcript_len = dl()
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(f'Train Loss: {str(x[0].item())}'),
    )
    # Instantiate an optimizer to perform `train` action
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimization_params={"num_epochs": 2, "lr": 0.0003},
        optimizer="sgd",
    )

def test_quartznet_vad_training(self):
    """Integration test that instantiates a small QuartzNet model for VAD and
    tests training with the sample VAD data.

    Training is run for 3 forward and backward steps and asserts that loss
    after 3 steps is smaller than the loss at the first step.
    """
    with open(os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/quartznet_vad.yaml"))) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=6,
    )
    pre_process_params = {
        'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512,
        'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000,
        'normalize': 'per_feature', 'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(**jasper_model_definition['JasperEncoder'])
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_model_definition['JasperEncoder']['jasper'][-1]['filters'], num_classes=len(self.labels)
    )
    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=log_probs, labels=targets)

    loss_list = []
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss], print_func=partial(self.print_and_log_loss, loss_log_list=loss_list), step_freq=1
    )

    self.nf.train(
        [loss], callbacks=[callback], optimizer="sgd", optimization_params={"max_steps": 3, "lr": 0.003},
    )
    self.nf.reset_trainer()

    # Assert that training loss went down
    assert loss_list[-1] < loss_list[0]

def test_freeze_unfreeze_TrainableNM(self):
    with open("tests/data/jasper_smaller.yaml") as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    pre_process_params = {
        'int_values': False, 'frame_splicing': 1, 'features': 64,
        'window_size': 0.02, 'n_fft': 512, 'dither': 1e-05, 'window': 'hann',
        'sample_rate': 16000, 'normalize': 'per_feature', 'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    jasper_encoder.freeze()
    jasper_encoder.unfreeze(set(['encoder.4.conv.1.weight']))
    jasper_decoder.unfreeze()

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: print(f'Train Loss: {str(x[0].item())}'),
    )
    # Instantiate an optimizer to perform `train` action
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
    )
    optimizer = neural_factory.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 2, "lr": 0.0003},
    )

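# The freeze()/unfreeze() calls exercised in the tests above amount, in plain
# PyTorch terms, to toggling requires_grad on parameters. This is an illustrative
# sketch of the assumed semantics, not NeMo's actual implementation: freezing
# disables gradients everywhere, and unfreezing a set of names re-enables them
# only for the listed parameters.
def sketch_freeze(module, keep_trainable=()):
    keep = set(keep_trainable)
    for name, p in module.named_parameters():
        p.requires_grad = name in keep  # e.g. {'encoder.4.conv.1.weight'}
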
def test_quartz_encoder(self):
    with open("tests/data/quartznet_test.yaml") as file:
        yaml = YAML(typ="safe")
        quartz_model_definition = yaml.load(file)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **quartz_model_definition['JasperEncoder']
    )
    self.__test_export_route_all(
        module=jasper_encoder,
        out_name="quartz_encoder",
        input_example=(torch.randn(16, 64, 256).cuda(), torch.randint(20, (16,)).cuda()),
    )

def test_jasper_encoder(self):
    with open("tests/data/jasper_smaller.yaml") as file:
        yaml = YAML(typ="safe")
        jasper_model_definition = yaml.load(file)
    jasper_encoder = nemo_asr.JasperEncoder(
        conv_mask=False,
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder']
    )
    self.__test_export_route_all(
        module=jasper_encoder,
        out_name="jasper_encoder",
        input_example=(torch.randn(16, 64, 256).cuda(), torch.randn(256).cuda()),
    )

def test_quartz_encoder(self, df_type):
    with open("tests/data/quartznet_test.yaml") as file:
        yaml = YAML(typ="safe")
        quartz_model_definition = yaml.load(file)
    del quartz_model_definition['JasperEncoder']['conv_mask']
    jasper_encoder = nemo_asr.JasperEncoder(
        conv_mask=False,
        feat_in=quartz_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **quartz_model_definition['JasperEncoder'],
    )
    self.__test_export_route(
        module=jasper_encoder,
        out_name="quartz_encoder",
        mode=df_type,
        input_example=torch.randn(16, 64, 256).cuda(),
    )

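# The export tests above all follow the same input convention: a dummy spectrogram
# of shape (batch, feat_in, time) and, when conv_mask is enabled, a per-example
# length vector of shape (batch,). A sketch of that shape convention with
# arbitrary sizes:
import torch

batch, feat_in, time = 16, 64, 256
dummy_spec = torch.randn(batch, feat_in, time)               # matches features=64
dummy_lens = torch.randint(low=1, high=time, size=(batch,))  # one length per example
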
def __init__(self, model_yaml, encoder_checkpoint, decoder_checkpoint, language_model=None):
    super(JasperASR, self).__init__()
    # Read model YAML
    yaml = YAML(typ="safe")
    with open(model_yaml) as f:
        jasper_model_definition = yaml.load(f)
    self.neural_factory = nemo.core.NeuralModuleFactory(
        placement=nemo.core.DeviceType.GPU, backend=nemo.core.Backend.PyTorch)
    self.labels = jasper_model_definition["labels"]
    self.data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor()
    self.jasper_encoder = nemo_asr.JasperEncoder(
        jasper=jasper_model_definition["JasperEncoder"]["jasper"],
        activation=jasper_model_definition["JasperEncoder"]["activation"],
        feat_in=jasper_model_definition["AudioToMelSpectrogramPreprocessor"]["features"],
    )
    self.jasper_encoder.restore_from(encoder_checkpoint, local_rank=0)
    self.jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    self.jasper_decoder.restore_from(decoder_checkpoint, local_rank=0)
    self.greedy_decoder = nemo_asr.GreedyCTCDecoder()
    self.beam_search_with_lm = None
    if language_model:
        self.beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM(
            vocab=self.labels,
            beam_width=64,
            alpha=2.0,
            beta=1.0,
            lm_path=language_model,
            # os.cpu_count() may return None, so fall back to 1
            num_cpus=max(os.cpu_count() or 1, 1),
        )

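# A hedged usage sketch for the wrapper above. The YAML and checkpoint paths are
# placeholders; pass a KenLM binary via language_model to enable
# BeamSearchDecoderWithLM, otherwise only greedy decoding is set up.
asr = JasperASR(
    model_yaml="configs/quartznet15x5.yaml",             # placeholder config path
    encoder_checkpoint="JasperEncoder-STEP-N.pt",        # placeholder checkpoint
    decoder_checkpoint="JasperDecoderForCTC-STEP-N.pt",  # placeholder checkpoint
    language_model=None,
)
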
def main(
    config_file,
    nn_encoder,
    nn_decoder,
    nn_onnx_encoder,
    nn_onnx_decoder,
    pre_v09_model=False,
    batch_size=1,
    time_steps=256,
):
    yaml = YAML(typ="safe")

    logging.info("Loading config file...")
    with open(config_file) as f:
        jasper_model_definition = yaml.load(f)

    logging.info("Determining model shape...")
    if 'AudioPreprocessing' in jasper_model_definition:
        num_encoder_input_features = jasper_model_definition['AudioPreprocessing']['features']
    elif 'AudioToMelSpectrogramPreprocessor' in jasper_model_definition:
        num_encoder_input_features = jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features']
    else:
        num_encoder_input_features = 64
    num_decoder_input_features = jasper_model_definition['JasperEncoder']['jasper'][-1]['filters']
    logging.info("  Num encoder input features: {}".format(num_encoder_input_features))
    logging.info("  Num decoder input features: {}".format(num_decoder_input_features))

    nf = nemo.core.NeuralModuleFactory(create_tb_writer=False)

    logging.info("Initializing models...")
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=num_encoder_input_features, **jasper_model_definition['JasperEncoder']
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=num_decoder_input_features,
        num_classes=len(jasper_model_definition['labels']),
    )

    # This is necessary if you are using checkpoints trained with NeMo
    # version before 0.9
    logging.info("Loading checkpoints...")
    if pre_v09_model:
        logging.info("  Converting pre v0.9 checkpoint...")
        ckpt = torch.load(nn_encoder)
        new_ckpt = {}
        for k, v in ckpt.items():
            new_k = k.replace('.conv.', '.mconv.')
            if len(v.shape) == 3:
                new_k = new_k.replace('.weight', '.conv.weight')
            new_ckpt[new_k] = v
        jasper_encoder.load_state_dict(new_ckpt)
    else:
        jasper_encoder.restore_from(nn_encoder)
    jasper_decoder.restore_from(nn_decoder)

    logging.info("Exporting encoder...")
    nf.deployment_export(
        jasper_encoder,
        nn_onnx_encoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        torch.zeros(batch_size, num_encoder_input_features, time_steps, dtype=torch.float, device="cuda:0"),
    )
    logging.info("Exporting decoder...")
    nf.deployment_export(
        jasper_decoder,
        nn_onnx_decoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        torch.zeros(batch_size, num_decoder_input_features, time_steps // 2, dtype=torch.float, device="cuda:0"),
    )
    logging.info("Export completed successfully.")

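# An example invocation of main() above with placeholder paths. batch_size and
# time_steps fix the static (B, features, T) shape of the dummy tensors traced
# into the ONNX graphs; the decoder dummy uses time_steps // 2, matching the 2x
# time downsampling the encoder is presumed to apply.
main(
    config_file="configs/quartznet15x5.yaml",
    nn_encoder="JasperEncoder-STEP-N.pt",
    nn_decoder="JasperDecoderForCTC-STEP-N.pt",
    nn_onnx_encoder="encoder.onnx",
    nn_onnx_decoder="decoder.onnx",
    batch_size=1,
    time_steps=256,
)
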
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = load_vocab(args.vocab_file)
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    train_dl_params["normalize_transcripts"] = False

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
    )

    N = len(data_layer)
    steps_per_epoch = int(N / (args.batch_size * args.num_gpus))
    nemo.logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )

    multiply_batch_config = jasper_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    eval_dl_params["normalize_transcripts"] = False
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        nemo.logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    nemo.logging.info('================================')
    nemo.logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    nemo.logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    nemo.logging.info(
        f"Total number of parameters in model: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    nemo.logging.info('================================')

    # Train DAG
    audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t = data_layer()
    processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        processed_signal_t, p_length_t, transcript_t, transcript_len_t = multiply_batch(
            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t,
        )
    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)

    encoded_t, encoded_len_t = jasper_encoder(audio_signal=processed_signal_t, length=p_length_t)
    log_probs_t = jasper_decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab, eval_metric='CER'),
        step_freq=args.train_eval_freq,
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )
    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir, step_freq=args.checkpoint_save_freq,
    )
    callbacks = [train_callback, chpt_callback]

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch, eval_metric='CER', tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch

def create_all_dags(args, neural_factory):
    """Creates train and eval DAGs as well as their callbacks.
    Returns the train loss tensor, callbacks, steps per epoch, and the
    eval loss/logits/label tensors."""

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        spkr_params = yaml.load(f)

    sample_rate = spkr_params["sample_rate"]
    time_length = spkr_params.get("time_length", 8)
    logging.info("max time length considered is {} sec".format(time_length))

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1) // 2

    # create data layer for training
    train_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    audio_augmentor = spkr_params.get("AudioAugmentor", None)

    data_layer_train = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=None,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        time_length=time_length,
        **train_dl_params,
    )

    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logging.info("Number of steps per epoch {}".format(steps_per_epoch))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared
    eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_test = []
    for test_set in args.eval_datasets:
        data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(
            manifest_filepath=test_set,
            labels=data_layer_train.labels,
            batch_size=args.batch_size,
            num_workers=cpu_per_traindl,
            time_length=time_length,
            **eval_dl_params,
        )
        data_layers_test.append(data_layer_test)

    # create shared modules
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )
    spectr_augment_config = spkr_params.get("SpectrogramAugmentation", None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"],)
    decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=data_layer_train.num_classes,
        pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'],
        emb_sizes=spkr_params["JasperDecoderForSpkrClass"]["emb_sizes"].split(","),
    )
    if os.path.exists(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt"):
        encoder.restore_from(args.checkpoint_dir + "/JasperEncoder-STEP-100.pt")
        logging.info("Pretrained Encoder loaded")

    weight = None
    xent_loss = nemo_asr.CrossEntropyLossNM(weight=weight)

    # assemble train DAG
    audio_signal, audio_signal_len, label, label_len = data_layer_train()
    processed_signal, processed_signal_len = data_preprocessor(input_signal=audio_signal, length=audio_signal_len)
    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(input_spec=processed_signal)
    encoded, encoded_len = encoder(audio_signal=processed_signal, length=processed_signal_len)
    logits, _ = decoder(encoder_output=encoded)
    loss = xent_loss(logits=logits, labels=label)

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, logits, label],
        print_func=partial(monitor_classification_training_progress, eval_metric=[1]),
        step_freq=args.print_freq,
        get_tb_values=lambda x: [("train_loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )
    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.checkpoint_dir,  # load dir
            step_freq=args.checkpoint_save_freq,
            checkpoints_to_keep=125,
        )
        callbacks.append(chpt_callback)

    # --- Assemble Validation DAG --- #
    for i, eval_layer in enumerate(data_layers_test):
        audio_signal_test, audio_len_test, label_test, _ = eval_layer()
        processed_signal_test, processed_len_test = data_preprocessor(
            input_signal=audio_signal_test, length=audio_len_test
        )
        encoded_test, encoded_len_test = encoder(audio_signal=processed_signal_test, length=processed_len_test)
        logits_test, _ = decoder(encoder_output=encoded_test)
        loss_test = xent_loss(logits=logits_test, labels=label_test)

        tagname = os.path.dirname(args.eval_datasets[i]).split("/")[-1] + "_" + str(i)
        logging.info(tagname)
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_test, logits_test, label_test],
            user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),
            user_epochs_done_callback=partial(process_classification_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)

    return loss, callbacks, steps_per_epoch, loss_test, logits_test, label_test

def test_jasper_eval(self):
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
    with open(path) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4,
    )
    pre_process_params = {
        'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512,
        'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000,
        'normalize': 'per_feature', 'window_stride': 0.01,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )
    predictions = greedy_decoder(log_probs=log_probs)

    from nemo.collections.asr.helpers import (
        process_evaluation_batch,
        process_evaluation_epoch,
    )

    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=[loss, predictions, transcript, transcript_len],
        user_iter_callback=lambda x, y: process_evaluation_batch(x, y, labels=self.labels),
        user_epochs_done_callback=process_evaluation_epoch,
    )
    self.nf.eval(callbacks=[eval_callback])

def test_clas(self):
    with open('examples/asr/experimental/configs/garnet_an4.yaml') as file:
        cfg = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        featurizer_config=self.featurizer_config,
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    pre_process_params = {
        'int_values': False, 'frame_splicing': 1, 'features': 64,
        'window_size': 0.02, 'n_fft': 512, 'dither': 1e-05, 'window': 'hann',
        'sample_rate': 16000, 'normalize': 'per_feature', 'window_stride': 0.01,
        'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    encoder = nemo_asr.JasperEncoder(
        jasper=cfg['encoder']['jasper'],
        activation=cfg['encoder']['activation'],
        feat_in=cfg['input']['train']['features'],
    )
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['encoder']['jasper'][-1]['filters'],
        out_channels=cfg['decoder']['hidden_size'],
    )
    decoder = nemo.backends.pytorch.common.DecoderRNN(
        voc_size=len(self.labels), bos_id=0, **cfg['decoder']  # fictive
    )
    loss = nemo.backends.pytorch.common.SequenceLoss()

    # DAG
    audio_signal, a_sig_length, transcripts, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = encoder(audio_signal=processed_signal, length=p_length)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
    loss = loss(log_probs=log_probs, targets=transcripts)

    # Train
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss], print_func=lambda x: print(str(x[0].item())))
    # Instantiate an optimizer to perform `train` action
    neural_factory = nemo.core.NeuralModuleFactory(
        backend=nemo.core.Backend.PyTorch, local_rank=None, create_tb_writer=False,
    )
    optimizer = neural_factory.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 10, "lr": 0.0003},
    )

def test_stft_conv_training(self):
    """Integration test that instantiates a small Jasper model and tests training
    with the sample asr data.

    test_stft_conv_training tests the torch_stft path while test_jasper_training
    tests the torch.stft path inside of AudioToMelSpectrogramPreprocessor.

    Training is run for 3 forward and backward steps and asserts that loss after
    3 steps is smaller than the loss at the first step.

    Note: Training is done with batch gradient descent as opposed to stochastic
    gradient descent due to CTC loss
    """
    path = os.path.abspath(os.path.join(os.path.dirname(__file__), "../data/jasper_smaller.yaml"))
    with open(path) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=30)
    pre_process_params = {
        'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512,
        'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000,
        'normalize': 'per_feature', 'window_stride': 0.01, 'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition['AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=p_length)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    loss_list = []
    callback = SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=partial(self.print_and_log_loss, loss_log_list=loss_list),
        step_freq=1)

    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 3, "lr": 0.001},
    )
    self.nf.reset_trainer()

    # Assert that training loss went down
    assert loss_list[-1] < loss_list[0]

def test_clas(self):
    with open('examples/asr/experimental/configs/garnet_an4.yaml') as file:
        cfg = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath, labels=self.labels, batch_size=4,
    )
    pre_process_params = {
        'frame_splicing': 1, 'features': 64, 'window_size': 0.02, 'n_fft': 512,
        'dither': 1e-05, 'window': 'hann', 'sample_rate': 16000,
        'normalize': 'per_feature', 'window_stride': 0.01, 'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(**pre_process_params)
    encoder = nemo_asr.JasperEncoder(
        jasper=cfg['encoder']['jasper'],
        activation=cfg['encoder']['activation'],
        feat_in=cfg['input']['train']['features'],
    )
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['encoder']['jasper'][-1]['filters'],
        out_channels=cfg['decoder']['hidden_size'],
    )
    decoder = nemo.backends.pytorch.common.DecoderRNN(
        voc_size=len(self.labels),
        bos_id=0,
        hidden_size=cfg['decoder']['hidden_size'],
        attention_method=cfg['decoder']['attention_method'],
        attention_type=cfg['decoder']['attention_type'],
        in_dropout=cfg['decoder']['in_dropout'],
        gru_dropout=cfg['decoder']['gru_dropout'],
        attn_dropout=cfg['decoder']['attn_dropout'],
        teacher_forcing=cfg['decoder']['teacher_forcing'],
        curriculum_learning=cfg['decoder']['curriculum_learning'],
        rnn_type=cfg['decoder']['rnn_type'],
        n_layers=cfg['decoder']['n_layers'],
        tie_emb_out_weights=cfg['decoder']['tie_emb_out_weights'],
    )
    loss = nemo.backends.pytorch.common.SequenceLoss()

    # DAG
    audio_signal, a_sig_length, transcripts, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal, length=a_sig_length)
    encoded, encoded_len = encoder(audio_signal=processed_signal, length=p_length)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
    loss = loss(log_probs=log_probs, targets=transcripts)

    # Train
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss], print_func=lambda x: logging.info(str(x[0].item())))
    # Instantiate an optimizer to perform `train` action
    optimizer = self.nf.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 10, "lr": 0.0003},
    )

def create_dags(jasper_params, args, nf):
    vocab = jasper_params['labels']

    # build train and eval model
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=vocab,
        batch_size=args.batch_size,
        **train_dl_params,
    )

    num_samples = len(data_layer)
    steps_per_epoch = math.ceil(num_samples / (args.batch_size * args.iter_per_step * nf.world_size))
    total_steps = steps_per_epoch * args.num_epochs
    logging.info(f"Train samples={num_samples}, num_steps={total_steps}")

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layer_eval = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.eval_datasets,
        labels=vocab,
        batch_size=args.eval_batch_size,
        **eval_dl_params,
    )
    num_samples = len(data_layer_eval)
    logging.info(f"Eval samples={num_samples}")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        num_classes=len(vocab), **jasper_params["JasperDecoderForCTC"])
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # Training model
    audio, audio_len, transcript, transcript_len = data_layer()
    processed, processed_len = data_preprocessor(input_signal=audio, length=audio_len)
    encoded, encoded_len = jasper_encoder(audio_signal=processed, length=processed_len)
    log_probs = jasper_decoder(encoder_output=encoded)
    predictions = greedy_decoder(log_probs=log_probs)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    # Evaluation model
    audio_e, audio_len_e, transcript_e, transcript_len_e = data_layer_eval()
    processed_e, processed_len_e = data_preprocessor(input_signal=audio_e, length=audio_len_e)
    encoded_e, encoded_len_e = jasper_encoder(audio_signal=processed_e, length=processed_len_e)
    log_probs_e = jasper_decoder(encoder_output=encoded_e)
    predictions_e = greedy_decoder(log_probs=log_probs_e)
    loss_e = ctc_loss(
        log_probs=log_probs_e,
        targets=transcript_e,
        input_length=encoded_len_e,
        target_length=transcript_len_e,
    )
    logging.info("Num of params in encoder: {0}".format(jasper_encoder.num_weights))

    # Callbacks to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss, predictions, transcript, transcript_len],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=nf.tb_writer,
    )
    checkpointer_callback = nemo.core.CheckpointCallback(
        folder=nf.checkpoint_dir, step_freq=args.checkpoint_save_freq)
    eval_tensors = [loss_e, predictions_e, transcript_e, transcript_len_e]
    eval_callback = nemo.core.EvaluatorCallback(
        eval_tensors=eval_tensors,
        user_iter_callback=partial(process_evaluation_batch, labels=vocab),
        user_epochs_done_callback=process_evaluation_epoch,
        eval_step=args.eval_freq,
        tb_writer=nf.tb_writer,
    )
    callbacks = [train_callback, checkpointer_callback, eval_callback]
    return (
        loss,
        eval_tensors,
        callbacks,
        total_steps,
        vocab,
        log_probs_e,
        encoded_len_e,
    )

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"],
    )

    if args.model == 'Res8':
        encoder = Res8(args.hidden_size).to('cuda')
        encoder.restore_from('./runs/{0}/checkpoints/Res8-STEP-{1}.pt'.format(args.name, str(args.enc_step)))
        encoder.freeze()
    elif args.model == 'Res15':
        encoder = Res15(args.hidden_size).to('cuda')
        encoder.restore_from('./runs/{0}/checkpoints/Res15-STEP-{1}.pt'.format(args.name, str(args.enc_step)))
        encoder.freeze()
    elif args.model == 'Quartz':
        encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"])
        fc = LinearLayer(64 * 256)  # TODO find shape from jasper_params
        fc.restore_from('./runs/{0}/checkpoints/LinearLayer-STEP-{1}.pt'.format(args.name, str(args.enc_step)))
        encoder.restore_from('./runs/{0}/checkpoints/JasperEncoder-STEP-{1}.pt'.format(args.name, str(args.enc_step)))
        encoder.freeze()

    l2_regularizer = L2Regularizer()

    N = len(train_data_layer)
    steps_per_epoch = math.ceil(N / float(batch_size) + 1)

    # Build train graph
    audio_signal, audio_signal_len, commands, command_len = train_data_layer()

def create_all_dags(args, neural_factory):
    """Builds the evaluation DAG and returns the speaker embedding and
    label tensors."""

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        spkr_params = yaml.load(f)

    sample_rate = spkr_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared
    eval_dl_params = copy.deepcopy(spkr_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(spkr_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    eval_dl_params['shuffle'] = False  # To grab the file names without changing data_layer

    data_layer_test = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.eval_datasets[0],
        labels=None,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **eval_dl_params,
    )

    # create shared modules
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )
    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(**spkr_params["JasperEncoder"],)
    decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'],
        num_classes=254,
        emb_sizes=spkr_params['JasperDecoderForSpkrClass']['emb_sizes'].split(','),
        pool_mode=spkr_params["JasperDecoderForSpkrClass"]['pool_mode'],
    )

    # --- Assemble Validation DAG --- #
    audio_signal_test, audio_len_test, label_test, _ = data_layer_test()
    processed_signal_test, processed_len_test = data_preprocessor(
        input_signal=audio_signal_test, length=audio_len_test)
    encoded_test, _ = encoder(audio_signal=processed_signal_test, length=processed_len_test)
    _, embeddings = decoder(encoder_output=encoded_test)

    return embeddings, label_test

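# A hedged sketch of consuming the eval DAG above: infer() (used the same way in
# the evaluation script later in this section) runs the graph from a checkpoint
# directory and returns one list of per-mini-batch tensors per requested output.
# `args.load_dir` is an assumed argument name for the checkpoint directory.
embeddings, label_test = create_all_dags(args, neural_factory)
evaluated = neural_factory.infer(tensors=[embeddings, label_test], checkpoint_dir=args.load_dir)
emb_batches, label_batches = evaluated  # lists of per-batch torch tensors
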
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)

    labels = jasper_params['labels']  # Vocab of tokens
    sample_rate = jasper_params['sample_rate']

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    train_dl_params = copy.deepcopy(jasper_params["AudioToSpeechLabelDataLayer"])
    train_dl_params.update(jasper_params["AudioToSpeechLabelDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    # Look for augmentations
    audio_augmentor = jasper_params.get('AudioAugmentor', None)

    data_layer = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=args.train_dataset,
        labels=labels,
        sample_rate=sample_rate,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        augmentor=audio_augmentor,
        **train_dl_params,
    )

    crop_pad_augmentation = nemo_asr.CropOrPadSpectrogramAugmentation(audio_length=128)

    N = len(data_layer)
    steps_per_epoch = math.ceil(N / (args.batch_size * args.iter_per_step * args.num_gpus))
    logging.info('Steps per epoch : {0}'.format(steps_per_epoch))
    logging.info('Have {0} examples to train on.'.format(N))

    data_preprocessor = nemo_asr.AudioToMFCCPreprocessor(
        sample_rate=sample_rate, **jasper_params["AudioToMFCCPreprocessor"],
    )

    spectr_augment_config = jasper_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    eval_dl_params = copy.deepcopy(jasper_params["AudioToSpeechLabelDataLayer"])
    eval_dl_params.update(jasper_params["AudioToSpeechLabelDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_datasets in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToSpeechLabelDataLayer(
                manifest_filepath=eval_datasets,
                sample_rate=sample_rate,
                labels=labels,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(**jasper_params["JasperEncoder"],)
    jasper_decoder = nemo_asr.JasperDecoderForClassification(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(labels),
        **jasper_params['JasperDecoderForClassification'],
    )
    ce_loss = nemo_asr.CrossEntropyLossNM()

    logging.info('================================')
    logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    logging.info(
        f"Total number of parameters in model: "
        f"{jasper_decoder.num_weights + jasper_encoder.num_weights}")
    logging.info('================================')

    # --- Assemble Training DAG --- #
    audio_signal, audio_signal_len, commands, command_len = data_layer()

    processed_signal, processed_signal_len = data_preprocessor(
        input_signal=audio_signal, length=audio_signal_len)
    processed_signal, processed_signal_len = crop_pad_augmentation(
        input_signal=processed_signal, length=processed_signal_len)

    if spectr_augment_config:
        processed_signal = data_spectr_augmentation(input_spec=processed_signal)

    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
    decoded = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=decoded, labels=commands)

    # Callbacks needed to print info to console and Tensorboard
    train_callback = nemo.core.SimpleLossLoggerCallback(
        # Notice that we pass in loss, predictions, and the labels (commands).
        # Of course we would like to see our training loss, but we need the
        # other arguments to calculate the accuracy.
        tensors=[loss, decoded, commands],
        # The print_func defines what gets printed.
        print_func=partial(monitor_classification_training_progress, eval_metric=None),
        get_tb_values=lambda x: [("loss", x[0])],
        tb_writer=neural_factory.tb_writer,
    )
    chpt_callback = nemo.core.CheckpointCallback(
        folder=neural_factory.checkpoint_dir,
        load_from_folder=args.load_dir,
        step_freq=args.checkpoint_save_freq,
    )
    callbacks = [train_callback, chpt_callback]

    # --- Assemble Eval DAGs --- #
    for i, eval_dl in enumerate(data_layers_eval):
        test_audio_signal, test_audio_signal_len, test_commands, test_command_len = eval_dl()
        test_processed_signal, test_processed_signal_len = data_preprocessor(
            input_signal=test_audio_signal, length=test_audio_signal_len)
        test_processed_signal, test_processed_signal_len = crop_pad_augmentation(
            input_signal=test_processed_signal, length=test_processed_signal_len)
        test_encoded, test_encoded_len = jasper_encoder(
            audio_signal=test_processed_signal, length=test_processed_signal_len)
        test_decoded = jasper_decoder(encoder_output=test_encoded)
        test_loss = ce_loss(logits=test_decoded, labels=test_commands)

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[test_loss, test_decoded, test_commands],
            user_iter_callback=partial(process_classification_evaluation_batch, top_k=1),
            user_epochs_done_callback=partial(
                process_classification_evaluation_epoch, eval_metric=1, tag=tagname),
            eval_step=args.eval_freq,  # How often we evaluate the model on the test set
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)

    return loss, callbacks, steps_per_epoch

def create_all_dags(args, neural_factory):
    """Creates train and eval DAGs as well as their callbacks.
    Returns the train loss tensor, callbacks, and steps per epoch."""

    # parse the config files
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        quartz_params = yaml.load(f)

    try:
        vocab = quartz_params['labels']
        sample_rate = quartz_params['sample_rate']
    except KeyError:
        logging.error("Please make sure you are using the older config format (the ones with -old suffix)")
        exit(1)

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # create data layer for training
    train_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    train_dl_params.update(quartz_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]

    data_layer_train = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        sample_rate=sample_rate,
        labels=vocab,
        batch_size=args.batch_size,
        num_workers=cpu_per_traindl,
        **train_dl_params,
    )

    N = len(data_layer_train)
    steps_per_epoch = int(N / (args.batch_size * args.iter_per_step * args.num_gpus))

    # create separate data layers for eval
    # we need separate eval dags for separate eval datasets
    # but all other modules in these dags will be shared
    eval_dl_params = copy.deepcopy(quartz_params["AudioToTextDataLayer"])
    eval_dl_params.update(quartz_params["AudioToTextDataLayer"]["eval"])
    del eval_dl_params["train"]
    del eval_dl_params["eval"]

    data_layers_eval = []
    if args.eval_datasets:
        for eval_dataset in args.eval_datasets:
            data_layer_eval = nemo_asr.AudioToTextDataLayer(
                manifest_filepath=eval_dataset,
                sample_rate=sample_rate,
                labels=vocab,
                batch_size=args.eval_batch_size,
                num_workers=cpu_per_traindl,
                **eval_dl_params,
            )
            data_layers_eval.append(data_layer_eval)
    else:
        logging.warning("There were no val datasets passed")

    # create shared modules
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate, **quartz_params["AudioToMelSpectrogramPreprocessor"],
    )
    # (QuartzNet uses the Jasper baseline encoder and decoder)
    encoder = nemo_asr.JasperEncoder(
        feat_in=quartz_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **quartz_params["JasperEncoder"],
    )
    decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=quartz_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab),
    )
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # create augmentation modules (only used for training) if their configs
    # are present
    multiply_batch_config = quartz_params.get('MultiplyBatch', None)
    if multiply_batch_config:
        multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)

    spectr_augment_config = quartz_params.get('SpectrogramAugmentation', None)
    if spectr_augment_config:
        data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(**spectr_augment_config)

    # assemble train DAG
    audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t = data_layer_train()
    processed_signal_t, p_length_t = data_preprocessor(input_signal=audio_signal_t, length=a_sig_length_t)

    if multiply_batch_config:
        processed_signal_t, p_length_t, transcript_t, transcript_len_t = multiply_batch(
            in_x=processed_signal_t, in_x_len=p_length_t, in_y=transcript_t, in_y_len=transcript_len_t,
        )
    if spectr_augment_config:
        processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)

    encoded_t, encoded_len_t = encoder(audio_signal=processed_signal_t, length=p_length_t)
    log_probs_t = decoder(encoder_output=encoded_t)
    predictions_t = greedy_decoder(log_probs=log_probs_t)
    loss_t = ctc_loss(
        log_probs=log_probs_t,
        targets=transcript_t,
        input_length=encoded_len_t,
        target_length=transcript_len_t,
    )

    # create train callbacks
    train_callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
        print_func=partial(monitor_asr_train_progress, labels=vocab),
        get_tb_values=lambda x: [["loss", x[0]]],
        tb_writer=neural_factory.tb_writer,
    )
    callbacks = [train_callback]

    if args.checkpoint_dir or args.load_dir:
        chpt_callback = nemo.core.CheckpointCallback(
            folder=args.checkpoint_dir,
            load_from_folder=args.load_dir,
            step_freq=args.checkpoint_save_freq,
        )
        callbacks.append(chpt_callback)

    # assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        audio_signal_e, a_sig_length_e, transcript_e, transcript_len_e = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = encoder(audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # create corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[loss_e, predictions_e, transcript_e, transcript_len_e],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch, tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)

    return loss_t, callbacks, steps_per_epoch

def main(): parser = argparse.ArgumentParser(description='Jasper') parser.add_argument("--local_rank", default=None, type=int) parser.add_argument("--batch_size", default=32, type=int) parser.add_argument("--model_config", type=str, required=True) parser.add_argument("--eval_datasets", type=str, required=True) parser.add_argument("--load_dir", type=str, required=True) parser.add_argument("--vocab_file", type=str, required=True) parser.add_argument("--save_logprob", default=None, type=str) parser.add_argument("--lm_path", default=None, type=str) parser.add_argument("--beam_width", default=50, type=int) parser.add_argument("--alpha", default=2.0, type=float) parser.add_argument("--beta", default=1.0, type=float) parser.add_argument("--cutoff_prob", default=0.99, type=float) parser.add_argument("--cutoff_top_n", default=40, type=int) args = parser.parse_args() batch_size = args.batch_size load_dir = args.load_dir if args.local_rank is not None: if args.lm_path: raise NotImplementedError( "Beam search decoder with LM does not currently support evaluation on multi-gpu." ) device = nemo.core.DeviceType.AllGpu else: device = nemo.core.DeviceType.GPU # Instantiate Neural Factory with supported backend neural_factory = nemo.core.NeuralModuleFactory( backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=nemo.core.Optimization.mxprO1, placement=device, ) if args.local_rank is not None: logging.info('Doing ALL GPU') yaml = YAML(typ="safe") with open(args.model_config) as f: jasper_params = yaml.load(f) vocab = load_vocab(args.vocab_file) sample_rate = jasper_params['sample_rate'] eval_datasets = args.eval_datasets eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"]) eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"]) eval_dl_params["normalize_transcripts"] = False del eval_dl_params["train"] del eval_dl_params["eval"] data_layer = nemo_asr.AudioToTextDataLayer( manifest_filepath=eval_datasets, sample_rate=sample_rate, labels=vocab, batch_size=batch_size, **eval_dl_params, ) n = len(data_layer) logging.info('Evaluating {0} examples'.format(n)) data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor( sample_rate=sample_rate, **jasper_params["AudioToMelSpectrogramPreprocessor"], ) jasper_encoder = nemo_asr.JasperEncoder( feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"], **jasper_params["JasperEncoder"], ) jasper_decoder = nemo_asr.JasperDecoderForCTC( feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"], num_classes=len(vocab), ) greedy_decoder = nemo_asr.GreedyCTCDecoder() if args.lm_path: beam_width = args.beam_width alpha = args.alpha beta = args.beta cutoff_prob = args.cutoff_prob cutoff_top_n = args.cutoff_top_n beam_search_with_lm = nemo_asr.BeamSearchDecoderWithLM( vocab=vocab, beam_width=beam_width, alpha=alpha, beta=beta, cutoff_prob=cutoff_prob, cutoff_top_n=cutoff_top_n, lm_path=args.lm_path, num_cpus=max(os.cpu_count(), 1), ) logging.info('================================') logging.info( f"Number of parameters in encoder: {jasper_encoder.num_weights}") logging.info( f"Number of parameters in decoder: {jasper_decoder.num_weights}") logging.info(f"Total number of parameters in model: " f"{jasper_decoder.num_weights + jasper_encoder.num_weights}") logging.info('================================') ( audio_signal_e1, a_sig_length_e1, transcript_e1, transcript_len_e1, ) = data_layer() processed_signal_e1, p_length_e1 = data_preprocessor( input_signal=audio_signal_e1, length=a_sig_length_e1) 
    encoded_e1, encoded_len_e1 = jasper_encoder(
        audio_signal=processed_signal_e1, length=p_length_e1)
    log_probs_e1 = jasper_decoder(encoder_output=encoded_e1)
    predictions_e1 = greedy_decoder(log_probs=log_probs_e1)

    eval_tensors = [
        log_probs_e1,
        predictions_e1,
        transcript_e1,
        transcript_len_e1,
        encoded_len_e1,
    ]

    if args.lm_path:
        beam_predictions_e1 = beam_search_with_lm(
            log_probs=log_probs_e1, log_probs_length=encoded_len_e1)
        eval_tensors.append(beam_predictions_e1)

    evaluated_tensors = neural_factory.infer(
        tensors=eval_tensors,
        checkpoint_dir=load_dir,
    )

    greedy_hypotheses = post_process_predictions(evaluated_tensors[1], vocab)
    references = post_process_transcripts(evaluated_tensors[2],
                                          evaluated_tensors[3], vocab)
    cer = word_error_rate(hypotheses=greedy_hypotheses,
                          references=references,
                          use_cer=True)
    logging.info("Greedy CER {:.2f}%".format(cer * 100))

    if args.lm_path:
        beam_hypotheses = []
        # Over mini-batch
        for i in evaluated_tensors[-1]:
            # Over samples
            for j in i:
                beam_hypotheses.append(j[0][1])
        cer = word_error_rate(hypotheses=beam_hypotheses,
                              references=references,
                              use_cer=True)
        logging.info("Beam CER {:.2f}%".format(cer * 100))

    if args.save_logprob:
        # Convert logits to list of numpy arrays
        logprob = []
        for i, batch in enumerate(evaluated_tensors[0]):
            for j in range(batch.shape[0]):
                logprob.append(
                    batch[j][:evaluated_tensors[4][i][j], :].cpu().numpy())
        with open(args.save_logprob, 'wb') as f:
            pickle.dump(logprob, f, protocol=pickle.HIGHEST_PROTOCOL)
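# Example invocation (the script name and all paths are placeholders; the
# flags match the argparse definitions above):
#
#   python jasper_infer.py \
#       --model_config=quartznet15x5.yaml \
#       --eval_datasets=dev_manifest.json \
#       --load_dir=checkpoints/ \
#       --vocab_file=vocab.txt \
#       --batch_size=32 \
#       --lm_path=lm.binary --beam_width=50 --alpha=2.0 --beta=1.0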
def create_dag(args, cfg, logger, num_gpus):
    # Defining nodes
    data = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=args.train_dataset,
        labels=cfg['target']['labels'],
        batch_size=cfg['optimization']['batch_size'],
        eos_id=cfg['target']['eos_id'],
        **cfg['AudioToTextDataLayer']['train'],
    )
    data_evals = []
    if args.eval_datasets:
        for val_path in args.eval_datasets:
            data_evals.append(
                nemo_asr.AudioToTextDataLayer(
                    manifest_filepath=val_path,
                    labels=cfg['target']['labels'],
                    batch_size=cfg['inference']['batch_size'],
                    eos_id=cfg['target']['eos_id'],
                    **cfg['AudioToTextDataLayer']['eval'],
                ))
    else:
        logger.info("There were no val datasets passed")
    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **cfg['AudioToMelSpectrogramPreprocessor'])
    data_augmentation = nemo_asr.SpectrogramAugmentation(
        **cfg['SpectrogramAugmentation'])
    encoder = nemo_asr.JasperEncoder(
        feat_in=cfg["AudioToMelSpectrogramPreprocessor"]["features"],
        **cfg['JasperEncoder'],
    )
    if args.encoder_checkpoint is not None and os.path.exists(
            args.encoder_checkpoint):
        if cfg['JasperEncoder']['load']:
            encoder.restore_from(args.encoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for encoder'
                        f' from {args.encoder_checkpoint}')
        if cfg['JasperEncoder']['freeze']:
            encoder.freeze()
            logger.info('Freeze encoder weights')
    connector = nemo_asr.JasperRNNConnector(
        in_channels=cfg['JasperEncoder']['jasper'][-1]['filters'],
        out_channels=cfg['DecoderRNN']['hidden_size'],
    )
    decoder = nemo.backends.pytorch.DecoderRNN(
        voc_size=len(cfg['target']['labels']),
        bos_id=cfg['target']['bos_id'],
        **cfg['DecoderRNN'],
    )
    if args.decoder_checkpoint is not None and os.path.exists(
            args.decoder_checkpoint):
        if cfg['DecoderRNN']['load']:
            decoder.restore_from(args.decoder_checkpoint, args.local_rank)
            logger.info(f'Loaded weights for decoder'
                        f' from {args.decoder_checkpoint}')
        if cfg['DecoderRNN']['freeze']:
            decoder.freeze()
            logger.info('Freeze decoder weights')
            if cfg['decoder']['unfreeze_attn']:
                for name, param in decoder.attention.named_parameters():
                    param.requires_grad = True
                logger.info('Unfreeze decoder attn weights')

    num_data = len(data)
    batch_size = cfg['optimization']['batch_size']
    num_epochs = cfg['optimization']['params']['num_epochs']
    steps_per_epoch = int(num_data / (batch_size * num_gpus))
    total_steps = num_epochs * steps_per_epoch

    vsc = ValueSetterCallback
    tf_callback = ValueSetterCallback(
        decoder,
        'teacher_forcing',
        policies=[vsc.Policy(vsc.Method.Const(1.0), start=0.0, end=1.0)],
        total_steps=total_steps,
    )
    seq_loss = nemo.backends.pytorch.SequenceLoss(
        pad_id=cfg['target']['pad_id'],
        smoothing_coef=cfg['optimization']['smoothing_coef'],
        sample_wise=cfg['optimization']['sample_wise'],
    )
    se_callback = ValueSetterCallback(
        seq_loss,
        'smoothing_coef',
        policies=[
            vsc.Policy(vsc.Method.Const(seq_loss.smoothing_coef),
                       start=0.0,
                       end=1.0),
        ],
        total_steps=total_steps,
    )
    beam_search = nemo.backends.pytorch.BeamSearch(
        decoder=decoder,
        pad_id=cfg['target']['pad_id'],
        bos_id=cfg['target']['bos_id'],
        eos_id=cfg['target']['eos_id'],
        max_len=cfg['target']['max_len'],
        beam_size=cfg['inference']['beam_size'],
    )
    uf_callback = UnfreezeCallback(
        [encoder, decoder],
        start_epoch=cfg['optimization']['start_unfreeze'])
    saver_callback = nemo.core.ModuleSaverCallback(
        save_modules_list=[encoder, connector, decoder],
        folder=args.checkpoint_dir,
        step_freq=args.eval_freq,
    )

    # Creating DAG
    audios, audio_lens, transcripts, _ = data()
    processed_audios, processed_audio_lens = data_preprocessor(
        input_signal=audios, length=audio_lens)
    augmented_spec = data_augmentation(input_spec=processed_audios)
    encoded, _ = encoder(audio_signal=augmented_spec,
                         length=processed_audio_lens)
    encoded = connector(tensor=encoded)
    log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
    train_loss = seq_loss(log_probs=log_probs, targets=transcripts)

    evals = []
    for i, data_eval in enumerate(data_evals):
        audios, audio_lens, transcripts, _ = data_eval()
        processed_audios, processed_audio_lens = data_preprocessor(
            input_signal=audios, length=audio_lens)
        encoded, _ = encoder(audio_signal=processed_audios,
                             length=processed_audio_lens)
        encoded = connector(tensor=encoded)
        log_probs, _ = decoder(targets=transcripts, encoder_outputs=encoded)
        loss = seq_loss(log_probs=log_probs, targets=transcripts)
        predictions, aw = beam_search(encoder_outputs=encoded)
        evals.append((
            args.eval_datasets[i],
            (loss, log_probs, transcripts, predictions, aw),
        ))

    # Update config
    cfg['num_params'] = {
        'encoder': encoder.num_weights,
        'connector': connector.num_weights,
        'decoder': decoder.num_weights,
    }
    cfg['num_params']['total'] = sum(cfg['num_params'].values())
    cfg['input']['train'] = {'num_data': num_data}
    cfg['optimization']['steps_per_epoch'] = steps_per_epoch
    cfg['optimization']['total_steps'] = total_steps

    return (
        (train_loss, evals),
        cfg,
        [tf_callback, se_callback, uf_callback, saver_callback],
    )
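# A hedged sketch of how create_dag's return tuple might be consumed; the
# unpacking mirrors the return statement above, but `neural_factory` and the
# optimizer choice are assumptions, not values taken from this script.
(train_loss, evals), cfg, callbacks = create_dag(args, cfg, logger, num_gpus=1)
neural_factory.train(
    [train_loss],
    callbacks=callbacks,
    optimizer="sgd",
    optimization_params=cfg['optimization']['params'],
)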
def main(
    config_file,
    nn_encoder,
    nn_decoder,
    nn_onnx_encoder,
    nn_onnx_decoder,
    pre_v09_model=False,
    batch_size=1,
    time_steps=256,
    decoder_type='ctc',
):
    yaml = YAML(typ="safe")

    logging.info("Loading config file...")
    with open(config_file) as f:
        jasper_model_definition = yaml.load(f)

    logging.info("Determining model shape...")
    num_encoder_input_features = 64
    decoder_params = jasper_model_definition['init_params'][
        'decoder_params']['init_params']
    num_decoder_input_features = decoder_params['feat_in']
    logging.info(" Num encoder input features: {}".format(
        num_encoder_input_features))
    logging.info(" Num decoder input features: {}".format(
        num_decoder_input_features))

    nf = nemo.core.NeuralModuleFactory(create_tb_writer=False)

    logging.info("Initializing models...")
    jasper_encoder = nemo_asr.JasperEncoder(
        **jasper_model_definition['init_params']['encoder_params']
        ['init_params'])
    if decoder_type == 'ctc':
        jasper_decoder = nemo_asr.JasperDecoderForCTC(
            feat_in=num_decoder_input_features,
            num_classes=decoder_params['num_classes'],
            vocabulary=decoder_params['vocabulary'],
        )
    elif decoder_type == 'classification':
        if 'labels' in jasper_model_definition:
            num_classes = len(jasper_model_definition['labels'])
        else:
            raise ValueError(
                "List of class labels must be defined in model config file "
                "with key 'labels'")
        jasper_decoder = nemo_asr.JasperDecoderForClassification(
            feat_in=num_decoder_input_features, num_classes=num_classes)
    else:
        raise ValueError(
            "`decoder_type` must be one of ['ctc', 'classification']")

    # This is necessary if you are using checkpoints trained with NeMo
    # version before 0.9
    logging.info("Loading checkpoints...")
    if pre_v09_model:
        logging.info(" Converting pre v0.9 checkpoint...")
        ckpt = torch.load(nn_encoder)
        new_ckpt = {}
        for k, v in ckpt.items():
            new_k = k.replace('.conv.', '.mconv.')
            if len(v.shape) == 3:
                new_k = new_k.replace('.weight', '.conv.weight')
            new_ckpt[new_k] = v
        jasper_encoder.load_state_dict(new_ckpt)
    else:
        jasper_encoder.restore_from(nn_encoder)
    jasper_decoder.restore_from(nn_decoder)

    # Create export directories if they don't already exist
    base_export_dir, export_fn = os.path.split(nn_onnx_encoder)
    if base_export_dir and not os.path.exists(base_export_dir):
        os.makedirs(base_export_dir)
    base_export_dir, export_fn = os.path.split(nn_onnx_decoder)
    if base_export_dir and not os.path.exists(base_export_dir):
        os.makedirs(base_export_dir)

    logging.info("Exporting encoder...")
    nf.deployment_export(
        jasper_encoder,
        nn_onnx_encoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        torch.zeros(
            batch_size,
            num_encoder_input_features,
            time_steps,
            dtype=torch.float,
            device="cuda:0",
        ),
    )
    del jasper_encoder

    logging.info("Exporting decoder...")
    nf.deployment_export(
        jasper_decoder,
        nn_onnx_decoder,
        nemo.core.neural_factory.DeploymentFormat.ONNX,
        torch.zeros(
            batch_size,
            num_decoder_input_features,
            time_steps // 2,
            dtype=torch.float,
            device="cuda:0",
        ),
    )
    del jasper_decoder

    logging.info("Export completed successfully.")
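# A hedged verification sketch (onnxruntime is not used by this script): load
# the exported encoder and feed a dummy batch with the same shape used for
# export above. The file name and shape values are placeholders.
import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("jasper_encoder.onnx")
dummy = np.zeros((1, 64, 256), dtype=np.float32)  # (batch, features, time_steps)
input_name = sess.get_inputs()[0].name
encoder_out = sess.run(None, {input_name: dummy})
print([o.shape for o in encoder_out])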
def test_stft_conv(self):
    with open(
            os.path.abspath(
                os.path.join(os.path.dirname(__file__),
                             "../data/jasper_smaller.yaml"))) as file:
        jasper_model_definition = self.yaml.load(file)
    dl = nemo_asr.AudioToTextDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=self.labels,
        batch_size=4,
    )
    pre_process_params = {
        'frame_splicing': 1,
        'features': 64,
        'window_size': 0.02,
        'n_fft': 512,
        'dither': 1e-05,
        'window': 'hann',
        'sample_rate': 16000,
        'normalize': 'per_feature',
        'window_stride': 0.01,
        'stft_conv': True,
    }
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        **pre_process_params)
    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_model_definition[
            'AudioToMelSpectrogramPreprocessor']['features'],
        **jasper_model_definition['JasperEncoder'],
    )
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=1024, num_classes=len(self.labels))
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(self.labels))

    # DAG
    audio_signal, a_sig_length, transcript, transcript_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                               length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=p_length)
    # logging.info(jasper_encoder)
    log_probs = jasper_decoder(encoder_output=encoded)
    loss = ctc_loss(
        log_probs=log_probs,
        targets=transcript,
        input_length=encoded_len,
        target_length=transcript_len,
    )

    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=lambda x: logging.info(str(x[0].item())))
    # Instantiate an optimizer to perform `train` action
    optimizer = self.nf.get_trainer()
    optimizer.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"num_epochs": 10, "lr": 0.0003},
    )
def create_all_dags(args, neural_factory):
    yaml = YAML(typ="safe")
    with open(args.model_config) as f:
        jasper_params = yaml.load(f)
    vocab = jasper_params["labels"]
    sample_rate = jasper_params["sample_rate"]

    # Calculate num_workers for dataloader
    total_cpus = os.cpu_count()
    cpu_per_traindl = max(int(total_cpus / neural_factory.world_size), 1)

    # perturb_config = jasper_params.get('perturb', None)
    train_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    train_dl_params.update(jasper_params["AudioToTextDataLayer"]["train"])
    del train_dl_params["train"]
    del train_dl_params["eval"]
    # del train_dl_params["normalize_transcripts"]

    if args.dataset:
        d_path = Path(args.dataset)
        if not args.train_dataset:
            args.train_dataset = str(d_path / Path("train_manifest.json"))
        if not args.eval_datasets:
            args.eval_datasets = [str(d_path / Path("test_manifest.json"))]

    data_loader_layer = nemo_asr.AudioToTextDataLayer
    if args.remote_data:
        train_dl_params["rpyc_host"] = args.remote_data
        data_loader_layer = RpycAudioToTextDataLayer

    # data_layer = data_loader_layer(
    #     manifest_filepath=args.train_dataset,
    #     sample_rate=sample_rate,
    #     labels=vocab,
    #     batch_size=args.batch_size,
    #     num_workers=cpu_per_traindl,
    #     **train_dl_params,
    #     # normalize_transcripts=False
    # )
    #
    # N = len(data_layer)
    # steps_per_epoch = math.ceil(
    #     N / (args.batch_size * args.iter_per_step * args.num_gpus)
    # )
    # logging.info("Have {0} examples to train on.".format(N))

    data_preprocessor = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **jasper_params["AudioToMelSpectrogramPreprocessor"])

    # multiply_batch_config = jasper_params.get("MultiplyBatch", None)
    # if multiply_batch_config:
    #     multiply_batch = nemo_asr.MultiplyBatch(**multiply_batch_config)
    #
    # spectr_augment_config = jasper_params.get("SpectrogramAugmentation", None)
    # if spectr_augment_config:
    #     data_spectr_augmentation = nemo_asr.SpectrogramAugmentation(
    #         **spectr_augment_config
    #     )

    eval_dl_params = copy.deepcopy(jasper_params["AudioToTextDataLayer"])
    eval_dl_params.update(jasper_params["AudioToTextDataLayer"]["eval"])
    if args.remote_data:
        eval_dl_params["rpyc_host"] = args.remote_data
    del eval_dl_params["train"]
    del eval_dl_params["eval"]
    data_layers_eval = []

    # if args.eval_datasets:
    for eval_datasets in args.eval_datasets:
        data_layer_eval = data_loader_layer(
            manifest_filepath=eval_datasets,
            sample_rate=sample_rate,
            labels=vocab,
            batch_size=args.eval_batch_size,
            num_workers=cpu_per_traindl,
            **eval_dl_params,
        )
        data_layers_eval.append(data_layer_eval)
    # else:
    #     logging.warning("There were no val datasets passed")

    jasper_encoder = nemo_asr.JasperEncoder(
        feat_in=jasper_params["AudioToMelSpectrogramPreprocessor"]["features"],
        **jasper_params["JasperEncoder"],
    )
    jasper_encoder.restore_from(args.encoder_checkpoint, local_rank=0)
    jasper_decoder = nemo_asr.JasperDecoderForCTC(
        feat_in=jasper_params["JasperEncoder"]["jasper"][-1]["filters"],
        num_classes=len(vocab),
    )
    jasper_decoder.restore_from(args.decoder_checkpoint, local_rank=0)
    ctc_loss = nemo_asr.CTCLossNM(num_classes=len(vocab))
    greedy_decoder = nemo_asr.GreedyCTCDecoder()

    # logging.info("================================")
    # logging.info(f"Number of parameters in encoder: {jasper_encoder.num_weights}")
    # logging.info(f"Number of parameters in decoder: {jasper_decoder.num_weights}")
    # logging.info(
    #     f"Total number of parameters in model: "
    #     f"{jasper_decoder.num_weights + jasper_encoder.num_weights}"
    # )
    # logging.info("================================")
    #
    # # Train DAG
    # (audio_signal_t, a_sig_length_t, transcript_t, transcript_len_t) = data_layer()
    # processed_signal_t, p_length_t = data_preprocessor(
    #     input_signal=audio_signal_t, length=a_sig_length_t
    # )
    #
    # if multiply_batch_config:
    #     (
    #         processed_signal_t,
    #         p_length_t,
    #         transcript_t,
    #         transcript_len_t,
    #     ) = multiply_batch(
    #         in_x=processed_signal_t,
    #         in_x_len=p_length_t,
    #         in_y=transcript_t,
    #         in_y_len=transcript_len_t,
    #     )
    #
    # if spectr_augment_config:
    #     processed_signal_t = data_spectr_augmentation(input_spec=processed_signal_t)
    #
    # encoded_t, encoded_len_t = jasper_encoder(
    #     audio_signal=processed_signal_t, length=p_length_t
    # )
    # log_probs_t = jasper_decoder(encoder_output=encoded_t)
    # predictions_t = greedy_decoder(log_probs=log_probs_t)
    # loss_t = ctc_loss(
    #     log_probs=log_probs_t,
    #     targets=transcript_t,
    #     input_length=encoded_len_t,
    #     target_length=transcript_len_t,
    # )
    #
    # # Callbacks needed to print info to console and Tensorboard
    # train_callback = nemo.core.SimpleLossLoggerCallback(
    #     tensors=[loss_t, predictions_t, transcript_t, transcript_len_t],
    #     print_func=partial(monitor_asr_train_progress, labels=vocab),
    #     get_tb_values=lambda x: [("loss", x[0])],
    #     tb_writer=neural_factory.tb_writer,
    # )
    #
    # chpt_callback = nemo.core.CheckpointCallback(
    #     folder=neural_factory.checkpoint_dir,
    #     load_from_folder=args.load_dir,
    #     step_freq=args.checkpoint_save_freq,
    #     checkpoints_to_keep=30,
    # )
    #
    # callbacks = [train_callback, chpt_callback]
    callbacks = []

    # Assemble eval DAGs
    for i, eval_dl in enumerate(data_layers_eval):
        (audio_signal_e, a_sig_length_e, transcript_e,
         transcript_len_e) = eval_dl()
        processed_signal_e, p_length_e = data_preprocessor(
            input_signal=audio_signal_e, length=a_sig_length_e)
        encoded_e, encoded_len_e = jasper_encoder(
            audio_signal=processed_signal_e, length=p_length_e)
        log_probs_e = jasper_decoder(encoder_output=encoded_e)
        predictions_e = greedy_decoder(log_probs=log_probs_e)
        loss_e = ctc_loss(
            log_probs=log_probs_e,
            targets=transcript_e,
            input_length=encoded_len_e,
            target_length=transcript_len_e,
        )

        # Create the corresponding eval callback
        tagname = os.path.basename(args.eval_datasets[i]).split(".")[0]
        eval_callback = nemo.core.EvaluatorCallback(
            eval_tensors=[
                loss_e, predictions_e, transcript_e, transcript_len_e
            ],
            user_iter_callback=partial(process_evaluation_batch, labels=vocab),
            user_epochs_done_callback=partial(process_evaluation_epoch,
                                              tag=tagname),
            eval_step=args.eval_freq,
            tb_writer=neural_factory.tb_writer,
        )
        callbacks.append(eval_callback)
    return callbacks
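# A minimal sketch of running the assembled eval-only DAGs, assuming the
# NeuralModuleFactory.eval entry point; the argument values are placeholders.
callbacks = create_all_dags(args, neural_factory)
neural_factory.eval(callbacks=callbacks)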
def test_quartznet_speaker_reco_training(self):
    """Integration test that instantiates a small QuartzNet model for
    speaker recognition and tests training with the sample an4 data.

    Training is run for 3 forward and backward steps and asserts that loss
    after 3 steps is smaller than the loss at the first step.
    """
    with open(
            os.path.abspath(
                os.path.join(os.path.dirname(__file__),
                             "../data/quartznet_spkr_test.yaml"))) as file:
        spkr_params = self.yaml.load(file)
    dl = nemo_asr.AudioToSpeechLabelDataLayer(
        manifest_filepath=self.manifest_filepath,
        labels=None,
        batch_size=10,
    )
    sample_rate = 16000
    preprocessing = nemo_asr.AudioToMelSpectrogramPreprocessor(
        sample_rate=sample_rate,
        **spkr_params["AudioToMelSpectrogramPreprocessor"],
    )
    jasper_encoder = nemo_asr.JasperEncoder(**spkr_params['JasperEncoder'])
    jasper_decoder = nemo_asr.JasperDecoderForSpkrClass(
        feat_in=spkr_params['JasperEncoder']['jasper'][-1]['filters'],
        num_classes=dl.num_classes,
        pool_mode=spkr_params['JasperDecoderForSpkrClass']['pool_mode'],
        emb_sizes=spkr_params["JasperDecoderForSpkrClass"]
        ["emb_sizes"].split(","),
    )
    ce_loss = nemo_asr.CrossEntropyLossNM()

    # DAG
    audio_signal, a_sig_length, targets, targets_len = dl()
    processed_signal, p_length = preprocessing(input_signal=audio_signal,
                                               length=a_sig_length)
    encoded, encoded_len = jasper_encoder(audio_signal=processed_signal,
                                          length=p_length)
    # logging.info(jasper_encoder)
    log_probs, _ = jasper_decoder(encoder_output=encoded)
    loss = ce_loss(logits=log_probs, labels=targets)

    loss_list = []
    callback = nemo.core.SimpleLossLoggerCallback(
        tensors=[loss],
        print_func=partial(self.print_and_log_loss,
                           loss_log_list=loss_list),
        step_freq=1)

    self.nf.random_seed = 42
    self.nf.train(
        [loss],
        callbacks=[callback],
        optimizer="sgd",
        optimization_params={"max_steps": 4, "lr": 0.002},
    )
    self.nf.reset_trainer()

    # Assert that training loss went down
    assert loss_list[-1] < loss_list[0]
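# The test above assumes a print_and_log_loss helper on the test class, which
# is not shown here. A hedged sketch, consistent with how
# SimpleLossLoggerCallback invokes print_func with the step's evaluated
# tensors:
def print_and_log_loss(self, tensors, loss_log_list=None):
    # tensors[0] holds the loss value produced at the logged step
    loss_value = tensors[0].item()
    logging.info("Loss: %f", loss_value)
    if loss_log_list is not None:
        loss_log_list.append(loss_value)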