# Shared imports for the snippets below.
import copy
import os
import shutil
import tempfile

import pytest
import pytorch_lightning as pl
from omegaconf import DictConfig, OmegaConf

from nemo.collections.asr.models import EncDecCTCModelBPE
from nemo.collections.asr.parts.utils.streaming_utils import FeatureFrameBufferer  # path per NeMo 1.x
from nemo.collections.common import tokenizers
from nemo.utils import logging
from nemo.utils.exp_manager import exp_manager


def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        # Run the test set on a fresh single-GPU (or CPU) trainer
        gpu = 1 if cfg.trainer.gpus != 0 else 0
        trainer = pl.Trainer(gpus=gpu)
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        if asr_model.prepare_test(trainer):
            trainer.test(asr_model)
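# Hedged sketch of the "init from pretrained" hook used above: per NeMo's ModelPT,
# maybe_init_from_pretrained_checkpoint() looks for optional top-level config keys such as
# 'init_from_pretrained_model' or 'init_from_nemo_model'. The checkpoint name below is an
# assumption, purely illustrative; cfg and asr_model are as inside main() above.
from omegaconf import open_dict

with open_dict(cfg):
    cfg.init_from_pretrained_model = "stt_en_citrinet_256"  # assumed checkpoint name
asr_model.maybe_init_from_pretrained_checkpoint(cfg)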
def test_constructor(self, asr_model):
    asr_model.train()
    # TODO: make proper config and assert correct number of weights
    # Check to/from config_dict:
    confdict = asr_model.to_config_dict()
    instance2 = EncDecCTCModelBPE.from_config_dict(confdict)
    assert isinstance(instance2, EncDecCTCModelBPE)
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')
    print(OmegaConf.to_yaml(cfg))

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        gpu = 1 if cfg.trainer.gpus != 0 else 0
        test_trainer = pl.Trainer(
            gpus=gpu,
            precision=trainer.precision,
            amp_level=trainer.amp_level,
            amp_backend=trainer.amp_backend,
        )
        if asr_model.prepare_test(test_trainer):
            # Test with the dedicated single-GPU trainer, not the training trainer
            test_trainer.test(asr_model)
def test_save_restore_artifact(self, asr_model):
    with tempfile.TemporaryDirectory() as tmpdir:
        save_path = os.path.join(tmpdir, 'ctc_bpe.nemo')
        asr_model.train()
        asr_model.save_to(save_path)

        new_model = EncDecCTCModelBPE.restore_from(save_path)
        assert isinstance(new_model, type(asr_model))
        assert new_model.vocab_path == 'vocab.txt'
        assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128
def test_save_restore_artifact(self, asr_model):
    asr_model.train()
    asr_model.save_to('./ctc_bpe.nemo')

    new_model = EncDecCTCModelBPE.restore_from('./ctc_bpe.nemo')
    assert isinstance(new_model, type(asr_model))
    assert new_model.vocab_path == 'vocab.txt'
    assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128

    if os.path.exists('./ctc_bpe.nemo'):
        os.remove('./ctc_bpe.nemo')
def main(cfg):
    logging.info(f'Hydra config: {OmegaConf.to_yaml(cfg)}')

    trainer = pl.Trainer(**cfg.trainer)
    exp_manager(trainer, cfg.get("exp_manager", None))
    asr_model = EncDecCTCModelBPE(cfg=cfg.model, trainer=trainer)

    # Initialize the weights of the model from another model, if provided via config
    asr_model.maybe_init_from_pretrained_checkpoint(cfg)

    trainer.fit(asr_model)

    if hasattr(cfg.model, 'test_ds') and cfg.model.test_ds.manifest_filepath is not None:
        gpu = 1 if cfg.trainer.gpus != 0 else 0
        test_trainer = pl.Trainer(
            gpus=gpu,
            precision=trainer.precision,
            amp_level=trainer.accelerator_connector.amp_level,
            amp_backend=cfg.trainer.get("amp_backend", "native"),
        )
        if asr_model.prepare_test(test_trainer):
            test_trainer.test(asr_model)
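# Hedged wiring sketch: NeMo example scripts typically attach a main() like the ones above
# to Hydra via the hydra_runner decorator. The config_path/config_name values below are
# assumptions for illustration, not taken from this section.
from nemo.core.config import hydra_runner


@hydra_runner(config_path="../conf/citrinet", config_name="config_bpe")  # assumed values
def main(cfg):
    ...  # body as in the function above


if __name__ == '__main__':
    main()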
@pytest.fixture()
def asr_model(test_data_dir):
    preprocessor = {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor'}

    # Minimal single-block encoder, just enough to construct a working model
    encoder = {
        '_target_': 'nemo.collections.asr.modules.ConvASREncoder',
        'feat_in': 64,
        'activation': 'relu',
        'conv_mask': True,
        'jasper': [
            {
                'filters': 1024,
                'repeat': 1,
                'kernel': [1],
                'stride': [1],
                'dilation': [1],
                'dropout': 0.0,
                'residual': False,
                'separable': True,
                'se': True,
                'se_context_size': -1,
            }
        ],
    }

    decoder = {
        '_target_': 'nemo.collections.asr.modules.ConvASRDecoder',
        'feat_in': 1024,
        'num_classes': -1,
        'vocabulary': None,
    }

    tokenizer = {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), 'type': 'wpe'}

    modelConfig = DictConfig(
        {
            'preprocessor': DictConfig(preprocessor),
            'encoder': DictConfig(encoder),
            'decoder': DictConfig(decoder),
            'tokenizer': DictConfig(tokenizer),
        }
    )

    model_instance = EncDecCTCModelBPE(cfg=modelConfig)
    return model_instance
class FrameBatchASR:  # class name is an assumption; only __init__ was provided
    def __init__(
        self, asr_model, frame_len=1.6, total_buffer=4.0, batch_size=4,
    ):
        '''
        Args:
            asr_model: NeMo ASR model to run buffered inference with
            frame_len: frame's duration, seconds
            total_buffer: total duration of the audio buffer (frame plus left/right context), seconds
            batch_size: number of buffered frames to decode per forward pass
        '''
        self.frame_bufferer = FeatureFrameBufferer(
            asr_model=asr_model, frame_len=frame_len, batch_size=batch_size, total_buffer=total_buffer
        )

        self.asr_model = asr_model
        self.batch_size = batch_size

        self.all_logits = []
        self.all_preds = []
        self.unmerged = []

        # CTC models expose the vocabulary on the decoder, RNNT models on the joint network
        if hasattr(asr_model.decoder, "vocabulary"):
            self.blank_id = len(asr_model.decoder.vocabulary)
        else:
            self.blank_id = len(asr_model.joint.vocabulary)
        self.tokenizer = asr_model.tokenizer
        self.toks_unmerged = []
        self.frame_buffers = []
        self.reset()

        cfg = copy.deepcopy(asr_model._cfg)
        self.frame_len = frame_len
        OmegaConf.set_struct(cfg.preprocessor, False)

        # some changes for streaming scenario
        cfg.preprocessor.dither = 0.0
        cfg.preprocessor.pad_to = 0
        cfg.preprocessor.normalize = "None"
        self.raw_preprocessor = EncDecCTCModelBPE.from_config_dict(cfg.preprocessor)
        self.raw_preprocessor.to(asr_model.device)
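# A hedged usage sketch for the buffered-inference wrapper above, assuming the class is
# named FrameBatchASR (with its full implementation, including the reset() and decoding
# helpers from NeMo's streaming_utils) and that a pretrained checkpoint such as
# "stt_en_citrinet_256" is available; both are assumptions here.
import nemo.collections.asr as nemo_asr

pretrained = nemo_asr.models.EncDecCTCModelBPE.from_pretrained("stt_en_citrinet_256")
pretrained.eval()

# 4 s of audio buffered per step, advancing 1.6 s at a time
frame_asr = FrameBatchASR(asr_model=pretrained, frame_len=1.6, total_buffer=4.0, batch_size=4)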
def test_save_restore_artifact_spe(self, asr_model, test_data_dir):
    with tempfile.TemporaryDirectory() as tmpdir:
        tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_spe_128")
        asr_model.change_vocabulary(new_tokenizer_dir=tokenizer_dir, new_tokenizer_type='bpe')

        save_path = os.path.join(tmpdir, 'ctc_bpe.nemo')
        asr_model.train()
        asr_model.save_to(save_path)

        new_model = EncDecCTCModelBPE.restore_from(save_path)
        assert isinstance(new_model, type(asr_model))
        assert isinstance(new_model.tokenizer, tokenizers.SentencePieceTokenizer)
        assert new_model.model_path.endswith('_tokenizer.model')
        assert new_model.vocab_path.endswith('_vocab.txt')
        assert new_model.spe_vocab_path.endswith('_tokenizer.vocab')
        assert new_model.tokenizer.tokenizer.vocab_size == 128
        assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128
def test_save_restore_artifact_agg(self, asr_model, test_data_dir):
    tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_spe_128")
    tok_en = {"dir": tokenizer_dir, "type": "wpe"}
    # the below is really an english tokenizer but we pretend it is spanish
    tok_es = {"dir": tokenizer_dir, "type": "wpe"}
    tcfg = DictConfig({"type": "agg", "langs": {"en": tok_en, "es": tok_es}})
    with tempfile.TemporaryDirectory() as tmpdir:
        asr_model.change_vocabulary(new_tokenizer_dir=tcfg, new_tokenizer_type="agg")

        save_path = os.path.join(tmpdir, "ctc_agg.nemo")
        asr_model.train()
        asr_model.save_to(save_path)

        new_model = EncDecCTCModelBPE.restore_from(save_path)
        assert isinstance(new_model, type(asr_model))
        assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)

        # should be double
        assert new_model.tokenizer.tokenizer.vocab_size == 254
        assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254
def test_vocab_change(self, test_data_dir, asr_model):
    old_vocab = copy.deepcopy(asr_model.decoder.vocabulary)

    with tempfile.TemporaryDirectory() as save_dir:
        save_path = os.path.join(save_dir, 'temp.nemo')

        with tempfile.TemporaryDirectory() as tmpdir:
            old_tmpdir_path = tmpdir

            old_tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128", 'vocab.txt')
            new_tokenizer_dir = os.path.join(tmpdir, 'tokenizer')

            os.makedirs(new_tokenizer_dir, exist_ok=True)
            shutil.copy2(old_tokenizer_dir, new_tokenizer_dir)

            nw1 = asr_model.num_weights
            asr_model.change_vocabulary(new_tokenizer_dir=new_tokenizer_dir, new_tokenizer_type='wpe')
            # No change
            assert nw1 == asr_model.num_weights

            with open(os.path.join(new_tokenizer_dir, 'vocab.txt'), 'a+') as f:
                f.write("!\n")
                f.write('$\n')
                f.write('@\n')

            asr_model.change_vocabulary(new_tokenizer_dir=new_tokenizer_dir, new_tokenizer_type='wpe')
            # fully connected + bias
            assert asr_model.num_weights == nw1 + 3 * (asr_model.decoder._feat_in + 1)

            new_vocab = copy.deepcopy(asr_model.decoder.vocabulary)
            assert len(old_vocab) != len(new_vocab)

            # save the model (after change of vocabulary)
            asr_model.save_to(save_path)
            assert os.path.exists(save_path)
            # delete copied version of the vocabulary from nested tmpdir (by scope)

        # assert copied vocab no longer exists
        assert not os.path.exists(os.path.join(old_tmpdir_path, 'tokenizer', 'vocab.txt'))

        # make a copy of the tokenizer before renaming
        try:
            os.rename(old_tokenizer_dir, old_tokenizer_dir + '.bkp')
            assert not os.path.exists(old_tokenizer_dir)

            # restore model from .nemo
            asr_model2 = EncDecCTCModelBPE.restore_from(save_path)
            assert isinstance(asr_model2, EncDecCTCModelBPE)

            # Check if vocabulary size is same
            assert asr_model.tokenizer.tokenizer.vocab_size == asr_model2.tokenizer.tokenizer.vocab_size

            # Make a copy of the tokenizer
            new_tokenizer_dir = os.path.join(save_dir, 'tokenizer')
            os.makedirs(new_tokenizer_dir, exist_ok=True)
            new_tokenizer_path = os.path.join(new_tokenizer_dir, 'vocab.txt')
            with open(new_tokenizer_path, 'w') as f:
                for v in asr_model2.tokenizer.tokenizer.get_vocab():
                    f.write(f"{v}\n")

                # Add some new tokens too
                f.write("^\n")
                f.write("^^\n")
                f.write("^^^\n")

            assert os.path.exists(new_tokenizer_path)

            # change vocabulary
            asr_model2.change_vocabulary(new_tokenizer_dir, new_tokenizer_type='wpe')
            assert asr_model.tokenizer.vocab_size != asr_model2.tokenizer.vocab_size

            new_save_path = os.path.join(save_dir, 'temp2.nemo')
            asr_model2.save_to(new_save_path)

            asr_model3 = EncDecCTCModelBPE.restore_from(new_save_path)
            assert isinstance(asr_model3, EncDecCTCModelBPE)

            # Check if vocabulary size is same
            assert asr_model2.tokenizer.tokenizer.vocab_size == asr_model3.tokenizer.tokenizer.vocab_size
            assert asr_model2.tokenizer_dir != asr_model3.tokenizer_dir

            # Model PT level checks
            assert len(asr_model2.artifacts) == 1
        finally:
            os.rename(old_tokenizer_dir + '.bkp', old_tokenizer_dir)