# Imports this excerpt relies on.
import copy
import os
import shutil
import tempfile

from omegaconf import DictConfig

from nemo.collections.asr.models import EncDecCTCModelBPE
from nemo.collections.common import tokenizers


def test_save_restore_artifact(self, asr_model):
    with tempfile.TemporaryDirectory() as tmpdir:
        save_path = os.path.join(tmpdir, 'ctc_bpe.nemo')
        asr_model.train()
        asr_model.save_to(save_path)

        new_model = EncDecCTCModelBPE.restore_from(save_path)
        assert isinstance(new_model, type(asr_model))
        assert new_model.vocab_path == 'vocab.txt'

        assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128

def test_save_restore_artifact_spe(self, asr_model, test_data_dir):
    with tempfile.TemporaryDirectory() as tmpdir:
        tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_spe_128")
        asr_model.change_vocabulary(new_tokenizer_dir=tokenizer_dir, new_tokenizer_type='bpe')

        save_path = os.path.join(tmpdir, 'ctc_bpe.nemo')
        asr_model.train()
        asr_model.save_to(save_path)

        new_model = EncDecCTCModelBPE.restore_from(save_path)
        assert isinstance(new_model, type(asr_model))
        assert isinstance(new_model.tokenizer, tokenizers.SentencePieceTokenizer)
        assert new_model.model_path.endswith('_tokenizer.model')
        assert new_model.vocab_path.endswith('_vocab.txt')
        assert new_model.spe_vocab_path.endswith('_tokenizer.vocab')

        assert new_model.tokenizer.tokenizer.vocab_size == 128
        assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128

def test_save_restore_artifact_agg(self, asr_model, test_data_dir):
    tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_spe_128")
    tok_en = {"dir": tokenizer_dir, "type": "wpe"}
    # the below is really an English tokenizer, but we pretend it is Spanish
    tok_es = {"dir": tokenizer_dir, "type": "wpe"}
    tcfg = DictConfig({"type": "agg", "langs": {"en": tok_en, "es": tok_es}})
    with tempfile.TemporaryDirectory() as tmpdir:
        asr_model.change_vocabulary(new_tokenizer_dir=tcfg, new_tokenizer_type="agg")

        save_path = os.path.join(tmpdir, "ctc_agg.nemo")
        asr_model.train()
        asr_model.save_to(save_path)

        new_model = EncDecCTCModelBPE.restore_from(save_path)
        assert isinstance(new_model, type(asr_model))
        assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)

        # should be double
        assert new_model.tokenizer.tokenizer.vocab_size == 254
        assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254

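
# Illustrative helper (not part of the test suite): the {"dir": ..., "type": ...}
# per-language entries above are assumed to generalize to any number of
# languages. This sketch builds the same aggregate-tokenizer DictConfig for a
# list of language codes that all reuse one tokenizer directory.
def make_agg_tokenizer_config(tokenizer_dir, lang_codes, tok_type="wpe"):
    langs = {code: {"dir": tokenizer_dir, "type": tok_type} for code in lang_codes}
    return DictConfig({"type": "agg", "langs": langs})

# e.g. tcfg = make_agg_tokenizer_config(tokenizer_dir, ["en", "es"])
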
def test_vocab_change(self, test_data_dir, asr_model):
    old_vocab = copy.deepcopy(asr_model.decoder.vocabulary)

    with tempfile.TemporaryDirectory() as save_dir:
        save_path = os.path.join(save_dir, 'temp.nemo')

        with tempfile.TemporaryDirectory() as tmpdir:
            old_tmpdir_path = tmpdir

            old_tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128", 'vocab.txt')
            new_tokenizer_dir = os.path.join(tmpdir, 'tokenizer')

            os.makedirs(new_tokenizer_dir, exist_ok=True)
            shutil.copy2(old_tokenizer_dir, new_tokenizer_dir)

            nw1 = asr_model.num_weights
            asr_model.change_vocabulary(new_tokenizer_dir=new_tokenizer_dir, new_tokenizer_type='wpe')
            # No change
            assert nw1 == asr_model.num_weights

            with open(os.path.join(new_tokenizer_dir, 'vocab.txt'), 'a+') as f:
                f.write("!\n")
                f.write('$\n')
                f.write('@\n')

            asr_model.change_vocabulary(new_tokenizer_dir=new_tokenizer_dir, new_tokenizer_type='wpe')
            # fully connected + bias: each new token adds one decoder output unit
            # (_feat_in weights + 1 bias), so 3 extra tokens add 3 * (_feat_in + 1) weights
            assert asr_model.num_weights == nw1 + 3 * (asr_model.decoder._feat_in + 1)

            new_vocab = copy.deepcopy(asr_model.decoder.vocabulary)
            assert len(old_vocab) != len(new_vocab)

            # save the model (after change of vocabulary)
            asr_model.save_to(save_path)
            assert os.path.exists(save_path)
            # exiting this block deletes the copied vocabulary in the nested tmpdir (by scope)

        # assert copied vocab no longer exists
        assert not os.path.exists(os.path.join(old_tmpdir_path, 'tokenizer', 'vocab.txt'))

        # back up the original tokenizer by renaming it, so restore_from()
        # must rely on the artifact packaged inside the .nemo file
        try:
            os.rename(old_tokenizer_dir, old_tokenizer_dir + '.bkp')
            assert not os.path.exists(old_tokenizer_dir)

            # restore model from .nemo
            asr_model2 = EncDecCTCModelBPE.restore_from(save_path)
            assert isinstance(asr_model2, EncDecCTCModelBPE)

            # Check if vocabulary size is same
            assert asr_model.tokenizer.tokenizer.vocab_size == asr_model2.tokenizer.tokenizer.vocab_size

            # Write out a copy of the restored tokenizer's vocabulary
            new_tokenizer_dir = os.path.join(save_dir, 'tokenizer')
            os.makedirs(new_tokenizer_dir, exist_ok=True)
            new_tokenizer_path = os.path.join(new_tokenizer_dir, 'vocab.txt')
            with open(new_tokenizer_path, 'w') as f:
                for v in asr_model2.tokenizer.tokenizer.get_vocab():
                    f.write(f"{v}\n")

                # Add some new tokens too
                f.write("^\n")
                f.write("^^\n")
                f.write("^^^\n")

            assert os.path.exists(new_tokenizer_path)

            # change vocabulary
            asr_model2.change_vocabulary(new_tokenizer_dir, new_tokenizer_type='wpe')
            assert asr_model.tokenizer.vocab_size != asr_model2.tokenizer.vocab_size

            new_save_path = os.path.join(save_dir, 'temp2.nemo')
            asr_model2.save_to(new_save_path)

            asr_model3 = EncDecCTCModelBPE.restore_from(new_save_path)
            assert isinstance(asr_model3, EncDecCTCModelBPE)

            # Check if vocabulary size is same
            assert asr_model2.tokenizer.tokenizer.vocab_size == asr_model3.tokenizer.tokenizer.vocab_size
            assert asr_model2.tokenizer_dir != asr_model3.tokenizer_dir

            # Model PT level checks
            assert len(asr_model2.artifacts) == 1
        finally:
            # restore the original tokenizer file regardless of the test outcome
            os.rename(old_tokenizer_dir + '.bkp', old_tokenizer_dir)
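

# A minimal usage sketch (assumption: `model` is any EncDecCTCModelBPE-like
# instance) of the save/restore round trip the tests above exercise: save_to()
# packages the weights together with registered artifacts such as the tokenizer
# files, so restore_from() works even if the original tokenizer directory is gone.
def roundtrip(model):
    with tempfile.TemporaryDirectory() as tmpdir:
        path = os.path.join(tmpdir, 'model.nemo')
        model.save_to(path)
        restored = EncDecCTCModelBPE.restore_from(path)
        assert isinstance(restored, type(model))
        return restored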