    def test_save_restore_artifact(self, asr_model):
        with tempfile.TemporaryDirectory() as tmpdir:
            save_path = os.path.join(tmpdir, 'ctc_bpe.nemo')
            asr_model.train()
            asr_model.save_to(save_path)

            new_model = EncDecCTCModelBPE.restore_from(save_path)
            assert isinstance(new_model, type(asr_model))
            assert new_model.vocab_path == 'vocab.txt'

            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128
    def test_save_restore_artifact(self, asr_model):
        asr_model.train()
        asr_model.save_to('./ctc_bpe.nemo')

        new_model = EncDecCTCModelBPE.restore_from('./ctc_bpe.nemo')
        assert isinstance(new_model, type(asr_model))
        assert new_model.vocab_path == 'vocab.txt'

        assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128

        if os.path.exists('./ctc_bpe.nemo'):
            os.remove('./ctc_bpe.nemo')
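
These snippets are pytest test methods, so they depend on fixtures (asr_model, test_data_dir) and module-level imports (copy, os, shutil, tempfile, DictConfig from omegaconf, tokenizers from nemo.collections.common, and EncDecCTCModelBPE from nemo.collections.asr.models) that are not shown. The sketch below illustrates one way the fixtures could be defined; the test-data location, the single-block encoder/decoder hyperparameters, and the an4_wpe_128 tokenizer directory are assumptions chosen to match the 128-token assertions above, not a definitive configuration.

import os

import pytest
from omegaconf import DictConfig

from nemo.collections.asr.models import EncDecCTCModelBPE


@pytest.fixture()
def test_data_dir():
    # Hypothetical test-data root; the real suite resolves this path in its conftest.py.
    return os.path.join(os.path.dirname(__file__), ".data")


@pytest.fixture()
def asr_model(test_data_dir):
    # Tiny single-block encoder plus CTC decoder, just large enough to exercise save/restore.
    preprocessor = {'_target_': 'nemo.collections.asr.modules.AudioToMelSpectrogramPreprocessor'}
    encoder = {
        '_target_': 'nemo.collections.asr.modules.ConvASREncoder',
        'feat_in': 64,
        'activation': 'relu',
        'conv_mask': True,
        'jasper': [
            {
                'filters': 1024,
                'repeat': 1,
                'kernel': [1],
                'stride': [1],
                'dilation': [1],
                'dropout': 0.0,
                'residual': False,
            }
        ],
    }
    decoder = {
        '_target_': 'nemo.collections.asr.modules.ConvASRDecoder',
        'feat_in': 1024,
        'num_classes': -1,  # inferred from the tokenizer vocabulary
        'vocabulary': None,
    }
    # 128-token WordPiece tokenizer, matching the vocab-size assertions in the tests.
    tokenizer = {'dir': os.path.join(test_data_dir, "asr", "tokenizers", "an4_wpe_128"), 'type': 'wpe'}
    cfg = DictConfig(
        {
            'preprocessor': DictConfig(preprocessor),
            'encoder': DictConfig(encoder),
            'decoder': DictConfig(decoder),
            'tokenizer': DictConfig(tokenizer),
        }
    )
    return EncDecCTCModelBPE(cfg=cfg)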

    def test_save_restore_artifact_spe(self, asr_model, test_data_dir):
        with tempfile.TemporaryDirectory() as tmpdir:
            tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_spe_128")
            asr_model.change_vocabulary(new_tokenizer_dir=tokenizer_dir, new_tokenizer_type='bpe')

            save_path = os.path.join(tmpdir, 'ctc_bpe.nemo')
            asr_model.train()
            asr_model.save_to(save_path)

            new_model = EncDecCTCModelBPE.restore_from(save_path)
            assert isinstance(new_model, type(asr_model))
            assert isinstance(new_model.tokenizer, tokenizers.SentencePieceTokenizer)
            assert new_model.model_path.endswith('_tokenizer.model')
            assert new_model.vocab_path.endswith('_vocab.txt')
            assert new_model.spe_vocab_path.endswith('_tokenizer.vocab')

            assert new_model.tokenizer.tokenizer.vocab_size == 128
            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 128
    def test_save_restore_artifact_agg(self, asr_model, test_data_dir):
        tokenizer_dir = os.path.join(test_data_dir, "asr", "tokenizers", "an4_spe_128")
        tok_en = {"dir": tokenizer_dir, "type": "wpe"}
        # the tokenizer below is really an English tokenizer, but we pretend it is Spanish
        tok_es = {"dir": tokenizer_dir, "type": "wpe"}
        tcfg = DictConfig({"type": "agg", "langs": {"en": tok_en, "es": tok_es}})
        with tempfile.TemporaryDirectory() as tmpdir:
            asr_model.change_vocabulary(new_tokenizer_dir=tcfg, new_tokenizer_type="agg")

            save_path = os.path.join(tmpdir, "ctc_agg.nemo")
            asr_model.train()
            asr_model.save_to(save_path)

            new_model = EncDecCTCModelBPE.restore_from(save_path)
            assert isinstance(new_model, type(asr_model))
            assert isinstance(new_model.tokenizer, tokenizers.AggregateTokenizer)

            # combining two 128-token tokenizers should roughly double the vocabulary
            assert new_model.tokenizer.tokenizer.vocab_size == 254
            assert len(new_model.tokenizer.tokenizer.get_vocab()) == 254

    def test_vocab_change(self, test_data_dir, asr_model):
        old_vocab = copy.deepcopy(asr_model.decoder.vocabulary)

        with tempfile.TemporaryDirectory() as save_dir:
            save_path = os.path.join(save_dir, 'temp.nemo')

            with tempfile.TemporaryDirectory() as tmpdir:
                old_tmpdir_path = tmpdir

                old_tokenizer_dir = os.path.join(test_data_dir, "asr",
                                                 "tokenizers", "an4_wpe_128",
                                                 'vocab.txt')
                new_tokenizer_dir = os.path.join(tmpdir, 'tokenizer')

                os.makedirs(new_tokenizer_dir, exist_ok=True)
                shutil.copy2(old_tokenizer_dir, new_tokenizer_dir)

                nw1 = asr_model.num_weights
                asr_model.change_vocabulary(
                    new_tokenizer_dir=new_tokenizer_dir,
                    new_tokenizer_type='wpe')
                # No change
                assert nw1 == asr_model.num_weights

                with open(os.path.join(new_tokenizer_dir, 'vocab.txt'),
                          'a+') as f:
                    f.write("!\n")
                    f.write('$\n')
                    f.write('@\n')

                asr_model.change_vocabulary(
                    new_tokenizer_dir=new_tokenizer_dir,
                    new_tokenizer_type='wpe')
                # 3 new tokens add 3 decoder outputs: feat_in weights + 1 bias each
                assert asr_model.num_weights == nw1 + 3 * (
                    asr_model.decoder._feat_in + 1)

                new_vocab = copy.deepcopy(asr_model.decoder.vocabulary)
                assert len(old_vocab) != len(new_vocab)

                # save the model (after change of vocabulary)
                asr_model.save_to(save_path)
                assert os.path.exists(save_path)
                # exiting this block deletes the nested tmpdir, and with it the copied vocabulary

            # assert copied vocab no longer exists
            assert not os.path.exists(
                os.path.join(old_tmpdir_path, 'tokenizer', 'vocab.txt'))

            # temporarily rename the original vocab file so that restore must rely on the artifact stored inside the .nemo
            try:
                os.rename(old_tokenizer_dir, old_tokenizer_dir + '.bkp')
                assert not os.path.exists(old_tokenizer_dir)

                # restore model from .nemo
                asr_model2 = EncDecCTCModelBPE.restore_from(save_path)
                assert isinstance(asr_model2, EncDecCTCModelBPE)

                # Check that the vocabulary size is preserved across restore
                assert asr_model.tokenizer.tokenizer.vocab_size == asr_model2.tokenizer.tokenizer.vocab_size

                # Make a copy of the tokenizer
                new_tokenizer_dir = os.path.join(save_dir, 'tokenizer')

                os.makedirs(new_tokenizer_dir, exist_ok=True)
                new_tokenizer_path = os.path.join(new_tokenizer_dir,
                                                  'vocab.txt')
                with open(new_tokenizer_path, 'w') as f:
                    for v in asr_model2.tokenizer.tokenizer.get_vocab():
                        f.write(f"{v}\n")

                    # Add some new tokens too
                    f.write("^\n")
                    f.write("^^\n")
                    f.write("^^^\n")

                assert os.path.exists(new_tokenizer_path)

                # change vocabulary
                asr_model2.change_vocabulary(new_tokenizer_dir,
                                             new_tokenizer_type='wpe')
                assert asr_model.tokenizer.vocab_size != asr_model2.tokenizer.vocab_size

                new_save_path = os.path.join(save_dir, 'temp2.nemo')
                asr_model2.save_to(new_save_path)

                asr_model3 = EncDecCTCModelBPE.restore_from(new_save_path)
                assert isinstance(asr_model3, EncDecCTCModelBPE)

                # Check that the vocabulary size matches after the second round trip
                assert asr_model2.tokenizer.tokenizer.vocab_size == asr_model3.tokenizer.tokenizer.vocab_size
                assert asr_model2.tokenizer_dir != asr_model3.tokenizer_dir

                # Model PT level checks
                assert len(asr_model2.artifacts) == 1

            finally:
                os.rename(old_tokenizer_dir + '.bkp', old_tokenizer_dir)
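
Outside of pytest, the same save/restore round trip can be checked by hand. A minimal sketch, assuming network access; stt_en_conformer_ctc_small is used here only as an example of a public EncDecCTCModelBPE checkpoint.

import os
import tempfile

from nemo.collections.asr.models import EncDecCTCModelBPE

# Download a small pretrained CTC-BPE checkpoint (example model name; requires internet access).
model = EncDecCTCModelBPE.from_pretrained(model_name="stt_en_conformer_ctc_small")

with tempfile.TemporaryDirectory() as tmpdir:
    path = os.path.join(tmpdir, "model.nemo")
    model.save_to(path)

    # Restoring should return the same class with its tokenizer artifact intact.
    restored = EncDecCTCModelBPE.restore_from(path)
    print(type(restored).__name__, restored.tokenizer.tokenizer.vocab_size)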