Example #1
    def test_tokenizer_equivalence_en_de(self):
        en_de_tokenizer = MarianTokenizer.from_pretrained(f"{ORG_NAME}opus-mt-en-de")
        batch = en_de_tokenizer.prepare_seq2seq_batch(["I am a small frog"], return_tensors=None)
        self.assertIsInstance(batch, BatchEncoding)
        expected = [38, 121, 14, 697, 38848, 0]  # trailing 0 is the </s> id
        self.assertListEqual(expected, batch.input_ids[0])

        # Round-trip: save to a temp dir, check the SentencePiece file was written,
        # then make sure the tokenizer can be reloaded from disk.
        save_dir = tempfile.mkdtemp()
        en_de_tokenizer.save_pretrained(save_dir)
        contents = [x.name for x in Path(save_dir).glob("*")]
        self.assertIn("source.spm", contents)
        MarianTokenizer.from_pretrained(save_dir)
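prepare_seq2seq_batch was later deprecated in favour of calling the tokenizer directly. A minimal stand-alone sketch of the same encoding step outside a test class might look like this (assumes ORG_NAME resolves to "Helsinki-NLP/" and that the model can be fetched from the Hugging Face Hub):

from transformers import MarianTokenizer

# Hedged sketch: load the public en-de Marian tokenizer from the Hub.
tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
# Calling the tokenizer directly is the current replacement for
# prepare_seq2seq_batch; return_tensors=None yields plain Python lists.
batch = tokenizer(["I am a small frog"], return_tensors=None)
print(batch.input_ids[0])  # e.g. [38, 121, 14, 697, 38848, 0]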
Example #2
    def test_tokenizer_equivalence_en_de(self):
        en_de_tokenizer = MarianTokenizer.from_pretrained(f"{ORG_NAME}opus-mt-en-de")
        # Same check as Example #1, but via the older prepare_translation_batch API.
        batch = en_de_tokenizer.prepare_translation_batch(["I am a small frog"], return_tensors=None)
        self.assertIsInstance(batch, BatchEncoding)
        expected = [38, 121, 14, 697, 38848, 0]
        self.assertListEqual(expected, batch.input_ids[0])
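The only difference from Example #1 is the method name: prepare_translation_batch is the older, Marian-specific spelling that later transformers releases renamed to the generic prepare_seq2seq_batch. Code that has to run against both generations can probe for the newer name first; a small hedged shim, where tok is assumed to be any loaded MarianTokenizer:

def make_batch(tok, texts):
    # Prefer the newer generic method; fall back to the old Marian-only name.
    if hasattr(tok, "prepare_seq2seq_batch"):
        return tok.prepare_seq2seq_batch(texts, return_tensors=None)
    return tok.prepare_translation_batch(texts, return_tensors=None)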
Example #3
    def setUp(self):
        super().setUp()
        # Build a tiny dummy vocab and write out the files MarianTokenizer expects,
        # reusing the same SentencePiece fixture for source and target sides.
        vocab = ["</s>", "<unk>", "▁This", "▁is", "▁a", "▁t", "est", "\u0120", "<pad>"]
        vocab_tokens = dict(zip(vocab, range(len(vocab))))
        save_dir = Path(self.tmpdirname)
        save_json(vocab_tokens, save_dir / vocab_files_names["vocab"])
        save_json(mock_tokenizer_config, save_dir / vocab_files_names["tokenizer_config_file"])
        if not (save_dir / vocab_files_names["source_spm"]).exists():
            copyfile(SAMPLE_SP, save_dir / vocab_files_names["source_spm"])
            copyfile(SAMPLE_SP, save_dir / vocab_files_names["target_spm"])

        tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
        tokenizer.save_pretrained(self.tmpdirname)
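This fixture works because MarianTokenizer loads a fixed set of files from a directory. A hedged sketch of the pieces the setUp relies on, with a stand-in for the save_json helper (the real helper lives in the Marian tokenization module; the file-name mapping below is an assumption inferred from the keys used above):

import json

def save_json(data, path):
    # Assumed behaviour of the helper used in setUp: a plain JSON dump.
    with open(path, "w") as f:
        json.dump(data, f, indent=2)

# Assumed file-name mapping behind vocab_files_names:
vocab_files_names = {
    "vocab": "vocab.json",                            # token -> id map
    "tokenizer_config_file": "tokenizer_config.json",
    "source_spm": "source.spm",                       # SentencePiece model, source language
    "target_spm": "target.spm",                       # SentencePiece model, target language
}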
    # Variants of the same helper from different revisions of this test:
    def get_tokenizer(self, **kwargs) -> MarianTokenizer:
        return MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_tokenizer(self, max_len=None, **kwargs) -> MarianTokenizer:
        # overwrite max_len=512 default
        return MarianTokenizer.from_pretrained(self.tmpdirname, max_len=max_len, **kwargs)

    def get_tokenizer(self, max_len=None, **kwargs) -> MarianTokenizer:
        return MarianTokenizer.from_pretrained(self.tmpdirname, model_max_length=max_len, **kwargs)
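The max_len / model_max_length knob matters once truncation is requested: when no explicit max_length is passed, the tokenizer caps sequences at model_max_length. A hedged usage sketch, where save_dir is assumed to hold a tokenizer saved as in Example #1:

tok = MarianTokenizer.from_pretrained(save_dir, model_max_length=10)
ids = tok("I am a small frog. " * 20, truncation=True).input_ids
assert len(ids) <= 10  # truncated to the configured model_max_length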