def test_tokenizer_equivalence_en_de(self):
    """Check en-de tokenization against known-good ids, then save/reload the tokenizer.

    Verifies that the pretrained opus-mt-en-de tokenizer produces the reference
    token ids, that saving writes the SentencePiece model file, and that the
    saved directory can be loaded back without error.
    """
    en_de_tokenizer = MarianTokenizer.from_pretrained(f"{ORG_NAME}opus-mt-en-de")
    batch = en_de_tokenizer.prepare_seq2seq_batch(["I am a small frog"], return_tensors=None)
    self.assertIsInstance(batch, BatchEncoding)
    expected = [38, 121, 14, 697, 38848, 0]
    self.assertListEqual(expected, batch.input_ids[0])

    # TemporaryDirectory cleans itself up; the original mkdtemp() leaked the
    # directory after every test run.
    with tempfile.TemporaryDirectory() as save_dir:
        en_de_tokenizer.save_pretrained(save_dir)
        contents = [x.name for x in Path(save_dir).glob("*")]
        self.assertIn("source.spm", contents)
        # Round-trip: loading from the saved directory must not raise.
        MarianTokenizer.from_pretrained(save_dir)
def test_tokenizer_equivalence_en_de(self):
    """The pretrained en-de tokenizer must produce the reference token ids."""
    tokenizer = MarianTokenizer.from_pretrained(f"{ORG_NAME}opus-mt-en-de")
    batch = tokenizer.prepare_translation_batch(["I am a small frog"], return_tensors=None)
    self.assertIsInstance(batch, BatchEncoding)
    self.assertListEqual([38, 121, 14, 697, 38848, 0], batch.input_ids[0])
def setUp(self):
    """Build a minimal on-disk tokenizer fixture, then round-trip it.

    Writes a tiny vocab and tokenizer config into the test temp dir, copies a
    sample SentencePiece model for source/target if not already present, and
    re-saves a loaded tokenizer so every expected file exists.
    """
    super().setUp()
    vocab = ["</s>", "<unk>", "\u2581This", "\u2581is", "\u2581a", "\u2581t", "est", "\u0120", "<pad>"]
    vocab_tokens = {token: idx for idx, token in enumerate(vocab)}
    save_dir = Path(self.tmpdirname)
    save_json(vocab_tokens, save_dir / vocab_files_names["vocab"])
    save_json(mock_tokenizer_config, save_dir / vocab_files_names["tokenizer_config_file"])
    if not (save_dir / vocab_files_names["source_spm"]).exists():
        copyfile(SAMPLE_SP, save_dir / vocab_files_names["source_spm"])
        copyfile(SAMPLE_SP, save_dir / vocab_files_names["target_spm"])
    tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname)
    tokenizer.save_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs) -> MarianTokenizer:
    """Load a MarianTokenizer from the test fixture directory."""
    tokenizer = MarianTokenizer.from_pretrained(self.tmpdirname, **kwargs)
    return tokenizer
def get_tokenizer(self, max_len=None, **kwargs) -> MarianTokenizer:
    """Load the fixture tokenizer, overriding the default max length.

    The helper keeps its historical `max_len` parameter name for callers, but
    forwards it as `model_max_length`: `max_len` is a deprecated tokenizer
    kwarg in transformers, superseded by `model_max_length`.
    """
    return MarianTokenizer.from_pretrained(self.tmpdirname, model_max_length=max_len, **kwargs)
def get_tokenizer(self, max_len=None, **kwargs) -> MarianTokenizer:
    """Instantiate the fixture tokenizer with `max_len` as its model_max_length."""
    fixture_dir = self.tmpdirname
    return MarianTokenizer.from_pretrained(fixture_dir, model_max_length=max_len, **kwargs)