def testOpenNMTTokenizerAssets(self):
  asset_dir = self.get_temp_dir()
  # Write a dummy SentencePiece model.
  sp_model_path = os.path.join(asset_dir, "model.sp")
  with open(sp_model_path, "wb") as sp_model_file:
    sp_model_file.write(b"some model data\n")

  tokenizer = OpenNMTTokenizer(params={
      "mode": "none",
      "sp_model_path": sp_model_path
  })

  # By default, no assets are returned.
  assets = tokenizer.initialize({})
  self.assertDictEqual(assets, {})

  # Generated assets are prefixed but not existing resources.
  assets = tokenizer.initialize({}, asset_dir=asset_dir, asset_prefix="source_")
  self.assertIn("source_tokenizer_config.yml", assets)
  self.assertTrue(os.path.exists(assets["source_tokenizer_config.yml"]))
  self.assertIn("model.sp", assets)
  self.assertTrue(os.path.exists(assets["model.sp"]))

  # The tokenization configuration should not contain absolute paths to resources.
  with open(assets["source_tokenizer_config.yml"], "rb") as config_file:
    asset_config = yaml.load(config_file.read(), Loader=yaml.UnsafeLoader)
  self.assertDictEqual(asset_config, {
      "mode": "none",
      "sp_model_path": "model.sp"
  })

def testOpenNMTTokenizerFromConfiguration(self):
  params = {
      "mode": "aggressive",
      "spacer_annotate": True,
      "spacer_new": True
  }
  tok_config = os.path.join(self.get_temp_dir(), "tok_config.yml")
  with open(tok_config, "w") as tok_config_file:
    yaml.dump(params, tok_config_file)

  def _test(tokenizer):
    self._testTokenizer(
        tokenizer, "Hello World-s", ["Hello", "▁", "World", "-", "s"])

  # From a configuration file path.
  tokenizer = OpenNMTTokenizer(configuration_file_or_key=tok_config)
  _test(tokenizer)

  # From a metadata key that resolves to a configuration file.
  tokenizer = OpenNMTTokenizer(configuration_file_or_key="source_tokenization")
  tokenizer.initialize({"source_tokenization": tok_config})
  _test(tokenizer)

  # From a metadata key that resolves to an inline parameter dictionary.
  tokenizer = OpenNMTTokenizer(configuration_file_or_key="source_tokenization")
  tokenizer.initialize({"source_tokenization": params})
  _test(tokenizer)

  # From parameters passed directly.
  tokenizer = OpenNMTTokenizer(params=params)
  _test(tokenizer)

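# Minimal sketch of the _testTokenizer helper referenced above. This is an
# assumption, not the project's actual helper (which may also exercise tensor
# and dataset inputs): it assumes tokenizer.tokenize() on a Python string
# returns the list of token strings.
def _testTokenizer(self, tokenizer, text, expected_tokens):
  tokens = tokenizer.tokenize(text)
  self.assertListEqual(list(tokens), expected_tokens)
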
def testOpenNMTTokenizerBPEAssets(self):
  asset_dir = self.get_temp_dir()
  # Write a dummy BPE model.
  bpe_model_path = os.path.join(asset_dir, "model.bpe")
  with open(bpe_model_path, "wb") as bpe_model_file:
    bpe_model_file.write(b"#version: 0.2\ne s</w>\n")

  tokenizer = OpenNMTTokenizer(
      params={"mode": "conservative", "bpe_model_path": bpe_model_path})

  # By default, no assets are returned.
  assets = tokenizer.initialize({})
  self.assertDictEqual(assets, {})

  # Generated assets are prefixed but not existing resources.
  assets = tokenizer.initialize({}, asset_dir=asset_dir, asset_prefix="source_")
  self.assertIn("source_tokenizer_config.yml", assets)
  self.assertTrue(os.path.exists(assets["source_tokenizer_config.yml"]))
  self.assertIn("model.bpe", assets)
  self.assertTrue(os.path.exists(assets["model.bpe"]))

  # The tokenization configuration should not contain absolute paths to resources.
  with open(assets["source_tokenizer_config.yml"], "rb") as config_file:
    asset_config = yaml.load(config_file.read(), Loader=yaml.UnsafeLoader)
  self.assertDictEqual(asset_config, {
      "mode": "conservative",
      "bpe_model_path": "model.bpe"
  })

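# Illustrative sketch, not part of the original tests: rebuilding a tokenizer
# from an exported asset directory. The helper name and the path rewriting are
# assumptions; only calls demonstrated above (yaml.load and the
# OpenNMTTokenizer(params=...) constructor) are used. Because the exported
# configuration stores the model file name relative to the asset directory,
# the path is joined back to an absolute path before building the tokenizer.
def _tokenizer_from_assets(asset_dir, config_name="source_tokenizer_config.yml"):
  with open(os.path.join(asset_dir, config_name), "rb") as config_file:
    params = yaml.load(config_file.read(), Loader=yaml.UnsafeLoader)
  for key in ("sp_model_path", "bpe_model_path"):
    if key in params:
      params[key] = os.path.join(asset_dir, params[key])
  return OpenNMTTokenizer(params=params)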