Esempio n. 1
0
    def testOpenNMTTokenizerAssets(self):
        """Checks which assets the tokenizer returns from ``initialize``.

        A dummy SentencePiece model is written to a temporary directory and
        referenced by the tokenizer parameters. The test then verifies that:

        * no asset is returned when no asset directory is given;
        * with an asset directory and prefix, the generated configuration file
          is returned under a prefixed name while the model file keeps its name;
        * the exported configuration references the model by file name only.
        """
        asset_dir = self.get_temp_dir()
        # Write a dummy SentencePiece model.
        sp_model_path = os.path.join(asset_dir, "model.sp")
        with open(sp_model_path, "wb") as sp_model_file:
            sp_model_file.write(b"some model data\n")

        tokenizer = OpenNMTTokenizer(params={
            "mode": "none",
            "sp_model_path": sp_model_path
        })

        # By default, no assets are returned.
        assets = tokenizer.initialize({})
        self.assertDictEqual(assets, {})

        # Generated assets are prefixed but not existing resources.
        assets = tokenizer.initialize({},
                                      asset_dir=asset_dir,
                                      asset_prefix="source_")
        self.assertIn("source_tokenizer_config.yml", assets)
        self.assertTrue(os.path.exists(assets["source_tokenizer_config.yml"]))
        self.assertIn("model.sp", assets)
        self.assertTrue(os.path.exists(assets["model.sp"]))

        # The tokenization configuration should not contain absolute paths to resources.
        with open(assets["source_tokenizer_config.yml"], "rb") as config_file:
            # yaml.load() without an explicit Loader is deprecated since
            # PyYAML 5.1 and rejected by PyYAML 6. The expected content is a
            # plain mapping of strings, so the safe loader is sufficient.
            asset_config = yaml.safe_load(config_file.read())
        self.assertDictEqual(asset_config, {
            "mode": "none",
            "sp_model_path": "model.sp"
        })
Esempio n. 2
0
    def testOpenNMTTokenizerFromConfiguration(self):
        """Checks that every way of configuring the tokenizer is equivalent.

        The same options are supplied through a YAML file path, through a data
        configuration key resolving to that file, through a data configuration
        key resolving to the options dictionary, and directly as ``params``.
        Each resulting tokenizer must tokenize the sample text identically.
        """
        options = dict(mode="aggressive", spacer_annotate=True, spacer_new=True)

        # Serialize the options so they can also be loaded from disk.
        config_path = os.path.join(self.get_temp_dir(), "tok_config.yml")
        with open(config_path, "w") as stream:
            yaml.dump(options, stream)

        def _check(candidate):
            expected_tokens = ["Hello", "▁", "World", "-", "s"]
            self._testTokenizer(candidate, "Hello World-s", expected_tokens)

        # 1. Path to a configuration file given at construction time.
        _check(OpenNMTTokenizer(configuration_file_or_key=config_path))

        # 2. and 3. A key resolved during initialization, first to the
        # configuration file and then to the in-memory options.
        config_key = "source_tokenization"
        for resolved in (config_path, options):
            keyed_tokenizer = OpenNMTTokenizer(configuration_file_or_key=config_key)
            keyed_tokenizer.initialize({config_key: resolved})
            _check(keyed_tokenizer)

        # 4. Options passed directly.
        _check(OpenNMTTokenizer(params=options))
Esempio n. 3
0
  def testOpenNMTTokenizerAssets(self):
    """Checks which assets the tokenizer returns from ``initialize``.

    A dummy BPE model is written to a temporary directory and referenced by the
    tokenizer parameters. Initialization must return no asset unless an asset
    directory is given; when it is, the generated configuration file is returned
    under a prefixed name, the model file keeps its name, and the exported
    configuration references the model by file name only.
    """
    export_dir = self.get_temp_dir()
    model_name = "model.bpe"
    config_name = "source_tokenizer_config.yml"

    # Create a dummy BPE model on disk.
    model_path = os.path.join(export_dir, model_name)
    with open(model_path, "wb") as stream:
      stream.write(b"#version: 0.2\ne s</w>\n")

    tokenizer_params = {"mode": "conservative", "bpe_model_path": model_path}
    tokenizer = OpenNMTTokenizer(params=tokenizer_params)

    # Without an asset directory, initialization returns no asset.
    self.assertDictEqual(tokenizer.initialize({}), {})

    # The generated configuration is prefixed; the existing model file is not.
    exported = tokenizer.initialize(
        {}, asset_dir=export_dir, asset_prefix="source_")
    for asset_name in (config_name, model_name):
      self.assertIn(asset_name, exported)
      self.assertTrue(os.path.exists(exported[asset_name]))

    # The exported configuration should not contain absolute paths to resources.
    with open(exported[config_name], "rb") as stream:
      raw_config = stream.read()
    loaded_config = yaml.load(raw_config, Loader=yaml.UnsafeLoader)
    expected_config = {"mode": "conservative", "bpe_model_path": "model.bpe"}
    self.assertDictEqual(loaded_config, expected_config)