Exemple #1
0
    def testOpenNMTTokenizerAssets(self):
        """Check the assets exported by a tokenizer configured with a BPE model.

        A dummy BPE model is written to a temporary directory and a tokenizer
        is built from it. The test then verifies that ``export_assets``:

        * returns the generated configuration under a prefixed name and the
          pre-existing BPE model under its unprefixed name, both pointing to
          existing files;
        * writes a configuration that references the model by base name
          rather than by absolute path.
        """
        asset_dir = self.get_temp_dir()
        # Write a dummy BPE model.
        bpe_model_path = os.path.join(asset_dir, "model.bpe")
        with open(bpe_model_path, "wb") as bpe_model_file:
            bpe_model_file.write(b"#version: 0.2\ne s</w>\n")

        tokenizer = OpenNMTTokenizer(mode="conservative",
                                     bpe_model_path=bpe_model_path)

        # Generated assets are prefixed but not existing resources.
        assets = tokenizer.export_assets(asset_dir, asset_prefix="source_")
        config_asset = "source_tokenizer_config.yml"
        for asset_name in (config_asset, "model.bpe"):
            self.assertIn(asset_name, assets)
            self.assertTrue(os.path.exists(assets[asset_name]))

        # The tokenization configuration should not contain absolute paths to resources.
        with open(assets[config_asset], "rb") as config_file:
            # The expected configuration is a plain mapping of strings, so the
            # safe loader is sufficient; the unsafe loader could instantiate
            # arbitrary Python objects from the file content.
            asset_config = yaml.safe_load(config_file.read())
        self.assertDictEqual(asset_config, {
            "mode": "conservative",
            "bpe_model_path": "model.bpe"
        })
Exemple #2
0
 def testOpenNMTTokenizer(self):
     """Exercise a default OpenNMTTokenizer in both directions.

     Tokenizing "Hello world!" is expected to yield three tokens, and
     detokenizing a batch of token lists is expected to give back plain
     sentences, with the "■!" token attached to the preceding word.
     """
     # Tokenization with a default-constructed tokenizer.
     expected_tokens = ["Hello", "world", "!"]
     self._testTokenizer(OpenNMTTokenizer(), "Hello world!", expected_tokens)

     # Detokenization uses its own fresh tokenizer instance.
     token_batches = [["Hello", "world", "■!"], ["Test"], ["My", "name"]]
     expected_sentences = ["Hello world!", "Test", "My name"]
     self._testDetokenizer(OpenNMTTokenizer(), token_batches,
                           expected_sentences)
    def testOpenNMTTokenizerAssets(self):
        """Check the assets returned when initializing a SentencePiece tokenizer.

        A dummy SentencePiece model is written to a temporary directory and a
        tokenizer is built from parameters pointing to it. The test verifies
        that ``initialize``:

        * returns no assets when no asset directory is given;
        * with an asset directory and prefix, returns the generated
          configuration under a prefixed name and the pre-existing model under
          its unprefixed name, both pointing to existing files;
        * writes a configuration that references the model by base name
          rather than by absolute path.
        """
        asset_dir = self.get_temp_dir()
        # Write a dummy SentencePiece model.
        sp_model_path = os.path.join(asset_dir, "model.sp")
        with open(sp_model_path, "wb") as sp_model_file:
            sp_model_file.write(b"some model data\n")

        tokenizer = OpenNMTTokenizer(params={
            "mode": "none",
            "sp_model_path": sp_model_path
        })

        # By default, no assets are returned.
        assets = tokenizer.initialize({})
        self.assertDictEqual(assets, {})

        # Generated assets are prefixed but not existing resources.
        assets = tokenizer.initialize({},
                                      asset_dir=asset_dir,
                                      asset_prefix="source_")
        config_asset = "source_tokenizer_config.yml"
        for asset_name in (config_asset, "model.sp"):
            self.assertIn(asset_name, assets)
            self.assertTrue(os.path.exists(assets[asset_name]))

        # The tokenization configuration should not contain absolute paths to resources.
        with open(assets[config_asset], "rb") as config_file:
            # yaml.load() without an explicit Loader is deprecated since
            # PyYAML 5.1 and raises TypeError on PyYAML 6; the expected
            # configuration is a plain mapping, so safe_load is sufficient.
            asset_config = yaml.safe_load(config_file.read())
        self.assertDictEqual(asset_config, {
            "mode": "none",
            "sp_model_path": "model.sp"
        })
  def testOpenNMTTokenizer(self):
    """Exercise OpenNMTTokenizer with default and file-based configurations.

    Covers three cases: tokenization with a default tokenizer, tokenization
    with a tokenizer configured from a YAML file written by the test, and
    detokenization with a default tokenizer.
    """
    # Case 1: default tokenizer.
    self._testTokenizer(OpenNMTTokenizer(), "Hello world!", ["Hello", "world", "!"])

    # Case 2: tokenizer options read from a configuration file on disk.
    config_path = os.path.join(self.get_temp_dir(), "tok_config.yml")
    config_bytes = (b"mode: aggressive\n"
                    b"spacer_annotate: true\n"
                    b"spacer_new: true\n")
    with open(config_path, "wb") as config_stream:
      config_stream.write(config_bytes)
    configured_tokenizer = OpenNMTTokenizer(configuration_file_or_key=config_path)
    self._testTokenizer(
        configured_tokenizer, "Hello World-s", ["Hello", "▁", "World", "-", "s"])

    # Case 3: detokenization of a batch of token lists.
    token_batches = [["Hello", "world", "■!"], ["Test"], ["My", "name"]]
    expected_sentences = ["Hello world!", "Test", "My name"]
    self._testDetokenizer(OpenNMTTokenizer(), token_batches, expected_sentences)
Exemple #5
0
 def testOpenNMTTokenizerArguments(self):
     """Check a tokenizer configured directly through keyword arguments.

     The options are passed to the constructor as keywords, and
     "Hello World-s" is expected to be split into five tokens including a
     standalone "▁" token.
     """
     options = {
         "mode": "aggressive",
         "spacer_annotate": True,
         "spacer_new": True,
     }
     expected_tokens = ["Hello", "▁", "World", "-", "s"]
     self._testTokenizer(OpenNMTTokenizer(**options), "Hello World-s",
                         expected_tokens)
Exemple #6
0
    def testOpenNMTTokenizerFromConfiguration(self):
        """Check that every way of supplying the configuration tokenizes alike.

        The same options are provided (1) as a path to a YAML file, (2) as a
        metadata key resolved by ``initialize`` to that file path, (3) as a
        metadata key resolved by ``initialize`` to the options dictionary, and
        (4) directly through the ``params`` argument. Each resulting tokenizer
        is run through the same tokenization check.
        """
        options = {
            "mode": "aggressive",
            "spacer_annotate": True,
            "spacer_new": True
        }
        config_path = os.path.join(self.get_temp_dir(), "tok_config.yml")
        with open(config_path, "w") as config_stream:
            yaml.dump(options, config_stream)

        def _check(candidate):
            # Shared expectation for all configuration sources.
            self._testTokenizer(candidate, "Hello World-s",
                                ["Hello", "▁", "World", "-", "s"])

        # (1) Configuration given as a file path.
        _check(OpenNMTTokenizer(configuration_file_or_key=config_path))

        # (2) and (3) Configuration given as a key, resolved at initialization
        # first to the file path and then to the options dictionary.
        metadata_key = "source_tokenization"
        for resolved_value in (config_path, options):
            keyed_tokenizer = OpenNMTTokenizer(
                configuration_file_or_key=metadata_key)
            keyed_tokenizer.initialize({metadata_key: resolved_value})
            _check(keyed_tokenizer)

        # (4) Configuration given directly as parameters.
        _check(OpenNMTTokenizer(params=options))