def testOpenNMTTokenizer(self):
    self._testTokenizer(
        tokenizers.OpenNMTTokenizer(), ["Hello world!", "How are you?"],
        [["Hello", "world", "!"], ["How", "are", "you", "?"]])
    self._testDetokenizer(
        tokenizers.OpenNMTTokenizer(),
        [["Hello", "world", "￭!"], ["Test"], ["My", "name"]],
        ["Hello world!", "Test", "My name"])
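
The `_testTokenizer` and `_testDetokenizer` helpers belong to the surrounding test class and are not shown in this excerpt. Below is a minimal sketch of what such a tokenization check might look like, assuming these snippets come from OpenNMT-tf (where the module is importable as `opennmt.tokenizers`) and using only the `tokenize()` call demonstrated in the other examples; the helper name is illustrative.

import tensorflow as tf
from opennmt import tokenizers

def check_tokenizer(tokenizer, texts, expected_tokens):
    # Tokenize each input string and compare the decoded tokens
    # to the expected token lists.
    for text, expected in zip(texts, expected_tokens):
        tokens = tokenizer.tokenize(tf.constant(text))
        assert [t.decode("utf-8") for t in tokens.numpy()] == expected

check_tokenizer(
    tokenizers.OpenNMTTokenizer(),
    ["Hello world!", "How are you?"],
    [["Hello", "world", "!"], ["How", "are", "you", "?"]])
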
def testOpenNMTTokenizerArguments(self):
    with self.assertRaises(ValueError):
        tokenizers.OpenNMTTokenizer(case_feature=True)
    tokenizer = tokenizers.OpenNMTTokenizer(mode="aggressive",
                                            spacer_annotate=True,
                                            spacer_new=True,
                                            case_feature=False)
    self._testTokenizer(tokenizer, ["Hello World-s"],
                        [["Hello", "▁", "World", "-", "s"]])
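
A hedged round-trip sketch of the same configuration (reusing the imports from the first sketch above): with spacer annotation, the standalone "▁" token marks a word boundary, so detokenizing the expected tokens should restore the original spacing.

tokenizer = tokenizers.OpenNMTTokenizer(mode="aggressive",
                                        spacer_annotate=True,
                                        spacer_new=True)
tokens = tokenizer.tokenize(tf.constant("Hello World-s"))
text = tokenizer.detokenize(tokens)
print(text.numpy().decode("utf-8"))  # expected to print "Hello World-s"
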
def testOpenNMTTokenizerAssets(self):
    asset_dir = self.get_temp_dir()
    # Write a dummy BPE model.
    bpe_model_path = os.path.join(asset_dir, "model.bpe")
    with open(bpe_model_path, "w") as bpe_model_file:
        bpe_model_file.write("#version: 0.2\ne s</w>\n")

    tokenizer = tokenizers.OpenNMTTokenizer(mode="conservative",
                                            bpe_model_path=bpe_model_path)

    # Generated assets are prefixed, but existing resources are not.
    assets = tokenizer.export_assets(asset_dir, asset_prefix="source_")
    self.assertIn("source_tokenizer_config.yml", assets)
    self.assertTrue(os.path.exists(assets["source_tokenizer_config.yml"]))
    self.assertIn("model.bpe", assets)
    self.assertTrue(os.path.exists(assets["model.bpe"]))

    # The tokenization configuration should not contain absolute paths to resources.
    with open(assets["source_tokenizer_config.yml"], "rb") as config_file:
        asset_config = yaml.load(config_file.read(), Loader=yaml.UnsafeLoader)
    self.assertDictEqual(asset_config, {
        "mode": "conservative",
        "bpe_model_path": "model.bpe"
    })
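
The exported configuration can be used to rebuild an equivalent tokenizer. A hedged sketch continuing from the variables in the test above; restoring an absolute model path is an assumption about how the relative "model.bpe" entry would be resolved outside of asset_dir.

# Rebuild a tokenizer from the exported configuration dict.
asset_config["bpe_model_path"] = os.path.join(asset_dir, asset_config["bpe_model_path"])
restored_tokenizer = tokenizers.OpenNMTTokenizer(**asset_config)
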
def testOpenNMTTokenizerEmptyTensor(self):
    tokenizer = tokenizers.OpenNMTTokenizer()
    tokens = tokenizer.tokenize(tf.constant(""))
    self.assertIs(tokens.dtype, tf.string)
    self.assertListEqual(tokens.shape.as_list(), [0])
    text = tokenizer.detokenize(tokens)
    self.assertIs(text.dtype, tf.string)
    self.assertListEqual(text.shape.as_list(), [])

def testOpenNMTTokenizerInferenceMode(self):
    # sp_model is the path to a SentencePiece model file, defined outside this excerpt.
    tokenizer = tokenizers.OpenNMTTokenizer(
        mode="none",
        sp_model_path=sp_model,
        sp_nbest_size=64,
        sp_alpha=0.1,
    )
    self._testTokenizer(tokenizer, ["appealing"], [["▁appealing"]], training=False)
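
A hedged sketch of what the inference-mode distinction above means in practice, assuming `tokenize()` accepts the same `training` flag that the helper call forwards, and given a SentencePiece model path `sp_model`: subword sampling (sp_nbest_size / sp_alpha) only applies at training time.

tokenizer = tokenizers.OpenNMTTokenizer(
    mode="none", sp_model_path=sp_model, sp_nbest_size=64, sp_alpha=0.1)
# May vary from call to call because of subword sampling.
train_tokens = tokenizer.tokenize(tf.constant("appealing"), training=True)
# Deterministic: sampling is disabled outside of training.
infer_tokens = tokenizer.tokenize(tf.constant("appealing"), training=False)
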
def testOpenNMTTokenizerInFunction(self):
    tokenizer = tokenizers.OpenNMTTokenizer()

    @tf.function
    def _tokenize(text):
        return tokenizer.tokenize(text)

    tokens = _tokenize(tf.constant("Hello world!"))
    self.assertAllEqual(self.evaluate(tokens), [b"Hello", b"world", b"!"])
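
Because the tokenizer runs inside a tf.function, it can also be applied within a tf.data input pipeline; a hedged sketch under that assumption (dataset contents are illustrative, imports as in the first sketch).

tokenizer = tokenizers.OpenNMTTokenizer()
dataset = tf.data.Dataset.from_tensor_slices(["Hello world!", "How are you?"])
dataset = dataset.map(tokenizer.tokenize)
for tokens in dataset:
    print(tokens.numpy())  # e.g. [b"Hello" b"world" b"!"]
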
def initialize(self, metadata, asset_dir=None, asset_prefix=""):
  self.vocabulary_file = metadata[self.vocabulary_file_key]
  self.vocabulary_size = count_lines(self.vocabulary_file) + self.num_oov_buckets
  if self.tokenizer is None:
    tokenizer_config = _get_field(metadata, "tokenization", prefix=asset_prefix)
    if tokenizer_config:
      if isinstance(tokenizer_config, six.string_types) and compat.gfile_exists(tokenizer_config):
        with compat.gfile_open(tokenizer_config, mode="rb") as config_file:
          tokenizer_config = yaml.load(config_file, Loader=yaml.UnsafeLoader)
      self.tokenizer = tokenizers.OpenNMTTokenizer(params=tokenizer_config)
    else:
      self.tokenizer = tokenizers.SpaceTokenizer()
  self.tokenizer.initialize(metadata)
  return super(TextInputter, self).initialize(
      metadata, asset_dir=asset_dir, asset_prefix=asset_prefix)
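
A hedged sketch of the metadata this initialize() expects, based only on the fields read above: a vocabulary file under self.vocabulary_file_key plus an optional tokenization entry that may be an inline configuration or a path to a YAML file. The key names below are illustrative for a source-side inputter.

metadata = {
    "source_words_vocabulary": "src-vocab.txt",
    "source_tokenization": {"mode": "aggressive", "joiner_annotate": True},
}
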
def main():
    tf.get_logger().setLevel("INFO")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data", nargs="*", help="List of data files.")
    parser.add_argument(
        "--from_vocab",
        default=None,
        help="Build from a saved vocabulary (see also --from_format).",
    )
    parser.add_argument(
        "--from_format",
        default="default",
        choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).",
    )
    parser.add_argument("--save_vocab",
                        required=True,
                        help="Output vocabulary file.")
    parser.add_argument("--min_frequency",
                        type=int,
                        default=1,
                        help="Minimum word frequency.")
    parser.add_argument(
        "--size",
        type=int,
        default=0,
        help="Maximum vocabulary size. If 0, the vocabulary size is not limited.",
    )
    parser.add_argument(
        "--size_multiple",
        type=int,
        default=1,
        help=
        ("Ensure that the vocabulary size + 1 is a multiple of this value "
         "(+ 1 represents the <unk> token that will be added during training)."
         ),
    )
    parser.add_argument(
        "--without_sequence_tokens",
        default=False,
        action="store_true",
        help=
        "If set, do not add special sequence tokens (start, end) in the vocabulary.",
    )
    parser.add_argument(
        "--tokenizer_config",
        default=None,
        help=
        ("Tokenization configuration as a JSON string or a path to a YAML configuration file. "
         "When building a SentencePiece model and vocabulary, this is used as a "
         "pre-tokenization. SentencePiece will receive tokens instead of sentences as "
         "inputs."),
    )
    parser.add_argument(
        "--sentencepiece",
        nargs="*",
        default=None,
        help=
        ("Build a SentencePiece model and vocabulary. This option accepts additional "
         "training parameters (e.g. --sentencepiece character_coverage=0.98)."
         ),
    )
    args = parser.parse_args()

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = data.Vocab(special_tokens=special_tokens)
    num_oov_buckets = 1

    if args.sentencepiece is not None:
        if args.min_frequency > 1:
            raise ValueError(
                "--min_frequency option is not supported when training a SentencePiece "
                "model and vocabulary")

        import pyonmttok

        if args.size_multiple == 1:
            vocab_size = args.size
        else:
            # Round up so that vocab_size + num_oov_buckets is a multiple of args.size_multiple.
            vocab_size = (args.size -
                          (args.size + num_oov_buckets) % args.size_multiple +
                          args.size_multiple)

        if args.tokenizer_config:
            tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
            if not isinstance(tokenizer, tokenizers.OpenNMTTokenizer):
                tokenizer_type = tokenizer.__class__.__name__
                raise ValueError(
                    "Only tokenizer type 'OpenNMTTokenizer' can be used as a SentencePiece "
                    "pre-tokenization, got tokenizer type '%s' instead." %
                    tokenizer_type)
        else:
            tokenizer = None

        sp_params = dict(
            map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
        sp_trainer = pyonmttok.SentencePieceLearner(
            tokenizer=tokenizer.opennmt_tokenizer
            if tokenizer is not None else None,
            keep_vocab=True,
            vocab_size=vocab_size,
            **sp_params,
        )

        for data_file in args.data:
            sp_trainer.ingest_file(data_file)
        sp_trainer.learn(args.save_vocab, verbose=True)

        model_path = args.save_vocab + ".model"
        vocab_path = args.save_vocab + ".vocab"

        if tokenizer is None:
            tf.get_logger().info(
                "Converting SentencePiece vocabulary to OpenNMT-tf format...")
            vocab.load(vocab_path, file_format="sentencepiece")
        else:
            tf.get_logger().info(
                "Applying SentencePiece model on data and extracting the %d most "
                "frequent tokens...",
                vocab_size,
            )
            tokenizer = tokenizers.OpenNMTTokenizer(sp_model_path=model_path,
                                                    **tokenizer.config)
            for data_file in args.data:
                vocab.add_from_text(data_file, tokenizer=tokenizer)
            vocab = vocab.prune(max_size=vocab_size)

        vocab.serialize(vocab_path)

    else:
        if args.from_vocab is not None:
            vocab.load(args.from_vocab, file_format=args.from_format)
        tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
        for data_file in args.data:
            vocab.add_from_text(data_file, tokenizer=tokenizer)
        vocab = vocab.prune(max_size=args.size,
                            min_frequency=args.min_frequency)
        vocab.pad_to_multiple(args.size_multiple,
                              num_oov_buckets=num_oov_buckets)
        vocab.serialize(args.save_vocab)
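
A hedged worked example of the size rounding done in the SentencePiece branch above: with one OOV bucket, the computed vocab_size plus that bucket lands on a multiple of --size_multiple (the numbers are arbitrary).

size, num_oov_buckets, size_multiple = 50000, 1, 8
vocab_size = size - (size + num_oov_buckets) % size_multiple + size_multiple
print(vocab_size)  # 50007
assert (vocab_size + num_oov_buckets) % size_multiple == 0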