def _convert_vocab(self, vocab_file, basename=None):
    if basename is None:
        basename = os.path.basename(vocab_file)
    converted_vocab_file = os.path.join(self._data_dir, basename)
    with open(converted_vocab_file, 'w') as converted_vocab:
        for index, token in enumerate(
                tokenizer.vocabulary_iterator(vocab_file)):
            self._map_vocab_entry(index, token, converted_vocab)
    return converted_vocab_file
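The helper delegates to _map_vocab_entry, which is framework specific and not shown here. A minimal sketch of what such a hook could look like, assuming the target framework simply expects one "token<TAB>index" line per entry (the format is illustrative, not the actual implementation):

def _map_vocab_entry(self, index, token, converted_vocab):
    # Hypothetical mapping: one "token<TAB>index" line per vocabulary entry.
    # A real framework adapter may instead reserve fixed slots for special
    # tokens or drop the index entirely.
    converted_vocab.write("%s\t%d\n" % (token, index))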
def test_vocabulary_iterator(tmpdir):
    vocab_path = str(tmpdir.join("vocab.txt"))
    with open(vocab_path, "w") as vocab_file:
        vocab_file.write("# Comment 1\n")
        vocab_file.write("# Comment 2\n")
        vocab_file.write("\n")
        vocab_file.write("hello\n")
        vocab_file.write("world 42\n")
        vocab_file.write("toto 0.0224656\n")
        vocab_file.write("titi 2.8989e-08\n")
        vocab_file.write("hello world\n")  # Bad token with a space.

    tokens = list(tokenizer.vocabulary_iterator(vocab_path))
    assert tokens == ["", "hello", "world", "toto", "titi", "hello world"]
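The assertions above pin down the behavior of tokenizer.vocabulary_iterator: leading '#' comment lines are skipped, a trailing field that parses as a float is treated as a frequency and stripped, and everything else (including the empty line and the badly formatted "hello world") is yielded as-is. A minimal sketch of an iterator that satisfies these assertions, not necessarily the actual implementation:

def vocabulary_iterator_sketch(vocabulary_path):
    # Sketch only: yield each token of a vocabulary file produced by build_vocab.
    with open(vocabulary_path) as vocabulary_file:
        header = True
        for line in vocabulary_file:
            if header and line.startswith("#"):
                continue  # Skip the leading comment header.
            header = False
            fields = line.rstrip("\r\n").split(" ")
            if len(fields) > 1:
                try:
                    float(fields[-1])
                    fields.pop()  # Last field is a frequency: drop it.
                except ValueError:
                    pass  # Badly formatted token containing a space.
            yield " ".join(fields)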
    def _build_process(self, config, side, build_state):
        # Disable subword regularization in inference.
        if self.process_type != prepoperator.ProcessType.TRAINING:
            config["bpe_dropout"] = 0
            config["sp_nbest_size"] = 0
            config["sp_alpha"] = 0

        if config.get("restrict_subword_vocabulary", False):
            vocabulary_path = build_state.get(
                "src_vocabulary" if side == "source" else "tgt_vocabulary")
            if vocabulary_path is None:
                raise ValueError(
                    "restrict_subword_vocabulary is set but no vocabulary file is set")

            # The open source Tokenizer does not accept the custom vocabulary format
            # produced by build_vocab so we create a temporary vocabulary with a simpler
            # format.
            with tempfile.NamedTemporaryFile(mode="w") as vocab_file:
                for token in tokenizer.vocabulary_iterator(vocabulary_path):
                    vocab_file.write("%s\n" % token)
                vocab_file.flush()
                config["vocabulary_path"] = vocab_file.name
                current_tokenizer = tokenizer.build_tokenizer(config)
        else:
            current_tokenizer = tokenizer.build_tokenizer(config)

        previous_tokenizer = None
        if build_state:
            if side == "source":
                previous_tokenizer = build_state["src_tokenizer"]
                build_state["src_tokenizer"] = current_tokenizer
            else:
                previous_tokenizer = build_state["tgt_tokenizer"]
                build_state["tgt_tokenizer"] = current_tokenizer
        if self.process_type == prepoperator.ProcessType.POSTPROCESS and not self._postprocess_only:
            return previous_tokenizer
        return current_tokenizer
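For reference, a hedged example of the inputs this method expects; only the keys actually read by _build_process are taken from the code above, and the values and paths are placeholders:

config = {
    # A real tokenization config would also carry the tokenizer mode and
    # subword model options; only the keys read above are listed here.
    "bpe_dropout": 0.1,                    # reset to 0 outside training
    "sp_nbest_size": 64,                   # reset to 0 outside training
    "sp_alpha": 0.1,                       # reset to 0 outside training
    "restrict_subword_vocabulary": True,   # triggers the temporary vocabulary above
}
build_state = {
    "src_vocabulary": "/path/to/src-vocab.txt",  # e.g. produced by build_vocab
    "tgt_vocabulary": "/path/to/tgt-vocab.txt",
    "src_tokenizer": None,                       # replaced by the tokenizer built here
    "tgt_tokenizer": None,
}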
Example #4
    def finalize(self):
        config = self._config
        if not self._source_counters and not self._target_counters:
            return

        tok_config = config["preprocess"][self._tok_step]

        if self._source_counters is self._target_counters:
            vocabularies = [("multi", self._source_counters)]
        else:
            vocabularies = []
            if self._source_counters:
                vocabularies.append(("source", self._source_counters))
            if self._target_counters:
                vocabularies.append(("target", self._target_counters))

        for side, counters in vocabularies:
            vocabulary = counters["tokens"]

            total_size = counters["total"]
            name = tok_config[side]["build_vocabulary"].get(
                "name", "vocab" + str(self._tok_step))

            logger.info("Generating %s vocabulary '%s'", side, name)

            # The size option is mandatory and was validated earlier.
            size = tok_config[side]["build_vocabulary"]["size"]

            min_frequency = tok_config[side]["build_vocabulary"].get(
                "min-frequency", 0)

            added_size = 0

            # Merge previously created vocabulary.
            vocab_to_merge = tok_config[side]["build_vocabulary"].get("merge")

            if vocab_to_merge and os.path.isfile(vocab_to_merge):
                for w in tokenizer.vocabulary_iterator(vocab_to_merge):
                    if w:
                        # Set heaviest frequency on tokens from vocabulary to merge.
                        vocabulary[w] = float("inf")
                        added_size += 1

            # Add extra tokens from a list.
            vocab_to_add = tok_config[side]["build_vocabulary"].get("add", [])

            for w in vocab_to_add:
                vocabulary[w] = float("inf")
                added_size += 1

            if added_size > size:
                raise RuntimeError(
                    "The size of extra tokens from 'merge' and 'add' (%d) cannot be "
                    "bigger than the required vocabulary size (%d)" % (added_size, size))

            # Add tokens injected by operators, such as extra numbered placeholders
            # that might not all be present in the sampled data.
            new_tokens = self._tokens_to_add.new_tokens
            if side == "multi":
                tokens_to_add = set().union(*new_tokens.values())
            else:
                tokens_to_add = new_tokens[side]

            for ph in tokens_to_add:
                vocabulary[ph] = float("inf")

            # First add placeholders to vocabulary.
            sorted_vocabulary = [
                item for item in vocabulary.items()
                if pyonmttok.is_placeholder(item[0])
            ]

            # Then add everything else in frequency order.
            sorted_vocabulary.extend(
                sorted(
                    [
                        item for item in vocabulary.items()
                        if not pyonmttok.is_placeholder(item[0])
                    ],
                    key=lambda k_v: k_v[1],
                    reverse=True,
                ))

            # Find out the real vocabulary size.
            real_size = self._prune(sorted_vocabulary, size, min_frequency)

            # Write to file.
            if side == "multi":
                out_file = os.path.join(
                    self._result_dir,
                    "joint_vocab_%s-%d.%s_%s" %
                    (name, real_size, config["source"], config["target"]),
                )
                tok_config["source"]["vocabulary_path"] = out_file
                tok_config["target"]["vocabulary_path"] = out_file

            else:
                out_file = os.path.join(
                    self._result_dir,
                    "vocab_%s-%d.%s" % (name, real_size, config[side]))
                tok_config[side]["vocabulary_path"] = out_file

            with open(out_file, "w") as vocab_file:
                # Add header with configuration
                vocab_file.write("# Generated by buildvocab\n")
                vocab_file.write("# CONFIG: {} \n".format(self._config))
                for i in range(real_size):
                    w, f = sorted_vocabulary[i]
                    vocab_file.write("%s %s\n" % (w, f / float(total_size)))
Example #5
def test_op_adding_tokens(tmpdir):
    @prepoperator.register_operator("op_adding_tokens")
    class OpAddingTokens(prepoperator.TUOperator):
        def _preprocess_tu(self, tu, meta_batch):
            meta_batch["tokens_to_add"] = {
                "source": ["a", "b", "b"],
                "target": ["c"]
            }
            return [tu]

    corpus_dir = tmpdir.join("corpus")
    corpus_dir.mkdir()

    generate_pseudo_corpus(corpus_dir, 100, "generic_corpus", "en")
    generate_pseudo_corpus(corpus_dir, 100, "generic_corpus", "de")

    config = {
        "source": "en",
        "target": "de",
        "data": {
            "sample": 0,
            "sample_dist": [{
                "path": str(corpus_dir),
                "distribution": [
                    ["generic", "*"],
                ],
            }],
        },
        "preprocess": [
            {
                "op": "op_adding_tokens",
            },
            {
                "op": "tokenization",
                "source": {
                    "mode": "space",
                    "build_vocabulary": {"size": 20},
                },
                "target": {
                    "mode": "space",
                    "build_vocabulary": {"size": 20},
                },
            },
        ],
    }

    preprocessor = TrainingProcessor(config, "", str(tmpdir))
    _, _, vocab_config = preprocessor.generate_vocabularies()
    source_vocabulary = set(vocabulary_iterator(
        vocab_config["source"]["path"]))
    target_vocabulary = set(vocabulary_iterator(
        vocab_config["target"]["path"]))
    assert source_vocabulary > set(["a", "b"])
    assert target_vocabulary > set(["c"])

    _, _, _, _, tokens_to_add = preprocessor.generate_preprocessed_data()
    assert set(tokens_to_add["source"]) == set(["a", "b"])
    assert set(tokens_to_add["target"]) == set(["c"])
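Taken together with finalize above, this test traces the full path of operator-injected tokens: generate_preprocessed_data reports them through tokens_to_add, and the vocabulary builder gives them an infinite count so they sort ahead of regular tokens and survive pruning into the generated source and target vocabularies.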