Beispiel #1
0
def test_is_placeholder():
    """Check `pyonmttok.is_placeholder` on a plain and a ⦅…⦆-wrapped token."""
    plain_token = "hello"
    wrapped_token = "⦅hello⦆"

    # A bare word must not be reported as a placeholder...
    assert not pyonmttok.is_placeholder(plain_token)
    # ...while the same word enclosed in ⦅ ⦆ must be.
    assert pyonmttok.is_placeholder(wrapped_token)
Beispiel #2
0
    def finalize(self):
        """Build and write the vocabulary file(s) from the collected counters.

        One vocabulary is generated per side: a single "multi" vocabulary
        when source and target share the very same counters object,
        otherwise one for "source" and/or "target" (whichever has counters).
        For each side this method:

        1. forces an infinite frequency on tokens read from the optional
           ``merge`` vocabulary file, on tokens from the optional ``add``
           list, and on the tokens registered in ``self._tokens_to_add``;
        2. orders the entries with placeholders first, followed by all other
           tokens in decreasing frequency order;
        3. calls ``self._prune`` with ``size`` and ``min-frequency`` to get
           the number of entries to keep;
        4. writes those entries (token and frequency divided by the side's
           total count) to a UTF-8 file in ``self._result_dir`` and stores
           the file path as ``vocabulary_path`` in the tokenization config.

        Returns immediately, without side effects, when neither the source
        nor the target counters hold anything.

        Raises:
            RuntimeError: If the number of tokens coming from ``merge`` and
                ``add`` exceeds the requested vocabulary size.
        """
        config = self._config
        if not self._source_counters and not self._target_counters:
            return

        tok_config = config["preprocess"][self._tok_step]

        if self._source_counters is self._target_counters:
            # Same counters object on both sides: one joint vocabulary.
            vocabularies = [("multi", self._source_counters)]
        else:
            vocabularies = []
            if self._source_counters:
                vocabularies.append(("source", self._source_counters))
            if self._target_counters:
                vocabularies.append(("target", self._target_counters))

        for side, counters in vocabularies:
            vocabulary = counters["tokens"]
            total_size = counters["total"]
            build_config = tok_config[side]["build_vocabulary"]

            name = build_config.get("name", "vocab" + str(self._tok_step))

            logger.info("Generating %s vocabulary '%s'", side, name)

            # Size option is mandatory, already checked it.
            size = build_config["size"]
            min_frequency = build_config.get("min-frequency", 0)

            added_size = 0

            # Merge previously created vocabulary.
            vocab_to_merge = build_config.get("merge")
            if vocab_to_merge and os.path.isfile(vocab_to_merge):
                for w in tokenizer.vocabulary_iterator(vocab_to_merge):
                    if w:
                        # Set heaviest frequency on tokens from vocabulary
                        # to merge.
                        vocabulary[w] = float("inf")
                        added_size += 1

            # Add extra tokens from a list.
            for w in build_config.get("add", []):
                vocabulary[w] = float("inf")
                added_size += 1

            if added_size > size:
                raise RuntimeError(
                    "The size of extra tokens from 'merge' and 'add' (%d) cannot be bigger than the required vocabulary size (%d)"
                    % (added_size, size))

            # Add tokens added by operators, such as extra numbered
            # placeholders that might not all be present in the sampled data.
            new_tokens = self._tokens_to_add.new_tokens
            if side == "multi":
                tokens_to_add = set().union(*new_tokens.values())
            else:
                tokens_to_add = new_tokens[side]

            for ph in tokens_to_add:
                vocabulary[ph] = float("inf")

            # Split the entries in a single pass: placeholders keep their
            # dictionary order, everything else gets sorted by frequency.
            placeholders = []
            regular_tokens = []
            for item in vocabulary.items():
                if pyonmttok.is_placeholder(item[0]):
                    placeholders.append(item)
                else:
                    regular_tokens.append(item)

            # Stable sort, so tokens with equal frequency keep their
            # relative dictionary order.
            regular_tokens.sort(key=lambda k_v: k_v[1], reverse=True)

            # Placeholders first, then everything else in frequency order.
            sorted_vocabulary = placeholders + regular_tokens

            # Find out the real vocabulary size.
            real_size = self._prune(sorted_vocabulary, size, min_frequency)

            # Pick the output file and record it in the configuration.
            if side == "multi":
                out_file = os.path.join(
                    self._result_dir,
                    "joint_vocab_%s-%d.%s_%s" %
                    (name, real_size, config["source"], config["target"]),
                )
                tok_config["source"]["vocabulary_path"] = out_file
                tok_config["target"]["vocabulary_path"] = out_file
            else:
                out_file = os.path.join(
                    self._result_dir,
                    "vocab_%s-%d.%s" % (name, real_size, config[side]))
                tok_config[side]["vocabulary_path"] = out_file

            # Tokens (placeholders in particular) contain non-ASCII
            # characters, so the encoding must not depend on the locale.
            with open(out_file, "w", encoding="utf-8") as vocab_file:
                # Add header with configuration
                vocab_file.write("# Generated by buildvocab\n")
                vocab_file.write("# CONFIG: {} \n".format(self._config))
                for w, f in sorted_vocabulary[:real_size]:
                    vocab_file.write("%s %s\n" % (w, f / float(total_size)))