def build_tokenizer_by_config(self, tok_config, lang):
    if tok_config is None:
        # Default to aggressive tokenization; for Chinese, also segment on
        # Han characters and on alphabet changes.
        tok_config = {"mode": "aggressive"}
        if lang == "zh":
            tok_config["segment_alphabet"] = ["Han"]
            tok_config["segment_alphabet_change"] = True
    # Force a deterministic segmentation: disable SentencePiece subword
    # sampling even if the incoming configuration enables it.
    if "sp_nbest_size" in tok_config:
        tok_config["sp_nbest_size"] = 0
    return tokenizer.build_tokenizer(tok_config)
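A usage sketch for the method above: it takes self, so it needs a host class. The Preprocessor instance and the config values below are hypothetical; tokenizer.build_tokenizer comes from the snippet itself.

prep = Preprocessor()  # hypothetical class hosting the method above

# No explicit config for Chinese: falls back to aggressive mode with
# Han-script segmentation enabled.
zh_tokenizer = prep.build_tokenizer_by_config(None, "zh")

# A user-supplied sp_nbest_size is overwritten with 0, so SentencePiece
# sampling stays disabled regardless of the incoming configuration.
en_tokenizer = prep.build_tokenizer_by_config(
    {"mode": "none", "sp_nbest_size": 64}, "en"
)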
Example #3
def _build_subword_learner(tok_config, result_dir, ref_tok_config=None):
    subword_config = tok_config.get("build_subword")
    if subword_config is None:
        # No subword model to learn: return an empty info dict.
        return {}
    if ref_tok_config is None:
        ref_tok_config = tok_config
    # The learner pre-tokenizes its training data with the reference
    # tokenization before learning the subword model.
    subword_info = tokenizer.make_subword_learner(
        subword_config,
        result_dir,
        tokenizer=tokenizer.build_tokenizer(ref_tok_config))
    return subword_info
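A usage sketch, assuming make_subword_learner returns a dict describing the learner; the build_subword keys below (type, size) are illustrative assumptions, not a documented schema.

tok_config = {
    "mode": "aggressive",
    "build_subword": {"type": "bpe", "size": 32000},  # assumed keys
}

# Returns {} when no "build_subword" section is present, so the result
# can simply be tested for truthiness.
subword_info = _build_subword_learner(tok_config, "/tmp/subword")
if not subword_info:
    print("no subword learner configured")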
Example #4
    def _build_process(self, config, side, build_state):
        # Disable subword regularization in inference.
        if self.process_type != prepoperator.ProcessType.TRAINING:
            config["bpe_dropout"] = 0
            config["sp_nbest_size"] = 0
            config["sp_alpha"] = 0

        if config.get("restrict_subword_vocabulary", False):
            vocabulary_path = build_state.get(
                "src_vocabulary" if side == "source" else "tgt_vocabulary"
            )
            if vocabulary_path is None:
                raise ValueError(
                    "restrict_subword_vocabulary is set but no vocabulary is set"
                )

            # The open source Tokenizer does not accept the custom vocabulary
            # format produced by build_vocab, so we write a temporary
            # vocabulary file in a simpler one-token-per-line format.
            with tempfile.NamedTemporaryFile(mode="w") as vocab_file:
                for token in tokenizer.load_vocabulary(vocabulary_path):
                    vocab_file.write("%s\n" % token)
                vocab_file.flush()
                config["vocabulary_path"] = vocab_file.name
                # Build the tokenizer while the temporary file still exists;
                # it is deleted as soon as this block exits.
                current_tokenizer = tokenizer.build_tokenizer(config)
        else:
            current_tokenizer = tokenizer.build_tokenizer(config)

        previous_tokenizer = None
        if build_state:
            # Remember the tokenizer from the previous build (if any) and
            # store the new one so the next call can retrieve it.
            if side == "source":
                previous_tokenizer = build_state["src_tokenizer"]
                build_state["src_tokenizer"] = current_tokenizer
            else:
                previous_tokenizer = build_state["tgt_tokenizer"]
                build_state["tgt_tokenizer"] = current_tokenizer
        # Postprocessing detokenizes with the tokenizer that produced the
        # tokens in the first place, i.e. the previous one.
        if (self.process_type == prepoperator.ProcessType.POSTPROCESS
                and not self._postprocess_only):
            return previous_tokenizer
        return current_tokenizer
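To illustrate the return-value logic at the end: preprocessing returns the freshly built tokenizer and caches it, while postprocessing hands back the previous one. In the sketch below, op and the shape of build_state are hypothetical stand-ins; ProcessType comes from the snippet.

build_state = {"src_tokenizer": None, "tgt_tokenizer": None}

# Preprocessing / training: builds and returns a fresh tokenizer, and
# stores it in build_state for the next call.
tok = op._build_process({"mode": "conservative"}, "source", build_state)
assert build_state["src_tokenizer"] is tok

# In POSTPROCESS mode (unless _postprocess_only is set), the same call
# returns the previously stored tokenizer instead, so detokenization
# matches the tokenization that produced the tokens.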