def _convert_vocab(self, vocab_file, basename=None):
    if basename is None:
        basename = os.path.basename(vocab_file)
    converted_vocab_file = os.path.join(self._data_dir, basename)
    with open(converted_vocab_file, 'w') as converted_vocab:
        for index, token in enumerate(tokenizer.vocabulary_iterator(vocab_file)):
            self._map_vocab_entry(index, token, converted_vocab)
    return converted_vocab_file
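# _convert_vocab() delegates the output format to the _map_vocab_entry() hook,
# which is framework-specific and not shown here. The sketch below is an
# assumption for illustration only: a minimal hook that writes one token per
# line and ignores the enumeration index. Real frameworks may also emit the
# index or reserve special tokens at fixed positions.
def _map_vocab_entry(self, index, token, converted_vocab):
    # Hypothetical sketch: write the token on its own line.
    converted_vocab.write("%s\n" % token)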
def test_vocabulary_iterator(tmpdir):
    vocab_path = str(tmpdir.join("vocab.txt"))
    with open(vocab_path, "w") as vocab_file:
        vocab_file.write("# Comment 1\n")
        vocab_file.write("# Comment 2\n")
        vocab_file.write("\n")
        vocab_file.write("hello\n")
        vocab_file.write("world 42\n")
        vocab_file.write("toto 0.0224656\n")
        vocab_file.write("titi 2.8989e-08\n")
        vocab_file.write("hello world\n")  # Bad token with a space.
    tokens = list(tokenizer.vocabulary_iterator(vocab_path))
    assert tokens == ["", "hello", "world", "toto", "titi", "hello world"]
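# The test above pins down the expected behavior of tokenizer.vocabulary_iterator:
# the leading "#" comment block is skipped, a trailing numeric field is treated as
# a frequency and dropped, and everything else is yielded verbatim (including the
# empty line and the badly formatted "hello world" entry). The sketch below is an
# assumption that merely satisfies those expectations; the real implementation may
# differ in its details.
def vocabulary_iterator_sketch(vocabulary_path):
    with open(vocabulary_path) as vocabulary_file:
        header = True
        for line in vocabulary_file:
            # Skip the leading comment block only.
            if header and line.startswith("#"):
                continue
            header = False
            fields = line.rstrip("\r\n").split(" ")
            if len(fields) > 1:
                try:
                    # Drop a trailing frequency value if present.
                    float(fields[-1])
                    fields.pop()
                except ValueError:
                    pass
            yield " ".join(fields)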
def _build_process(self, config, side, build_state):
    # Disable subword regularization in inference.
    if self.process_type != prepoperator.ProcessType.TRAINING:
        config["bpe_dropout"] = 0
        config["sp_nbest_size"] = 0
        config["sp_alpha"] = 0

    if config.get("restrict_subword_vocabulary", False):
        vocabulary_path = build_state.get(
            "src_vocabulary" if side == "source" else "tgt_vocabulary")
        if vocabulary_path is None:
            raise ValueError(
                "restrict_subword_vocabulary is set but no vocabulary is set")

        # The open source Tokenizer does not accept the custom vocabulary format
        # produced by build_vocab so we create a temporary vocabulary with a simpler
        # format.
        with tempfile.NamedTemporaryFile(mode="w") as vocab_file:
            for token in tokenizer.vocabulary_iterator(vocabulary_path):
                vocab_file.write("%s\n" % token)
            vocab_file.flush()

            config["vocabulary_path"] = vocab_file.name
            current_tokenizer = tokenizer.build_tokenizer(config)
    else:
        current_tokenizer = tokenizer.build_tokenizer(config)

    previous_tokenizer = None
    if build_state:
        if side == "source":
            previous_tokenizer = build_state["src_tokenizer"]
            build_state["src_tokenizer"] = current_tokenizer
        else:
            previous_tokenizer = build_state["tgt_tokenizer"]
            build_state["tgt_tokenizer"] = current_tokenizer
    if self.process_type == prepoperator.ProcessType.POSTPROCESS and not self._postprocess_only:
        return previous_tokenizer
    return current_tokenizer
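# Example (hypothetical) tokenization options showing where
# restrict_subword_vocabulary fits in. When it is enabled, _build_process()
# rewrites the build_vocab output into the plain one-token-per-line format that
# the open source Tokenizer expects, points "vocabulary_path" at that temporary
# file, and only then builds the tokenizer. Key names other than those read in
# the code above are illustrative assumptions.
example_tokenization_config = {
    "mode": "none",
    "sp_model_path": "/path/to/model.sp",  # assumed subword model option
    "sp_nbest_size": 64,                   # reset to 0 outside training
    "sp_alpha": 0.1,                       # reset to 0 outside training
    "restrict_subword_vocabulary": True,   # triggers the temporary vocabulary
}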
def finalize(self):
    config = self._config
    if not self._source_counters and not self._target_counters:
        return
    tok_config = config["preprocess"][self._tok_step]
    if self._source_counters is self._target_counters:
        vocabularies = [("multi", self._source_counters)]
    else:
        vocabularies = []
        if self._source_counters:
            vocabularies.append(("source", self._source_counters))
        if self._target_counters:
            vocabularies.append(("target", self._target_counters))
    for side, counters in vocabularies:
        vocabulary = counters["tokens"]
        total_size = counters["total"]
        name = (tok_config[side]["build_vocabulary"]["name"]
                if "name" in tok_config[side]["build_vocabulary"]
                else "vocab" + str(self._tok_step))
        logger.info("Generating %s vocabulary '%s'", side, name)

        # The size option is mandatory and was already checked.
        size = tok_config[side]["build_vocabulary"]["size"]

        min_frequency = (tok_config[side]["build_vocabulary"]["min-frequency"]
                         if "min-frequency" in tok_config[side]["build_vocabulary"]
                         else 0)

        added_size = 0

        # Merge previously created vocabulary.
        vocab_to_merge = (tok_config[side]["build_vocabulary"]["merge"]
                          if "merge" in tok_config[side]["build_vocabulary"]
                          else None)
        if vocab_to_merge and os.path.isfile(vocab_to_merge):
            for w in tokenizer.vocabulary_iterator(vocab_to_merge):
                if w:
                    # Set the heaviest frequency on tokens from the vocabulary to merge.
                    vocabulary[w] = float("inf")
                    added_size += 1

        # Add extra tokens from a list.
        vocab_to_add = (tok_config[side]["build_vocabulary"]["add"]
                        if "add" in tok_config[side]["build_vocabulary"]
                        else [])
        for w in vocab_to_add:
            vocabulary[w] = float("inf")
            added_size += 1

        if added_size > size:
            raise RuntimeError(
                "The size of extra tokens from 'merge' and 'add' (%d) cannot be "
                "bigger than the required vocabulary size (%d)" % (added_size, size))

        # Add tokens added by operators, such as extra numbered placeholders that
        # might not all be present in the sampled data.
        new_tokens = self._tokens_to_add.new_tokens
        if side == "multi":
            tokens_to_add = set().union(*new_tokens.values())
        else:
            tokens_to_add = new_tokens[side]
        for ph in tokens_to_add:
            vocabulary[ph] = float("inf")

        # First add placeholders to vocabulary.
        sorted_vocabulary = [
            item for item in vocabulary.items()
            if pyonmttok.is_placeholder(item[0])
        ]

        # Then add everything else in frequency order.
        sorted_vocabulary.extend(
            sorted(
                [
                    item for item in vocabulary.items()
                    if not pyonmttok.is_placeholder(item[0])
                ],
                key=lambda k_v: k_v[1],
                reverse=True,
            ))

        # Find out the real vocabulary size.
        real_size = self._prune(sorted_vocabulary, size, min_frequency)

        # Write to file.
        if side == "multi":
            out_file = os.path.join(
                self._result_dir,
                "joint_vocab_%s-%d.%s_%s"
                % (name, real_size, config["source"], config["target"]),
            )
            tok_config["source"]["vocabulary_path"] = out_file
            tok_config["target"]["vocabulary_path"] = out_file
        else:
            out_file = os.path.join(
                self._result_dir,
                "vocab_%s-%d.%s" % (name, real_size, config[side]))
            tok_config[side]["vocabulary_path"] = out_file

        with open(out_file, "w") as vocab_file:
            # Add a header with the configuration.
            vocab_file.write("# Generated by buildvocab\n")
            vocab_file.write("# CONFIG: {} \n".format(self._config))
            for i in range(real_size):
                w, f = sorted_vocabulary[i]
                vocab_file.write("%s %s\n" % (w, f / float(total_size)))
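# finalize() delegates the final size computation to self._prune(), which is not
# shown here. The sketch below is an assumption inferred from the call site: it
# caps the vocabulary at the requested size and discards entries whose frequency
# falls below the min-frequency threshold. The real method may behave differently.
def _prune_sketch(sorted_vocabulary, size, min_frequency):
    real_size = len(sorted_vocabulary)
    if min_frequency:
        # Entries forced to float("inf") (merged, added, and operator tokens)
        # always pass the frequency threshold.
        real_size = sum(1 for _, freq in sorted_vocabulary if freq >= min_frequency)
    return min(real_size, size)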
def test_op_adding_tokens(tmpdir):
    @prepoperator.register_operator("op_adding_tokens")
    class OpAddingTokens(prepoperator.TUOperator):
        def _preprocess_tu(self, tu, meta_batch):
            meta_batch["tokens_to_add"] = {
                "source": ["a", "b", "b"],
                "target": ["c"],
            }
            return [tu]

    corpus_dir = tmpdir.join("corpus")
    corpus_dir.mkdir()
    generate_pseudo_corpus(corpus_dir, 100, "generic_corpus", "en")
    generate_pseudo_corpus(corpus_dir, 100, "generic_corpus", "de")

    config = {
        "source": "en",
        "target": "de",
        "data": {
            "sample": 0,
            "sample_dist": [
                {
                    "path": str(corpus_dir),
                    "distribution": [
                        ["generic", "*"],
                    ],
                }
            ],
        },
        "preprocess": [
            {
                "op": "op_adding_tokens",
            },
            {
                "op": "tokenization",
                "source": {"mode": "space", "build_vocabulary": {"size": 20}},
                "target": {"mode": "space", "build_vocabulary": {"size": 20}},
            },
        ],
    }

    preprocessor = TrainingProcessor(config, "", str(tmpdir))
    _, _, vocab_config = preprocessor.generate_vocabularies()
    source_vocabulary = set(vocabulary_iterator(vocab_config["source"]["path"]))
    target_vocabulary = set(vocabulary_iterator(vocab_config["target"]["path"]))
    assert source_vocabulary > set(["a", "b"])
    assert target_vocabulary > set(["c"])
    _, _, _, _, tokens_to_add = preprocessor.generate_preprocessed_data()
    assert set(tokens_to_add["source"]) == set(["a", "b"])
    assert set(tokens_to_add["target"]) == set(["c"])