def _get_vocabulary_mapping(current_vocab_path, new_vocab_path, mode):
    """Maps vocabulary new indices to old ones. -1 means that the entry is new.

    Both vocabularies are loaded with ``Vocab(from_file=...)``.

    Args:
      current_vocab_path: Path to the vocabulary file currently in use.
      new_vocab_path: Path to the vocabulary file to map from.
      mode: ``"merge"`` or ``"replace"``.
        With ``"merge"``, the mapping starts with every index of the current
        vocabulary in order, followed by one ``-1`` for each word of the new
        vocabulary that the current vocabulary does not contain.
        With ``"replace"``, the mapping has one entry per word of the new
        vocabulary: its index in the current vocabulary, or ``-1`` when the
        lookup returns ``None``.
        Any other value produces no per-word entries.

    Returns:
      A list of integers. Whatever the mode, the last element is the size of
      the current vocabulary, which stands for the ``<unk>`` token.
    """
    old_vocab = Vocab(from_file=current_vocab_path)
    updated_vocab = Vocab(from_file=new_vocab_path)

    if mode == "merge":
        # Existing entries keep their positions; unseen words are flagged as new.
        index_map = list(range(old_vocab.size))
        index_map.extend(
            -1 for word in updated_vocab.words
            if old_vocab.lookup(word) is None)
    elif mode == "replace":
        index_map = []
        for word in updated_vocab.words:
            old_index = old_vocab.lookup(word)
            index_map.append(-1 if old_index is None else old_index)
    else:
        # Unrecognized modes fall through with only the trailing entry below.
        index_map = []

    index_map.append(old_vocab.size)  # <unk> token is always the last entry.
    return index_map
def _saveVocab(self, name, words):
    """Writes `words` to a vocabulary file in the temporary directory.

    Args:
      name: File name to create inside ``self.get_temp_dir()``.
      words: Iterable of entries; each one is converted with ``str`` before
        being added to the vocabulary.

    Returns:
      The path of the serialized vocabulary file.
    """
    vocabulary = Vocab()
    for token in map(str, words):
        vocabulary.add(token)

    output_path = os.path.join(self.get_temp_dir(), name)
    vocabulary.serialize(output_path)
    return output_path
def _make_vocab_from_file(path, data_file):
    """Builds a vocabulary from a text file and serializes it to `path`.

    The vocabulary is created with the padding, start-of-sentence and
    end-of-sentence tokens as special tokens, then filled from `data_file`
    through ``Vocab.add_from_text``.

    Args:
      path: Destination of the serialized vocabulary.
      data_file: Text file the vocabulary entries are read from.

    Returns:
      `path`, unchanged.
    """
    special_tokens = [
        constants.PADDING_TOKEN,
        constants.START_OF_SENTENCE_TOKEN,
        constants.END_OF_SENTENCE_TOKEN,
    ]
    vocabulary = Vocab(special_tokens=special_tokens)
    vocabulary.add_from_text(data_file)
    vocabulary.serialize(path)
    return path