Esempio n. 1
0
def _get_vocabulary_mapping(current_vocab_path, new_vocab_path, mode):
    """Maps vocabulary new indices to old ones. -1 means that the entry is new."""
    current_vocab = Vocab(from_file=current_vocab_path)
    new_vocab = Vocab(from_file=new_vocab_path)
    mapping = []
    if mode == "merge":
        mapping = [i for i in range(current_vocab.size)]
        for new_word in new_vocab.words:
            if current_vocab.lookup(new_word) is None:
                mapping.append(-1)
    elif mode == "replace":
        for new_word in new_vocab.words:
            idx = current_vocab.lookup(new_word)
            if idx is not None:
                mapping.append(idx)
            else:
                mapping.append(-1)
    mapping.append(current_vocab.size)  # <unk> token is always the last entry.
    return mapping
Esempio n. 2
0
 def _saveVocab(self, name, words):
     """Serializes `words` as a vocabulary file in the temp directory.

     Each element of `words` is converted with `str` before being added to
     a fresh `Vocab`. The file is written to `name` inside the directory
     returned by `self.get_temp_dir()`, and its full path is returned.
     """
     built = Vocab()
     for token in map(str, words):
         built.add(token)
     target = os.path.join(self.get_temp_dir(), name)
     built.serialize(target)
     return target
Esempio n. 3
0
def _make_vocab_from_file(path, data_file):
    """Builds a vocabulary from `data_file` and serializes it to `path`.

    The vocabulary is created with the padding, start-of-sentence and
    end-of-sentence tokens from `constants` as special tokens, then filled
    through `Vocab.add_from_text`. Returns `path` unchanged.
    """
    reserved_tokens = [
        constants.PADDING_TOKEN,
        constants.START_OF_SENTENCE_TOKEN,
        constants.END_OF_SENTENCE_TOKEN,
    ]
    built = Vocab(special_tokens=reserved_tokens)
    built.add_from_text(data_file)
    built.serialize(path)
    return path