Example #1
    def load_from_file(cls, vocab_type: VocabType, file, special_words: SpecialVocabWordsType) -> 'Vocab':
        special_words_as_unique_list = common.get_unique_list(special_words.__dict__.values())

        # Notice: For historical reasons, a saved vocab doesn't include special words,
        #         so they must be added upon loading.

        word_to_index_wo_specials = pickle.load(file)
        index_to_word_wo_specials = pickle.load(file)
        size_wo_specials = pickle.load(file)
        assert len(index_to_word_wo_specials) == len(word_to_index_wo_specials) == size_wo_specials
        min_word_idx_wo_specials = min(index_to_word_wo_specials.keys())

        if min_word_idx_wo_specials != len(special_words_as_unique_list):
            raise ValueError(
                "Error while attempting to load vocabulary `{vocab_type}` from file `{file_path}`. "
                "The stored vocabulary has minimum word index {min_word_idx}, "
                "while expecting minimum word index to be {nr_special_words} "
                "because having to use {nr_special_words} special words, which are: {special_words}. "
                "Please check the parameter `config.SEPARATE_OOV_AND_PAD`.".format(
                    vocab_type=vocab_type, file_path=file.name, min_word_idx=min_word_idx_wo_specials,
                    nr_special_words=len(special_words_as_unique_list), special_words=special_words))

        vocab = cls(vocab_type, [], special_words)
        vocab.word_to_index = {**word_to_index_wo_specials,
                               **{word: idx for idx, word in enumerate(special_words_as_unique_list)}}
        vocab.index_to_word = {**index_to_word_wo_specials,
                               **{idx: word for idx, word in enumerate(special_words_as_unique_list)}}
        vocab.size = size_wo_specials + len(special_words_as_unique_list)
        return vocab
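
A minimal usage sketch for the loader above, assuming the file was written by `save_to_file` (Example #2) and opened in binary mode; the `VocabType.Token` member, the `'vocab.bin'` path, and the PAD/OOV special-word names are illustrative assumptions, not taken from the examples:

    # Hypothetical call site: the pickled stream must have been produced by
    # `save_to_file`, and `special_words` must match the configuration used at save time.
    from argparse import Namespace

    special_words = Namespace(PAD='<PAD>', OOV='<OOV>')
    with open('vocab.bin', 'rb') as file:
        token_vocab = Vocab.load_from_file(VocabType.Token, file, special_words)
    print(token_vocab.size)  # size stored on disk + number of special words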
Example #2
    def save_to_file(self, file):
        # Notice: For historical reasons, a saved vocab doesn't include special words.
        special_words_as_unique_list = common.get_unique_list(self.special_words.__dict__.values())
        nr_special_words = len(special_words_as_unique_list)
        word_to_index_wo_specials = {word: idx for word, idx in self.word_to_index.items() if idx >= nr_special_words}
        index_to_word_wo_specials = {idx: word for idx, word in self.index_to_word.items() if idx >= nr_special_words}
        size_wo_specials = self.size - nr_special_words
        pickle.dump(word_to_index_wo_specials, file)
        pickle.dump(index_to_word_wo_specials, file)
        pickle.dump(size_wo_specials, file)
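
Taken together with Example #1, the save/load pair can be exercised as a round trip. A hedged sketch, assuming the `Vocab` constructor from Example #3 and the same hypothetical PAD/OOV special words and file path as above:

    # Hypothetical round trip: build a vocab, persist it without its special words,
    # then restore it and check that the mappings survive.
    from argparse import Namespace

    special_words = Namespace(PAD='<PAD>', OOV='<OOV>')
    original = Vocab(VocabType.Token, ['foo', 'bar', 'baz'], special_words)
    with open('vocab.bin', 'wb') as file:
        original.save_to_file(file)
    with open('vocab.bin', 'rb') as file:
        restored = Vocab.load_from_file(VocabType.Token, file, special_words)
    assert restored.word_to_index == original.word_to_index
    assert restored.size == original.size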
Example #3
    def __init__(self,
                 vocab_type: VocabType,
                 words: Iterable[str],
                 special_words: Optional[SpecialVocabWordsType] = None):
        if special_words is None:
            special_words = Namespace()

        self.vocab_type = vocab_type
        self.word_to_index: Dict[str, int] = {}
        self.index_to_word: Dict[int, str] = {}
        self._word_to_index_lookup_table = None
        self._index_to_word_lookup_table = None
        self.special_words: SpecialVocabWordsType = special_words

        # Special words occupy the lowest indices; the regular vocabulary words follow.
        for index, word in enumerate(
                chain(common.get_unique_list(special_words.__dict__.values()),
                      words)):
            self.word_to_index[word] = index
            self.index_to_word[index] = word

        self.size = len(self.word_to_index)
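
The constructor places the de-duplicated special words at the lowest indices and the regular words after them, which is exactly the invariant the loader in Example #1 checks via `min_word_idx_wo_specials`. A small sketch under the same assumed PAD/OOV names and illustrative word list:

    # With two special words and three regular words, the special words take
    # indices 0 and 1, and the regular words follow at indices 2..4.
    from argparse import Namespace

    special_words = Namespace(PAD='<PAD>', OOV='<OOV>')
    vocab = Vocab(VocabType.Token, ['foo', 'bar', 'baz'], special_words)
    assert {vocab.index_to_word[0], vocab.index_to_word[1]} == {'<PAD>', '<OOV>'}
    assert vocab.word_to_index['foo'] == 2
    assert vocab.size == 5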