def load_from_file(cls, vocab_type: VocabType, file, special_words: SpecialVocabWordsType) -> 'Vocab':
    """Create a vocabulary from the three pickled objects stored in `file`.

    The file is expected to hold, in this order: the word->index mapping,
    the index->word mapping and the number of stored words. None of them
    include the special words; those are re-added here on top of the stored
    entries, using indices `0 .. nr_special_words - 1`.

    Args:
        vocab_type: Passed through to the class constructor.
        file: Binary file-like object positioned at the first pickled object.
        special_words: Object whose `__dict__` values are the special words.

    Returns:
        A new instance of `cls` containing the special words and the stored words.

    Raises:
        ValueError: If the stored mappings and the stored size disagree, or if
            the smallest stored index is not equal to the number of unique
            special words.
    """
    special_words_as_unique_list = common.get_unique_list(special_words.__dict__.values())
    nr_special_words = len(special_words_as_unique_list)
    # Not every file-like object has a `name` (e.g. `io.BytesIO`); do not let
    # error reporting itself fail with an `AttributeError`.
    file_path = getattr(file, 'name', '<unknown>')

    # Notice: From historical reasons, a saved vocab doesn't include special words,
    # so they should be added upon loading.
    # NOTE(security): `pickle.load` can execute arbitrary code; only load trusted files.
    word_to_index_wo_specials = pickle.load(file)
    index_to_word_wo_specials = pickle.load(file)
    size_wo_specials = pickle.load(file)

    # Explicit check instead of `assert`, so it is not stripped under `python -O`.
    if not (len(index_to_word_wo_specials) == len(word_to_index_wo_specials) == size_wo_specials):
        raise ValueError(
            "Error while attempting to load vocabulary `{vocab_type}` from file `{file_path}`. "
            "The stored mappings are inconsistent: index-to-word has {nr_index_to_word} entries, "
            "word-to-index has {nr_word_to_index} entries, while the stored size is {size}.".format(
                vocab_type=vocab_type, file_path=file_path,
                nr_index_to_word=len(index_to_word_wo_specials),
                nr_word_to_index=len(word_to_index_wo_specials),
                size=size_wo_specials))

    # A vocabulary saved without any regular word is stored as empty mappings.
    # There is no minimum index to validate then (and `min()` of an empty
    # mapping would raise), so the offset check only runs for non-empty data.
    if index_to_word_wo_specials:
        min_word_idx_wo_specials = min(index_to_word_wo_specials)
        if min_word_idx_wo_specials != nr_special_words:
            raise ValueError(
                "Error while attempting to load vocabulary `{vocab_type}` from file `{file_path}`. "
                "The stored vocabulary has minimum word index {min_word_idx}, "
                "while expecting minimum word index to be {nr_special_words} "
                "because having to use {nr_special_words} special words, which are: {special_words}. "
                "Please check the parameter `config.SEPARATE_OOV_AND_PAD`.".format(
                    vocab_type=vocab_type, file_path=file_path, min_word_idx=min_word_idx_wo_specials,
                    nr_special_words=nr_special_words, special_words=special_words))

    vocab = cls(vocab_type, [], special_words)
    # Special-word entries are merged last, so they win on any key collision.
    vocab.word_to_index = {
        **word_to_index_wo_specials,
        **{word: idx for idx, word in enumerate(special_words_as_unique_list)}}
    vocab.index_to_word = {
        **index_to_word_wo_specials,
        **{idx: word for idx, word in enumerate(special_words_as_unique_list)}}
    vocab.size = size_wo_specials + nr_special_words
    return vocab
def save_to_file(self, file):
    """Pickle the vocabulary, without its special words, into `file`.

    Three objects are written in this order: the word->index mapping, the
    index->word mapping and the number of remaining words. Entries whose
    index is below the number of unique special words are left out.

    Args:
        file: Writable binary file-like object that receives the pickles.
    """
    # Notice: From historical reasons, a saved vocab doesn't include special words.
    first_regular_index = len(common.get_unique_list(self.special_words.__dict__.values()))

    regular_word_to_index = {}
    for word, idx in self.word_to_index.items():
        if idx >= first_regular_index:
            regular_word_to_index[word] = idx

    regular_index_to_word = {}
    for idx, word in self.index_to_word.items():
        if idx >= first_regular_index:
            regular_index_to_word[idx] = word

    regular_size = self.size - first_regular_index

    # The dump order matters: loading reads the objects back in the same sequence.
    for payload in (regular_word_to_index, regular_index_to_word, regular_size):
        pickle.dump(payload, file)
def __init__(self, vocab_type: VocabType, words: Iterable[str],
             special_words: Optional[SpecialVocabWordsType] = None):
    """Index the special words followed by `words`, starting from 0.

    Args:
        vocab_type: Stored as-is on the instance.
        words: Words to index after the special words, in iteration order.
        special_words: Object whose `__dict__` values are the special words;
            an empty `Namespace` is used when omitted.
    """
    if special_words is None:
        special_words = Namespace()

    self.vocab_type = vocab_type
    self.special_words: SpecialVocabWordsType = special_words
    self._word_to_index_lookup_table = None
    self._index_to_word_lookup_table = None

    # Special words come first, so they take the lowest indices.
    ordered_words = list(chain(common.get_unique_list(special_words.__dict__.values()), words))

    # For a repeated word the last position wins in `word_to_index`, while
    # `index_to_word` keeps one entry per position.
    self.word_to_index: Dict[str, int] = {word: position for position, word in enumerate(ordered_words)}
    self.index_to_word: Dict[int, str] = dict(enumerate(ordered_words))
    self.size = len(self.word_to_index)