def vocab_to_dictionary(vocab: Vocabulary) -> Dictionary:
    """
    Build a fairseq Dictionary that mirrors a seqp Vocabulary.

    The Dictionary is first constructed with the vocabulary's pad, eos and
    unk symbols. Its internal tables are then wiped and refilled directly
    from the vocabulary, so that no slot is reserved for the Lua
    compatibility token; the intent is that every symbol keeps the token ID
    it has in the original Vocabulary.

    :param vocab: Vocabulary to convert to Dictionary.
    :return: Resulting Dictionary.
    """
    id_to_symbol = vocab.idx2symbol
    special_symbols = {
        "pad": id_to_symbol[vocab.pad_id],
        "eos": id_to_symbol[vocab.eos_id],
        "unk": id_to_symbol[vocab.unk_id],
    }
    result = Dictionary(**special_symbols)

    # Discard whatever the constructor registered and rebuild from scratch
    # (without the Lua heritage token zero, to keep token IDs).
    result.symbols, result.count, result.indices = [], [], {}
    result.nspecial = 3

    # Frequency information is not available, so every symbol is added
    # with the same placeholder count.
    placeholder_count = 1
    # NOTE(review): assumes iterating idx2symbol yields the symbols
    # themselves, in token-ID order -- confirm against Vocabulary.
    for entry in id_to_symbol:
        result.add_symbol(entry, placeholder_count)

    # Point the special-token indices at the vocabulary's own IDs.
    result.pad_index = vocab.pad_id
    result.eos_index = vocab.eos_id
    result.unk_index = vocab.unk_id

    return result
def load_target_dictionary(self):
    """
    Build the target Dictionary from the tokenizer's encoder.

    When ``self.cfg.labels`` is truthy, a Dictionary is created with this
    object's bos/pad/eos/unk symbols. Its special-token indices are then
    overwritten with the values the tokenizer's encoder maps those symbols
    to, and every key of the encoder is added as a symbol.

    :return: The populated Dictionary, or None when ``self.cfg.labels`` is
        falsy.
    """
    if not self.cfg.labels:
        return None

    # Named ``target_dict`` rather than ``dict`` so the builtin is not
    # shadowed inside this method.
    target_dict = Dictionary(
        bos=self.bos,
        pad=self.pad,
        eos=self.eos,
        unk=self.unk,
        from_tokenizer=True,
    )

    encoder = self.tokenizer.encoder

    # Use the tokenizer's own IDs for the special tokens.
    # NOTE(review): presumably the encoder is a symbol -> ID mapping that
    # contains all four special symbols -- confirm against the tokenizer.
    target_dict.bos_index = encoder[self.bos]
    target_dict.pad_index = encoder[self.pad]
    target_dict.eos_index = encoder[self.eos]
    target_dict.unk_index = encoder[self.unk]

    for symbol in encoder.keys():
        target_dict.add_symbol(symbol)

    return target_dict