Beispiel #1
0
def vocab_to_dictionary(vocab: Vocabulary) -> Dictionary:
    """
    Build a fairseq Dictionary that mirrors a seqp Vocabulary.

    The Dictionary's internal tables are wiped and refilled so that every
    symbol keeps the token ID it has in the Vocabulary, instead of having
    token 0 reserved for Lua compatibility.

    :param vocab: Vocabulary to convert to Dictionary.
    :return: Resulting Dictionary.
    """
    id_to_symbol = vocab.idx2symbol
    pad_token = id_to_symbol[vocab.pad_id]
    eos_token = id_to_symbol[vocab.eos_id]
    unk_token = id_to_symbol[vocab.unk_id]

    converted = Dictionary(pad=pad_token, unk=unk_token, eos=eos_token)

    # Reset the tables populated by the constructor so they can be rebuilt
    # from scratch, without the Lua heritage token zero, preserving the
    # Vocabulary's token IDs.
    converted.symbols, converted.count, converted.indices = [], [], {}
    converted.nspecial = 3

    # Frequency information is not available in the Vocabulary, so every
    # symbol is registered with the same placeholder count.
    placeholder_frequency = 1
    for token in id_to_symbol:
        converted.add_symbol(token, placeholder_frequency)

    # Point the special-symbol indices at the Vocabulary's own IDs.
    converted.pad_index = vocab.pad_id
    converted.eos_index = vocab.eos_id
    converted.unk_index = vocab.unk_id

    return converted
Beispiel #2
0
 def load_target_dictionary(self):
     """
     Build the target Dictionary from the tokenizer's vocabulary.

     When ``self.cfg.labels`` is falsy no dictionary is built. Otherwise a
     Dictionary is created with this object's bos/pad/eos/unk symbols, its
     special-symbol indices are overwritten with the values the tokenizer's
     encoder holds for those symbols, and every symbol of the encoder is
     added to the Dictionary.

     :return: The populated Dictionary, or None when ``self.cfg.labels``
         is falsy.
     """
     if not self.cfg.labels:
         return None

     # Named ``target_dict`` rather than ``dict`` to avoid shadowing the
     # builtin.
     target_dict = Dictionary(
         bos=self.bos,
         pad=self.pad,
         eos=self.eos,
         unk=self.unk,
         from_tokenizer=True,
     )

     # NOTE(review): assumes ``encoder`` is a dict-like symbol -> index
     # mapping whose iteration yields its keys; confirm against the
     # tokenizer.
     encoder = self.tokenizer.encoder

     # Special-symbol indices are taken from the tokenizer's encoder.
     target_dict.bos_index = encoder[self.bos]
     target_dict.pad_index = encoder[self.pad]
     target_dict.eos_index = encoder[self.eos]
     target_dict.unk_index = encoder[self.unk]

     for symbol in encoder:
         target_dict.add_symbol(symbol)
     return target_dict