def build_dictionary(cls,
                     filenames,
                     workers=1,
                     threshold=-1,
                     nwords=-1,
                     padding_factor=8):
    """Build a ``BertDictionary`` from a list of token files.

    Every file in *filenames* is tokenized with
    ``tokenizer.tokenize_line`` and its tokens are counted into the
    dictionary, which is then finalized with the given *threshold*,
    *nwords* and *padding_factor* before being returned.
    """
    dictionary = BertDictionary()
    for path in filenames:
        Dictionary.add_file_to_dictionary(
            path, dictionary, tokenizer.tokenize_line, workers)
    dictionary.finalize(
        threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return dictionary
# Esempio n. 2 (Example no. 2)
# 0
 def from_config(cls, config: Config, **kwargs):
     """Construct an instance from *config*.

     Builds the tokenizer declared in the config, derives a
     ``Vocabulary`` for it — from the tokenizer's own vocab when it is a
     ``WordPieceTokenizer``, otherwise from the ``BertDictionary`` file
     named by ``config.vocab_file`` — and forwards the remaining config
     fields (plus *kwargs*) to the class constructor.
     """
     tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
     # Map the config's special-token spellings onto the canonical ones.
     replacements = {
         config.unk_token: UNK,
         config.pad_token: PAD,
         config.bos_token: BOS,
         config.eos_token: EOS,
         config.mask_token: MASK,
     }
     if isinstance(tokenizer, WordPieceTokenizer):
         # The tokenizer carries its own vocabulary; iterating the dict
         # yields its keys (the tokens) in insertion order.
         vocab = Vocabulary(list(tokenizer.vocab), replacements=replacements)
     else:
         bert_dict = BertDictionary.load(config.vocab_file)
         vocab = Vocabulary(
             bert_dict.symbols, bert_dict.count, replacements=replacements)
     return cls(
         columns=config.columns,
         tokenizer=tokenizer,
         add_bos_token=config.add_bos_token,
         add_eos_token=config.add_eos_token,
         use_eos_token_for_bos=config.use_eos_token_for_bos,
         max_seq_len=config.max_seq_len,
         vocab=vocab,
         **kwargs,
     )
# Esempio n. 3 (Example no. 3)
# 0
    def setup_task(cls, args, **kwargs):
        """Set up the task by loading its dictionary.

        ``args.data`` is a colon-separated list of data directories; the
        dictionary is read from ``dict.txt`` inside the first one.
        """
        data_dirs = args.data.split(':')
        assert len(data_dirs) > 0
        dict_path = os.path.join(data_dirs[0], 'dict.txt')
        dictionary = BertDictionary.load(dict_path)
        # fairseq-style progress line reporting the vocabulary size.
        print('| dictionary: {} types'.format(len(dictionary)))
        return cls(args, dictionary)
# Esempio n. 4 (Example no. 4)
# 0
 def load_dictionary(cls, filename):
     """Load and return a ``BertDictionary`` from *filename*."""
     return BertDictionary.load(filename)