@classmethod
def build_dictionary(cls, filenames, tokenize_func, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        tokenize_func (callable): function used to tokenize each line of a file
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final
            dictionary, including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware
            (e.g., Nvidia Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_token_to_dictionary(filename, d, tokenize_func, workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
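# Usage sketch (illustrative, not part of the library): build a plain
# token-level dictionary from one corpus file. The file path and the
# whitespace tokenizer below are assumptions for the example; `Task` stands
# in for whichever class exposes build_dictionary as a classmethod.
#
#     def whitespace_tokenize(line):
#         return line.strip().split()
#
#     d = Task.build_dictionary(
#         filenames=['train.tokens'],   # hypothetical corpus file
#         tokenize_func=whitespace_tokenize,
#         workers=4,
#         threshold=5,       # drop tokens seen fewer than 5 times
#         nwords=50000,      # cap the final size, including special symbols
#     )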
@classmethod
def build_bpe_dictionary(
    cls,
    filenames,
    tokenize_func,
    workers=1,
    threshold=-1,
    nwords=-1,
    padding_factor=8,
    **special_symbols,
):
    """Build a hybrid sub-token/BPE dictionary

    Args:
        filenames (list): list of filenames
        tokenize_func (callable): function used to tokenize each line of a file
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final
            dictionary, including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware
            (e.g., Nvidia Tensor Cores).
        **special_symbols: overrides for the special symbols (``pad``,
            ``bos``, ``eos``, ``unk``, ``extra_special_symbols``) plus
            ``bpe_portion`` (float): fraction of ``nwords`` allocated to
            BPE tokens (default: 0.5)
    """
    bpe_portion = special_symbols.get('bpe_portion', 0.5)
    bpetoken_num = int(nwords * bpe_portion)
    subtoken_num = nwords - bpetoken_num

    # sub-token dictionary, built from raw token counts
    from collections import Counter
    from ncc.data import constants
    subtoken_d = Dictionary(
        pad=special_symbols.get('pad', constants.PAD),
        bos=special_symbols.get('bos', constants.BOS),
        eos=special_symbols.get('eos', constants.EOS),
        unk=special_symbols.get('unk', constants.UNK),
        extra_special_symbols=special_symbols.get('extra_special_symbols', None),
    )
    for filename in filenames:
        Dictionary.add_token_to_dictionary(filename, subtoken_d, tokenize_func, workers)
    # snapshot the full token counts before finalize() truncates the dictionary
    remaining_tokens = Counter({sym: c for sym, c in zip(subtoken_d.symbols, subtoken_d.count)})
    subtoken_d.finalize(threshold=threshold, nwords=subtoken_num, padding_factor=padding_factor)
    # tokens that did not survive truncation become training material for BPE
    remaining_tokens = Counter({
        sym: c for sym, c in remaining_tokens.items() if sym not in subtoken_d
    })

    # BPE dictionary, learned over the leftover tokens
    from ncc.data.retrieval.word_bpe_dictionary import WordBpeDicionary
    bpetoken_d = WordBpeDicionary()
    bpetoken_d.learn_bpe_vocab(remaining_tokens.elements(), bpetoken_num)
    bpetoken_d.finalize(threshold=0, nwords=bpetoken_num, padding_factor=padding_factor)

    from ncc.data.retrieval.hybrid.hybrid_retrieval_dictionary import HybridRetrievalDictionary
    return HybridRetrievalDictionary(subtoken_d, bpetoken_d)
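# Usage sketch (illustrative): with nwords=50000 and the default
# bpe_portion=0.5, the budget is split into 25000 sub-tokens and 25000 BPE
# tokens; tokens truncated out of the sub-token dictionary are re-used to
# learn the BPE vocabulary. The path and tokenizer are assumptions for the
# example, and bpe_portion rides in through **special_symbols.
#
#     hybrid_d = Task.build_bpe_dictionary(
#         filenames=['train.code'],   # hypothetical corpus file
#         tokenize_func=str.split,
#         nwords=50000,
#         bpe_portion=0.5,            # consumed via **special_symbols
#     )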
@classmethod
def build_dictionary(
    cls,
    filenames,
    tokenize_func,
    workers=1,
    threshold=-1,
    nwords=-1,
    padding_factor=8,
    **special_symbols,
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        tokenize_func (callable): function used to tokenize each line of a file
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final
            dictionary, including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware
            (e.g., Nvidia Tensor Cores).
        **special_symbols: overrides for the special symbols (``pad``,
            ``bos``, ``eos``, ``unk``, ``extra_special_symbols``)
    """
    from ncc.data import constants
    d = Dictionary(
        pad=special_symbols.get('pad', constants.PAD),
        bos=special_symbols.get('bos', constants.BOS),
        eos=special_symbols.get('eos', constants.EOS),
        unk=special_symbols.get('unk', constants.UNK),
        extra_special_symbols=special_symbols.get('extra_special_symbols', None),
    )
    for filename in filenames:
        Dictionary.add_token_to_dictionary(filename, d, tokenize_func, workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
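# Usage sketch (illustrative): override the special symbols that otherwise
# default to ncc.data.constants. The symbol strings shown are examples only,
# not values prescribed by the library.
#
#     d = Task.build_dictionary(
#         filenames=['train.tokens'],   # hypothetical corpus file
#         tokenize_func=str.split,
#         pad='[PAD]', bos='[BOS]', eos='[EOS]', unk='[UNK]',
#     )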