Example #1
    def build_dictionary(cls,
                         filenames,
                         tokenize_func,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            tokenize_func (callable): splits a line of text into a list of tokens
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()

        for filename in filenames:
            Dictionary.add_token_to_dictionary(filename, d, tokenize_func,
                                               workers)

        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
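
A minimal call-site sketch for this variant. The Task class, the file names, and the use of str.split as the tokenizer are assumptions; any callable that maps a line of text to a list of tokens should work:

    # Hypothetical call site: build a dictionary over two corpus files,
    # dropping tokens seen fewer than 5 times and padding the final
    # vocabulary size to a multiple of 8 for Tensor Cores.
    d = Task.build_dictionary(
        filenames=['train.txt', 'valid.txt'],
        tokenize_func=str.split,   # assumed whitespace tokenizer
        workers=4,
        threshold=5,
        padding_factor=8,
    )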
Example #2
    def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8,
        tokenize_func=SPACE_SPLITTER,
        **kwargs,
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
            tokenize_func (callable): splits a line of text into a list of tokens
                (defaults to SPACE_SPLITTER)
            **kwargs: optional overrides for the special symbols (pad, bos, eos,
                unk, extra_special_symbols)
        """
        from ncc.data import constants
        d = Dictionary(
            pad=kwargs.get('pad', constants.PAD),
            bos=kwargs.get('bos', constants.BOS),
            eos=kwargs.get('eos', constants.EOS),
            unk=kwargs.get('unk', constants.UNK),
            extra_special_symbols=kwargs.get('extra_special_symbols', None),
        )
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenize_func, d.eos_word, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d
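
Because this variant forwards **kwargs to the Dictionary constructor, the special symbols can be overridden at the call site. A hedged sketch (the Task class and all literals are assumptions):

    # Hypothetical call site: cap the vocabulary at 32k entries (special
    # symbols included), swap in a custom unk symbol, and register two
    # extra special symbols; tokenization falls back to SPACE_SPLITTER.
    d = Task.build_dictionary(
        ['train.txt'],
        workers=2,
        nwords=32000,
        unk='<UNK>',
        extra_special_symbols=['<url>', '<num>'],
    )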
Example #3
    def build_bpe_dictionary(
        cls,
        filenames,
        tokenize_func,
        workers=1,
        threshold=-1,
        nwords=-1,
        padding_factor=8,
        **special_symbols,
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
            **special_symbols: optional overrides for the special symbols (pad,
                bos, eos, unk, extra_special_symbols), plus bpe_portion (float),
                the fraction of nwords allotted to BPE tokens (defaults to 0.5)
        """
        # Split the nwords budget between whole subtokens and learned BPE tokens.
        bpe_portion = special_symbols.get('bpe_portion', 0.5)
        bpetoken_num = int(nwords * bpe_portion)
        subtoken_num = nwords - bpetoken_num
        # 1) Build a subtoken dictionary over the raw token counts.
        from ncc.data import constants
        subtoken_d = Dictionary(
            pad=special_symbols.get('pad', constants.PAD),
            bos=special_symbols.get('bos', constants.BOS),
            eos=special_symbols.get('eos', constants.EOS),
            unk=special_symbols.get('unk', constants.UNK),
            extra_special_symbols=special_symbols.get('extra_special_symbols',
                                                      None),
        )
        for filename in filenames:
            Dictionary.add_token_to_dictionary(filename, subtoken_d,
                                               tokenize_func, workers)
        # Snapshot all raw counts before finalize() truncates the dictionary.
        from collections import Counter
        remaining_tokens = Counter(
            {sym: c
             for sym, c in zip(subtoken_d.symbols, subtoken_d.count)})
        subtoken_d.finalize(threshold=threshold,
                            nwords=subtoken_num,
                            padding_factor=padding_factor)
        # Keep only the tokens that fell out of the finalized subtoken dictionary.
        remaining_tokens = Counter({
            sym: c
            for sym, c in remaining_tokens.items() if sym not in subtoken_d
        })
        # 2) Learn a BPE vocabulary over the leftover tokens.
        from ncc.data.retrieval.word_bpe_dictionary import WordBpeDicionary
        bpetoken_d = WordBpeDicionary()
        bpetoken_d.learn_bpe_vocab(remaining_tokens.elements(), bpetoken_num)
        bpetoken_d.finalize(threshold=0,
                            nwords=bpetoken_num,
                            padding_factor=padding_factor)
        from ncc.data.retrieval.hybrid.hybrid_retrieval_dictionary import HybridRetrievalDictionary
        return HybridRetrievalDictionary(subtoken_d, bpetoken_d)
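
A sketch of how the hybrid build might be invoked. The Task class, the file name, and the 30/70 BPE/subtoken split are assumptions; bpe_portion rides in through **special_symbols alongside any symbol overrides:

    # Hypothetical call site: out of a 50k-entry budget, reserve 30% for
    # learned BPE tokens and 70% for whole subtokens; the result is a
    # HybridRetrievalDictionary wrapping both vocabularies.
    hybrid_d = Task.build_bpe_dictionary(
        ['train.txt'],
        tokenize_func=str.split,   # assumed whitespace tokenizer
        workers=4,
        nwords=50000,
        bpe_portion=0.3,
    )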