Example #1
    def build_dictionary(cls,
                         filenames,
                         tokenize_func,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            tokenize_func (callable): splits a line of text into a list of tokens
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()

        for filename in filenames:
            Dictionary.add_token_to_dictionary(filename, d, tokenize_func,
                                               workers)

        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
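
A minimal call-site sketch for this variant. The Task class, the file names, and the use of str.split as the tokenizer are assumptions; any callable that maps a line of text to a list of tokens should work:

    # Hypothetical call site: build a dictionary over two corpus files,
    # dropping tokens seen fewer than 5 times and padding the final
    # vocabulary size to a multiple of 8 for Tensor Cores.
    d = Task.build_dictionary(
        filenames=['train.txt', 'valid.txt'],
        tokenize_func=str.split,   # assumed whitespace tokenizer
        workers=4,
        threshold=5,
        padding_factor=8,
    )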
Example #2
    def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8,
        tokenize_func=SPACE_SPLITTER,
        **kwargs,
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
            tokenize_func (callable): splits a line of text into a list of tokens
                (defaults to SPACE_SPLITTER)
            **kwargs: optional overrides for the special symbols (pad, bos, eos,
                unk, extra_special_symbols)
        """
        from ncc.data import constants
        d = Dictionary(
            pad=kwargs.get('pad', constants.PAD),
            bos=kwargs.get('bos', constants.BOS),
            eos=kwargs.get('eos', constants.EOS),
            unk=kwargs.get('unk', constants.UNK),
            extra_special_symbols=kwargs.get('extra_special_symbols', None),
        )
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenize_func, d.eos_word, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d
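
Because this variant forwards **kwargs to the Dictionary constructor, the special symbols can be overridden at the call site. A hedged sketch (the Task class and all literals are assumptions):

    # Hypothetical call site: cap the vocabulary at 32k entries (special
    # symbols included), swap in a custom unk symbol, and register two
    # extra special symbols; tokenization falls back to SPACE_SPLITTER.
    d = Task.build_dictionary(
        ['train.txt'],
        workers=2,
        nwords=32000,
        unk='<UNK>',
        extra_special_symbols=['<url>', '<num>'],
    )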
Example #3
    def build_bpe_dictionary(
        cls,
        filenames,
        tokenize_func,
        workers=1,
        threshold=-1,
        nwords=-1,
        padding_factor=8,
        **special_symbols,
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
            **special_symbols: optional overrides for the special symbols (pad,
                bos, eos, unk, extra_special_symbols), plus bpe_portion (float),
                the fraction of nwords allotted to BPE tokens (defaults to 0.5)
        """
        # Split the nwords budget between whole subtokens and learned BPE tokens.
        bpe_portion = special_symbols.get('bpe_portion', 0.5)
        bpetoken_num = int(nwords * bpe_portion)
        subtoken_num = nwords - bpetoken_num
        # 1) Build a subtoken dictionary over the raw token counts.
        from ncc.data import constants
        subtoken_d = Dictionary(
            pad=special_symbols.get('pad', constants.PAD),
            bos=special_symbols.get('bos', constants.BOS),
            eos=special_symbols.get('eos', constants.EOS),
            unk=special_symbols.get('unk', constants.UNK),
            extra_special_symbols=special_symbols.get('extra_special_symbols',
                                                      None),
        )
        for filename in filenames:
            Dictionary.add_token_to_dictionary(filename, subtoken_d,
                                               tokenize_func, workers)
        # Snapshot all raw counts before finalize() truncates the dictionary.
        from collections import Counter
        remaining_tokens = Counter(
            {sym: c
             for sym, c in zip(subtoken_d.symbols, subtoken_d.count)})
        subtoken_d.finalize(threshold=threshold,
                            nwords=subtoken_num,
                            padding_factor=padding_factor)
        # Keep only the tokens that fell out of the finalized subtoken dictionary.
        remaining_tokens = Counter({
            sym: c
            for sym, c in remaining_tokens.items() if sym not in subtoken_d
        })
        # 2) Learn a BPE vocabulary over the leftover tokens.
        from ncc.data.retrieval.word_bpe_dictionary import WordBpeDicionary
        bpetoken_d = WordBpeDicionary()
        bpetoken_d.learn_bpe_vocab(remaining_tokens.elements(), bpetoken_num)
        bpetoken_d.finalize(threshold=0,
                            nwords=bpetoken_num,
                            padding_factor=padding_factor)
        from ncc.data.retrieval.hybrid.hybrid_retrieval_dictionary import HybridRetrievalDictionary
        return HybridRetrievalDictionary(subtoken_d, bpetoken_d)
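
A sketch of how the hybrid build might be invoked. The Task class, the file name, and the 30/70 BPE/subtoken split are assumptions; bpe_portion rides in through **special_symbols alongside any symbol overrides:

    # Hypothetical call site: out of a 50k-entry budget, reserve 30% for
    # learned BPE tokens and 70% for whole subtokens; the result is a
    # HybridRetrievalDictionary wrapping both vocabularies.
    hybrid_d = Task.build_bpe_dictionary(
        ['train.txt'],
        tokenize_func=str.split,   # assumed whitespace tokenizer
        workers=4,
        nwords=50000,
        bpe_portion=0.3,
    )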