def main() -> None:
    parser = argparse.ArgumentParser(
        description="Build vocabulary from corpus data.")
    parser.add_argument(
        "--corpus-data",
        type=str,
        required=True,
        help=
        "The path pattern (glob) to all tokenized corpus files (train, test, val)."
    )
    parser.add_argument("--langs",
                        type=str,
                        required=True,
                        help="The pre-trained model languages.")
    parser.add_argument("--output",
                        type=str,
                        required=True,
                        help="The vocabulary file.")
    args = parser.parse_args()

    langs = args.langs.split(",")
    ft_dict = Dictionary()
    for data_path in glob(args.corpus_data):
        Dictionary.add_file_to_dictionary(data_path, ft_dict, tokenize_line, 4)
    ft_dict.finalize(padding_factor=0)
    pad_dict(ft_dict, len(langs) + 1)
    ft_dict.save(args.output)
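Note that pad_dict is not defined in this excerpt. A minimal sketch of what such a helper might look like, assuming it appends placeholder symbols so that the dictionary size, plus the extra special symbols to be added later, lands on a multiple of 8 (the helper name and padding scheme are assumptions, not taken from the original script):

# Sketch of a possible pad_dict helper (not shown in the excerpt above):
# appends placeholder symbols until len(d) + num_extra_symbols is a
# multiple of padding_factor.
def pad_dict(d: Dictionary, num_extra_symbols: int, padding_factor: int = 8) -> None:
    i = 0
    while (len(d) + num_extra_symbols) % padding_factor != 0:
        d.add_symbol(f"madeupword{i:04d}")
        i += 1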
Example #2
    def build_dictionary(cls,
                         filenames,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d,
                                              tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
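Assuming this build_dictionary is registered as a @classmethod on a fairseq task (the decorator is omitted in the excerpt), a hypothetical call could look like the following; the task class name and file paths are placeholders:

# Hypothetical usage; MyTranslationTask and the paths are illustrative only.
d = MyTranslationTask.build_dictionary(
    ["train.src.tok", "valid.src.tok"],
    workers=4,
    threshold=5,       # keep only tokens seen at least 5 times
    padding_factor=8,  # round the vocabulary size up to a multiple of 8
)
d.save("dict.src.txt")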
Example #3
    def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
        print('Building BERT Dictionary')
        d = BertDictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d
Example #4
    def build_dict(cls, filenames, word_level=False, workers=1, threshold=-1, nwords=-1, padding_factor=8):
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename,
                d,
                tokenize_line_word if word_level else tokenize_line_char,
                workers)
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d
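tokenize_line_word and tokenize_line_char are not defined in this excerpt; a minimal sketch of what such helpers might look like (the implementations below are assumptions matching the names, not the original code):

# Assumed tokenizers matching the names used above.
def tokenize_line_word(line):
    # word-level: split on whitespace
    return line.strip().split()

def tokenize_line_char(line):
    # character-level: every non-space character becomes a token
    return [ch for ch in line.strip() if not ch.isspace()]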
Example #5
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)
        if getattr(args, 'raw_text', False):
            utils.deprecation_warning(
                '--raw-text is deprecated, please use --dataset-impl=raw')
            args.dataset_impl = 'raw'
        elif getattr(args, 'lazy_load', False):
            utils.deprecation_warning(
                '--lazy-load is deprecated, please use --dataset-impl=lazy')
            args.dataset_impl = 'lazy'

        paths = args.data.split(':')
        assert len(paths) > 0
        # an explicit language pair is required
        if args.source_lang is None or args.target_lang is None:
            raise Exception(
                'Could not infer language pair, please provide it explicitly')

        # load dictionaries
        src_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = cls.load_dictionary(
            os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))

        # build the sememe dictionary from the training sememe annotations
        sememe_dict = Dictionary()
        Dictionary.add_file_to_dictionary(os.path.join(paths[0],
                                                       'train.sememe'),
                                          sememe_dict,
                                          tokenizer.tokenize_line,
                                          num_workers=12)
        args.sememe_dict = sememe_dict

        assert src_dict.pad() == tgt_dict.pad()
        assert src_dict.eos() == tgt_dict.eos()
        assert src_dict.unk() == tgt_dict.unk()
        print('| [{}] dictionary: {} types'.format(args.source_lang,
                                                   len(src_dict)))
        print('| [{}] dictionary: {} types'.format(args.target_lang,
                                                   len(tgt_dict)))
        print('| [{}] dictionary: {} types'.format('sememe', len(sememe_dict)))

        return cls(args, src_dict, tgt_dict, sememe_dict)
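For orientation, a setup_task classmethod like this is normally invoked by fairseq's training entry point rather than called directly. A hedged sketch of a direct call, with a hypothetical task class name, data directory, and language codes:

# Hypothetical invocation; SememeTranslationTask, the data directory, and the
# language codes are placeholders, not names from the original example.
import argparse

args = argparse.Namespace(
    data="data-bin/example",   # colon-separated list of data directories
    source_lang="en",
    target_lang="de",
    left_pad_source="True",    # parsed by options.eval_bool above
    left_pad_target="False",
)
task = SememeTranslationTask.setup_task(args)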
Example #6
    def build_dictionary(filenames, src=False, tgt=False):
        assert src ^ tgt
        workers = args.workers
        threshold = args.thresholdsrc if src else args.thresholdtgt
        nwords = args.nwordssrc if src else args.nwordstgt
        padding_factor = args.padding_factor
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenizer.tokenize_line, workers, args.L)
        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
Example #7
    def build_dictionary(cls,
                         filenames,
                         workers=1,
                         threshold=-1,
                         nwords=-1,
                         padding_factor=8):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(filename, d,
                                              tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold,
                   nwords=nwords,
                   padding_factor=padding_factor)
        return d
Example #8
    def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1,
                         padding_factor=8):
        """Build the dictionary from edit-labeled raw text inputs.

        Each file contains tokenized sentences along with their token labels:
        ```text
        My teacher is going to move to change his job .
        0 0 0 0 0 0 0 0 0 0 0
        And he took in my favorite subject like soccer .
        0 0 0 0 0 0 1 0 0 0
        ...
        ```
        A dictionary is built using only the tokens and not token labels.

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            # Write only tokens to a separate file.
            with open(filename) as f_in, \
                    open(f"{filename}.tokens", "w") as f_out:
                f_out.writelines(line for i, line in enumerate(f_in)
                                 if i % 2 == 0)
            # Add tokens to dictionary with multiprocessing.
            Dictionary.add_file_to_dictionary(f"{filename}.tokens", d,
                                              tokenizer.tokenize_line, workers)
        d.finalize(threshold=threshold, nwords=nwords,
                   padding_factor=padding_factor)
        return d
Example #9
    def test_add_file_to_dict(self):
        counts = {}
        num_lines = 100
        per_line = 10
        with tempfile.TemporaryDirectory("test_sampling") as data_dir:
            filename = os.path.join(data_dir, "dummy.txt")
            with open(filename, "w", encoding="utf-8") as data:
                for c in string.ascii_letters:
                    line = f"{c} " * per_line
                    for _ in range(num_lines):
                        data.write(f"{line}\n")
                    counts[c] = per_line * num_lines
                    per_line += 5

            dict = Dictionary()
            Dictionary.add_file_to_dictionary(filename, dict,
                                              tokenizer.tokenize_line, 10)
            dict.finalize(threshold=0, nwords=-1, padding_factor=8)

            for c in string.ascii_letters:
                count = dict.get_count(dict.index(c))
                self.assertEqual(
                    counts[c], count,
                    f"{c} count is {count} but should be {counts[c]}")