import argparse
from glob import glob

from fairseq.data import Dictionary
from fairseq.tokenizer import tokenize_line


def main() -> None:
    parser = argparse.ArgumentParser(
        description="Build vocabulary from corpus data.")
    parser.add_argument(
        "--corpus-data", type=str, required=True,
        help="The path pattern (glob) to all tokenized corpus files (train, test, val).")
    parser.add_argument(
        "--langs", type=str, required=True,
        help="The pre-trained model languages.")
    parser.add_argument(
        "--output", type=str, required=True,
        help="The vocabulary file.")
    args = parser.parse_args()

    langs = args.langs.split(",")
    ft_dict = Dictionary()
    for data_path in glob(args.corpus_data):
        Dictionary.add_file_to_dictionary(data_path, ft_dict, tokenize_line, 4)
    ft_dict.finalize(padding_factor=0)
    pad_dict(ft_dict, len(langs) + 1)
    ft_dict.save(args.output)

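# Note: pad_dict is called above but not defined in this snippet. A minimal
# sketch of a plausible implementation, assuming the intent is to grow the
# dictionary with filler symbols until its size plus the per-language symbols
# added later is a multiple of 8; the "madeupword" filler convention is an
# assumption borrowed from fairseq's preprocessing.
def pad_dict(d: Dictionary, num_extra_symbols: int, padding_factor: int = 8) -> None:
    i = 0
    while (len(d) + num_extra_symbols) % padding_factor != 0:
        # Dictionary.add_symbol appends a new type and returns its index.
        d.add_symbol(f"madeupword{i:04d}")
        i += 1
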
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8): """Build the dictionary Args: filenames (list): list of filenames workers (int): number of concurrent workers threshold (int): defines the minimum word count nwords (int): defines the total number of words in the final dictionary, including special symbols padding_factor (int): can be used to pad the dictionary size to be a multiple of 8, which is important on some hardware (e.g., Nvidia Tensor Cores). """ d = Dictionary() for filename in filenames: Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers) d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) return d
@classmethod
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    print('Building BERT Dictionary')
    d = BertDictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d

@classmethod
def build_dict(cls, filenames, word_level=False, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d,
            tokenize_line_word if word_level else tokenize_line_char,
            workers)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d

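# The two tokenizers used above are not shown. A minimal sketch under the
# assumption that the word-level variant splits on whitespace (like fairseq's
# tokenize_line) and the character-level variant emits one token per
# non-space character; both bodies are assumptions.
def tokenize_line_word(line):
    # Word-level tokens: whitespace-separated fields.
    return line.strip().split()


def tokenize_line_char(line):
    # Character-level tokens: every non-whitespace character.
    return [ch for ch in line.strip() if not ch.isspace()]
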
@classmethod
def setup_task(cls, args, **kwargs):
    """Setup the task (e.g., load dictionaries).

    Args:
        args (argparse.Namespace): parsed command-line arguments
    """
    args.left_pad_source = options.eval_bool(args.left_pad_source)
    args.left_pad_target = options.eval_bool(args.left_pad_target)
    if getattr(args, 'raw_text', False):
        utils.deprecation_warning('--raw-text is deprecated, please use --dataset-impl=raw')
        args.dataset_impl = 'raw'
    elif getattr(args, 'lazy_load', False):
        utils.deprecation_warning('--lazy-load is deprecated, please use --dataset-impl=lazy')
        args.dataset_impl = 'lazy'

    paths = args.data.split(':')
    assert len(paths) > 0
    # the language pair must be provided explicitly
    if args.source_lang is None or args.target_lang is None:
        raise Exception('Could not infer language pair, please provide it explicitly')

    # load dictionaries
    src_dict = cls.load_dictionary(
        os.path.join(paths[0], 'dict.{}.txt'.format(args.source_lang)))
    tgt_dict = cls.load_dictionary(
        os.path.join(paths[0], 'dict.{}.txt'.format(args.target_lang)))

    # build the sememe dictionary from the training data
    sememe_dict = Dictionary()
    Dictionary.add_file_to_dictionary(
        os.path.join(paths[0], 'train.sememe'), sememe_dict,
        tokenizer.tokenize_line, num_workers=12)
    args.sememe_dict = sememe_dict

    assert src_dict.pad() == tgt_dict.pad()
    assert src_dict.eos() == tgt_dict.eos()
    assert src_dict.unk() == tgt_dict.unk()
    print('| [{}] dictionary: {} types'.format(args.source_lang, len(src_dict)))
    print('| [{}] dictionary: {} types'.format(args.target_lang, len(tgt_dict)))
    print('| [{}] dictionary: {} types'.format('sememe', len(sememe_dict)))

    return cls(args, src_dict, tgt_dict, sememe_dict)

def build_dictionary(filenames, src=False, tgt=False):
    assert src ^ tgt
    workers = args.workers
    threshold = args.thresholdsrc if src else args.thresholdtgt
    nwords = args.nwordssrc if src else args.nwordstgt
    padding_factor = args.padding_factor
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers, args.L)
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d

def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8): """Build the dictionary Args: filenames (list): 文件名列表 workers (int): 并发的线程数 threshold (int): 定义最小出现的次数 nwords (int): 定义最终词典中的单词总数,包括特殊符号 padding_factor (int): 可用于将字典大小填充为8的倍数,这在某些硬件上很重要 (e.g., Nvidia Tensor Cores). """ d = Dictionary() for filename in filenames: Dictionary.add_file_to_dictionary(filename, d, tokenizer.tokenize_line, workers) d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) return d
def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8): """Build the dictionary from edit-labeled raw text inputs. Each file contains tokenized sentences along with their token labels: ```text My teacher is going to move to change his job . 0 0 0 0 0 0 0 0 0 0 0 And he took in my favorite subject like soccer . 0 0 0 0 0 0 1 0 0 0 ... ``` A dictionary is built using only the tokens and not token labels. Args: filenames (list): list of filenames workers (int): number of concurrent workers threshold (int): defines the minimum word count nwords (int): defines the total number of words in the final dictionary, including special symbols padding_factor (int): can be used to pad the dictionary size to be a multiple of 8, which is important on some hardware (e.g., Nvidia Tensor Cores). """ d = Dictionary() for filename in filenames: # Write only tokens to a separate file. with open(filename) as f_in, \ open(f"{filename}.tokens", "w") as f_out: f_out.writelines(line for i, line in enumerate(f_in) if i % 2 == 0) # Add tokens to dictionary with multiprocessing. Dictionary.add_file_to_dictionary(f"{filename}.tokens", d, tokenizer.tokenize_line, workers) d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) return d
def test_add_file_to_dict(self):
    counts = {}
    num_lines = 100
    per_line = 10
    with tempfile.TemporaryDirectory("test_sampling") as data_dir:
        filename = os.path.join(data_dir, "dummy.txt")
        with open(filename, "w", encoding="utf-8") as data:
            for c in string.ascii_letters:
                line = f"{c} " * per_line
                for _ in range(num_lines):
                    data.write(f"{line}\n")
                counts[c] = per_line * num_lines
                per_line += 5

        dict = Dictionary()
        Dictionary.add_file_to_dictionary(filename, dict, tokenizer.tokenize_line, 10)
        dict.finalize(threshold=0, nwords=-1, padding_factor=8)

        for c in string.ascii_letters:
            count = dict.get_count(dict.index(c))
            self.assertEqual(
                counts[c], count,
                f"{c} count is {count} but should be {counts[c]}")