def make_binary_dataset(vocab: Dictionary, input_file, output_file, attr: str, num_workers: int):
    """make binary dataset"""
    LOGGER.info("[{}] Dictionary: {} types".format(attr, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()  # counts tokens that were not found in the vocabulary

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    # Split the file into chunks. With multi-processing, workers p1..pN-1 binarize the
    # 2nd to the last chunk asynchronously while the main process (p0) handles the first chunk,
    # e.g. 1.txt with 10 processes: p0 covers bytes 0-99, p1 covers 100-199, ...
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # p1-pN -> (1 bin-txt, 1 idx), ..., (N bin-txt, N idx)
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize,
                (args, input_file, vocab, prefix, attr, offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result,
            )
        pool.close()

    # The main process (p0) binarizes the first chunk; without multi-processing it covers
    # the whole file (0 -> end).
    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    merge_result(
        Binarizer.binarize_bpe(input_file, vocab, lambda t: ds.add_item(t), offset=0, end=offsets[1])
    )
    if num_workers > 1:
        # wait for p1-pN
        pool.join()
        # merge sub-processes' index and data files into the final files, then delete them
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            # remove the temporary .mmap/.idx pair
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
    ds.finalize('{}.idx'.format(output_file))
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, BPE, no replaced tokens".format(
            attr, input_file, n_seq_tok[0], n_seq_tok[1],
        )
    )
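# Illustrative usage sketch (not part of the original module): assuming the module-level
# `args` preprocessing config is already loaded and `vocab` is a Dictionary built for the
# attribute, one split could be binarized with four workers roughly as below. The file
# names are placeholders, not paths from this repository.
#
#     vocab = ...  # a Dictionary loaded earlier in the preprocessing script
#     make_binary_dataset(vocab,
#                         input_file='data-raw/train.code_tokens',    # hypothetical path
#                         output_file='data-bin/train.code_tokens',   # hypothetical path
#                         attr='code_tokens', num_workers=4)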
def binarize(args: Dict, filename: str, dict: Dictionary, in_file: str, offset: int, end: int,
             append_eos: bool = True):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, dict, consumer, tokenize=tokenization.json_tokenizer,
                             append_eos=append_eos, offset=offset, end=end)
    ds.finalize('{}.idx'.format(in_file))
    return res
def binarize(args, filename: str, vocab, aux_dict, in_file: str, lang, tokenize, max_path_num: int,
             offset: int, end: int, append_eos: bool = False):
    """binarize function for multi-processing (path / auxiliary-dictionary variant)"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    if lang == 'path':
        # an extra dataset records the number of paths (size) of each example
        sz_ds_file = '{}.sz.mmap'.format(in_file)
        sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                             vocab_size=len(vocab))
    else:
        sz_ds = None

    def consumer(tensor, size=None):
        ds.add_item(tensor)
        if size is not None:
            sz_ds.add_item(size)

    if sz_ds is None:
        res = Binarizer.binarize(
            filename, vocab, consumer, tokenize=tokenize, append_eos=append_eos,
            offset=offset, end=end,
        )
        ds.finalize('{}.idx'.format(in_file))
    else:
        res = PathSummarizationBinarizer.path_binarizer(
            filename, vocab, consumer, tokenize=tokenize, append_eos=append_eos,
            offset=offset, end=end, type_dict=aux_dict, max_path_num=max_path_num,
        )
        ds.finalize('{}.idx'.format(in_file))
        sz_ds.finalize('{}.sz.idx'.format(in_file))
    return res
def make_graph_bin_dataset(dict: Dictionary, input_file, output_file, num_workers):
    offsets = Binarizer.find_offsets(input_file, num_workers)
    if num_workers > 1:
        # p0-pN-1 -> (1 bin-txt, 1 idx), ..., (N bin-txt, N idx); each worker writes its own shard
        pool = Pool(processes=num_workers)
        for worker_id in range(num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize_dgl,
                (args, input_file, dict, prefix, offsets[worker_id], offsets[worker_id + 1]),
            )
        pool.close()
        pool.join()  # wait for all workers to finish writing their shards
    else:
        prefix = "{}0".format(output_file)
        binarize_dgl(args, input_file, dict, prefix, 0, -1)
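# Note (added, not from the original code): unlike make_binary_dataset above, the graph
# workers each write their own "{output_file}{worker_id}" shard via binarize_dgl and nothing
# is merged here, so the downstream loader presumably reads all worker shards. A hedged
# usage sketch with placeholder paths:
#
#     graph_dict = ...  # a Dictionary for graph node/edge tokens, loaded elsewhere
#     make_graph_bin_dataset(graph_dict,
#                            input_file='data-raw/train.graph',   # hypothetical path
#                            output_file='data-bin/train.graph',  # hypothetical path
#                            num_workers=4)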
def binarize(args: Dict, filename: str, dict: Dictionary, out_file_prefix: str, attr: str,
             offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(out_file_prefix)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize_bpe(filename, dict, consumer, offset=offset, end=end)
    ds.finalize('{}.idx'.format(out_file_prefix))
    return res
def make_binary_dataset(vocab, aux_dict, input_file, output_file, lang, max_path_num, num_workers):
    """make binary dataset for a language ('path' or plain tokens), with an optional auxiliary dictionary"""
    n_seq_tok = [0, 0]
    replaced = Counter()  # counts tokens replaced by the unknown symbol

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    tokenize = path_tokenize if lang == 'path' else tokenization.json_tokenizer
    offsets = file_io.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_file, worker_id)
            pool.apply_async(
                binarize,
                (
                    args, input_file, vocab, aux_dict, prefix, lang, tokenize, max_path_num,
                    offsets[worker_id], offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds_file = '{}.mmap'.format(output_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(vocab))
    if lang == 'path':
        # an extra dataset records the number of paths (size) of each example
        sz_ds_file = '{}.sz.mmap'.format(output_file)
        sz_ds = indexed_dataset.make_builder(sz_ds_file, impl=args['preprocess']['dataset_impl'],
                                             vocab_size=len(vocab))
    else:
        sz_ds = None

    def consumer(tensor, size=None):
        ds.add_item(tensor)
        if size is not None:
            sz_ds.add_item(size)

    # the main process binarizes the first chunk (offset 0 to offsets[1])
    if sz_ds is None:
        merge_result(
            Binarizer.binarize(
                input_file, vocab, consumer, tokenize=tokenize, offset=0, end=offsets[1],
                append_eos=False, max_path_num=max_path_num,
            )
        )
    else:
        merge_result(
            PathSummarizationBinarizer.path_binarizer(
                input_file, vocab, consumer, tokenize=tokenize, offset=0, end=offsets[1],
                append_eos=False, type_dict=aux_dict, max_path_num=max_path_num,
            )
        )
    if num_workers > 1:
        # wait for p1-pN
        pool.join()
        # merge sub-processes' index and data files into the final files, then delete them
        for worker_id in range(1, num_workers):
            temp_file_path = "{}{}".format(output_file, worker_id)
            ds.merge_file_(temp_file_path)
            if sz_ds is not None:
                sz_ds.merge_file_(f"{temp_file_path}.sz")
            # remove the temporary .mmap/.idx pairs
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))
            if sz_ds is not None:
                os.remove(indexed_dataset.data_file_path(f"{temp_file_path}.sz"))
                os.remove(indexed_dataset.index_file_path(f"{temp_file_path}.sz"))
    ds.finalize('{}.idx'.format(output_file))
    if sz_ds is not None:
        sz_ds.finalize('{}.sz.idx'.format(output_file))
    LOGGER.info(
        "[{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            vocab.unk_word,
        )
    )
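# Illustrative usage sketch (not part of the original module): for the 'path' language this
# function writes '{output_file}.mmap/.idx' plus the per-example path-count files
# '{output_file}.sz.mmap/.sz.idx'. Assuming `path_vocab` and `type_vocab` are dictionaries
# loaded elsewhere, a call might look like the following; paths and the max_path_num value
# are placeholders.
#
#     make_binary_dataset(path_vocab, aux_dict=type_vocab,
#                         input_file='data-raw/train.path',    # hypothetical path
#                         output_file='data-bin/train.path',   # hypothetical path
#                         lang='path', max_path_num=300, num_workers=4)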