def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename_terminals, dest_filename = \
        dest_filename + '.terminals' + str(idx), dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, \
            file_io.open(dest_filename_terminals, 'w') as writer_terminals, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                paths = util_path.ast_to_path(ast, MAX_PATH=PATH_NUM)
                if paths is None:
                    paths = [[None] * 3] * PATH_NUM
                else:
                    # over-sample paths up to PATH_NUM
                    if len(paths) < PATH_NUM:
                        supply_ids = list(range(len(paths))) * ((PATH_NUM - len(paths)) // len(paths)) \
                                     + random.sample(range(len(paths)), (PATH_NUM - len(paths)) % len(paths))
                        paths.extend([paths[i] for i in supply_ids])
                        random.shuffle(paths)
                    assert len(paths) == PATH_NUM
                head, body, tail = zip(*paths)
            else:
                head, body, tail = [None] * PATH_NUM, [None] * PATH_NUM, [None] * PATH_NUM
            # terminals
            for terminal in itertools.chain(*zip(head, tail)):
                print(json_io.json_dumps(terminal), file=writer_terminals)
            # paths
            for b in body:
                print(json_io.json_dumps(b), file=writer)
            line = safe_readline(reader)
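# The over-sampling arithmetic inside path_fn, shown standalone: with
# len(paths) = 3 and path_num = 8, the index list is repeated
# (8 - 3) // 3 = 1 full time and (8 - 3) % 3 = 2 extra indices are sampled,
# yielding exactly 8 paths before shuffling. This helper is illustrative,
# not part of the original pipeline.
def _example_oversample(paths, path_num):
    if len(paths) < path_num:
        supply_ids = list(range(len(paths))) * ((path_num - len(paths)) // len(paths)) \
                     + random.sample(range(len(paths)), (path_num - len(paths)) % len(paths))
        paths = paths + [paths[i] for i in supply_ids]
    random.shuffle(paths)
    assert len(paths) == path_num
    return paths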
def flatten_attrs(raw_file, flatten_dir, mode, attrs, start=0, end=-1):
    attr_writers = {}
    for attr in attrs:
        attr_file = os.path.join(flatten_dir, '{}.{}'.format(mode, attr))
        os.makedirs(os.path.dirname(attr_file), exist_ok=True)
        attr_writers[attr] = file_io.open(attr_file, 'w')
    with file_io.open(raw_file, 'r') as reader:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            filename = os.path.join(os.path.dirname(raw_file), line.strip())
            try:
                tokens, types = parse_file(filename)
                # replace None with [PAD] for type dictionary build
                types = [PAD if t is None else t for t in types]
            except Exception:
                # skip files that fail to parse
                line = file_io.safe_readline(reader)
                continue
            print(json_io.json_dumps(tokens), file=attr_writers['code_tokens'])
            print(json_io.json_dumps(types), file=attr_writers['code_types'])
            line = file_io.safe_readline(reader)
    for writer in attr_writers.values():
        writer.close()
def code_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """filter code tokens: drop comment tokens and non-ASCII token lists"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code_tokens = json_io.json_loads(line)
            if code_tokens:
                # filter comments in code_tokens, e.g. //***\n, /* */\n
                code_tokens = [
                    token for token in code_tokens
                    if not (str.startswith(token, '//') or str.startswith(token, '#') or
                            (str.startswith(token, '/*') and str.endswith(token, '*/')))
                ]
                if not all(str.isascii(token) for token in code_tokens):
                    code_tokens = None
                # drop empty token lists
                if not code_tokens:
                    code_tokens = None
            else:
                code_tokens = None
            print(json_io.json_dumps(code_tokens), file=writer)
            line = safe_readline(reader)
def tokenization(in_file, out_file, lang, attr, start=0, end=-1):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()
            if lang == 'python' and attr == 'code':
                tokens = python_code_tokenize(line)
                line = ' '.join(tokens).strip()
            if attr == 'code':
                line = normalize_program(line, remove_eol=True)
            else:
                line = normalize_docstring(line, remove_eol=True, remove_url=True)
            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
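# tokenization() above relies on a module-level SentencePiece processor named
# `tokenizer`; a minimal way to construct one (the model path is illustrative,
# not the pipeline's actual file):
def _example_make_tokenizer(model_file='sentencepiece.bpe.model'):
    import sentencepiece as spm
    sp = spm.SentencePieceProcessor()
    sp.load(model_file)
    return sp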
def raw_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """code => raw_ast"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    lang = kwargs.get('lang')
    so_dir = kwargs.get('so_dir')
    so_filename = os.path.join(os.path.expanduser(so_dir), '{}.so'.format(lang))
    parser = TreeSitterASTParser(so_filename, lang)
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            if code:
                raw_ast = parser.parse_raw_ast(code)
            else:
                raw_ast = None
            print(json_io.json_dumps(raw_ast), file=writer)
            line = safe_readline(reader)
def docstring_tokens_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """filter docstring tokens: drop separator/markup tokens, keep ASCII docstrings of 4-50 tokens"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring_tokens = json_io.json_loads(line)
            if docstring_tokens:
                docstring_tokens = [
                    token for token in docstring_tokens
                    if not (re.match(r'[\-|\*|\=|\~]{2,}', token) or re.match(r'<.*?>', token))
                ]
                if not all(str.isascii(token) for token in docstring_tokens):
                    docstring_tokens = None
                if (docstring_tokens is None) or not (3 < len(docstring_tokens) <= 50):
                    docstring_tokens = None
            else:
                docstring_tokens = None
            print(json_io.json_dumps(docstring_tokens), file=writer)
            line = safe_readline(reader)
def binary_ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                try:
                    ast = util_ast.value2children(ast)
                    ast = util_ast.remove_root_with_uni_child(ast)
                    root_idx = util_ast.get_root_idx(ast)
                    ast = util_ast.delete_node_with_uni_child(ast, idx=root_idx)
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.binarize_tree(ast, idx=root_idx)  # to binary ast tree
                    root_idx = util_ast.get_root_idx(ast)
                    bin_ast = util_ast.reset_indices(bin_ast, root_idx)  # reset node indices
                    bin_ast = util_ast.pad_leaf_node(bin_ast, MAX_SUB_TOKEN_LEN)
                except RecursionError:
                    LOGGER.error('RecursionError, ignore this tree')
                    bin_ast = None
                except Exception as err:
                    LOGGER.error(err)
                    bin_ast = None
            else:
                bin_ast = None
            print(json_io.json_dumps(bin_ast), file=writer)
            line = safe_readline(reader)
def _add_tok_to_dictionary_single_worker(
        filename: str,
        tokenize: Any,
        eos_word: Optional[str],
        worker_id: int = 0,
        num_workers: int = 1,
) -> Counter:
    counter = Counter()
    with file_io.open(filename, "r") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_workers
        offset = worker_id * chunk_size
        end = offset + chunk_size
        f.seek(offset)
        if offset > 0:
            safe_readline(f)  # drop first incomplete line
        line = f.readline()
        while line:
            tokens = tokenize(line)
            counter.update(tokens)
            if eos_word is not None:
                counter.update([eos_word])
            if f.tell() > end:
                break
            line = f.readline()
    return counter
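# A sketch of fanning _add_tok_to_dictionary_single_worker() out over a
# process pool and merging the per-worker Counters. The whitespace tokenizer
# below is a stand-in for whatever tokenize callable the real pipeline
# passes in.
def _whitespace_tokenize(line):
    return line.split()

def _example_count_tokens(filename, num_workers=4):
    from multiprocessing import Pool
    merged = Counter()
    with Pool(num_workers) as pool:
        results = [
            pool.apply_async(
                _add_tok_to_dictionary_single_worker,
                (filename, _whitespace_tokenize, None, worker_id, num_workers),
            )
            for worker_id in range(num_workers)
        ]
        for res in results:
            merged += res.get()
    return merged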
def find_offsets(filename, num_chunks):
    with open(filename, "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0 for _ in range(num_chunks + 1)]
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            safe_readline(f)  # advance to the next full line boundary
            offsets[i] = f.tell()
        return offsets
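# A minimal sketch of the driver pattern that pairs find_offsets() with the
# per-worker `*_fn` functions in this file: each worker handles one byte
# range of the same input and writes its own shard. File names and the
# empty kwargs dict are illustrative only.
def _example_shard_driver(src_file='train.ast', dst_file='train.path', num_workers=4):
    from multiprocessing import Pool
    offsets = find_offsets(src_file, num_workers)
    with Pool(num_workers) as pool:
        for worker_id in range(num_workers):
            pool.apply_async(
                path_fn,
                (src_file, dst_file, worker_id,
                 offsets[worker_id], offsets[worker_id + 1], [{}]),
            )
        pool.close()
        pool.join()
    # each worker writes f"{dst_file}{worker_id}"; concatenate the shards afterwards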
def ast_fn(filename, dest_filename, idx, start=0, end=-1):
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r", encoding="UTF-8") as reader, open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = convert(line)
            print(json_io.json_dumps(ast), file=writer)
            line = file_io.safe_readline(reader)
def ast_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    parser = CodeParser(SO_FILE=os.path.join(kwargs['so_dir'], f"{kwargs['lang']}.so"),
                        LANGUAGE=kwargs['lang'])
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            ast = parser.parse_raw_ast(code=line, MAX_AST_SIZE=99999999999, append_index=True)
            print(json_io.json_dumps(ast), file=writer)
            line = safe_readline(reader)
def docstring_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    """copy docstrings into a per-worker shard"""
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    PathManager.mkdir(os.path.dirname(dest_filename))
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            docstring = json_io.json_loads(line)
            print(json_io.json_dumps(docstring), file=writer)
            line = safe_readline(reader)
def binarize_bpe(filename, dict, consumer, reverse_order=False, offset=0, end=-1):
    nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
    replaced = Counter()  # un-recorded tokens
    with open(filename, "r", encoding="utf-8") as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            line = ujson.loads(line)
            line = ' '.join(line) if isinstance(line, list) else line
            ids = dict.encode_ids(line)
            if reverse_order:
                ids = list(reversed(ids))
            ids = torch.IntTensor(ids)
            nseq += 1
            ntok += len(ids)
            consumer(ids)
            line = f.readline()
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }
def build_dgl_graph(vocab, input_file, output_file, start=0, end=-1):
    graph_batch = []
    with open(input_file, 'r') as reader:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = ujson.loads(line)
            if ast is None:
                graph = dgl.DGLGraph()
            else:
                graph = tree2dgl(ast, vocab)
            graph_batch.append(graph)
            line = safe_readline(reader)
    save_graphs(output_file, graph_batch)
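# A sketch of reading the serialized batch back; load_graphs (from
# dgl.data.utils, like the save_graphs call above) returns a
# (graph_list, label_dict) pair, of which only the graphs matter here.
def _example_load_graphs(graph_file):
    from dgl.data.utils import load_graphs
    graph_batch, _ = load_graphs(graph_file)
    return graph_batch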
def func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            func_name = json_io.json_loads(line)
            func = func_name.split('.')[-1]
            print(json_io.json_dumps(func), file=writer)
            line = safe_readline(reader)
def dfs_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = f"{dest_filename}{idx}"
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast is not None:
                dfs, _ = ast_to_dfs(ast)
            else:
                dfs = None
            print(json_io.json_dumps(dfs), file=writer)
            line = safe_readline(reader)
def path_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                paths = util_path.ast_to_path(ast)
                print(json_io.json_dumps(paths), file=writer)
            line = safe_readline(reader)
def traversal_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast_traversal = util_traversal.get_dfs(ast)
            else:
                ast_traversal = None
            print(json_io.json_dumps(ast_traversal), file=writer)
            line = safe_readline(reader)
def binarize(
        filename,
        dict,  # Dictionary
        consumer,
        tokenize=tokenize_string,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        already_numberized=False,
        **kwargs,
):
    nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
    replaced = Counter()  # un-recorded tokens

    def replaced_consumer(word, idx):
        """save un-recorded token"""
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    with open(filename, "r", encoding="utf-8") as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            if already_numberized:
                id_strings = line.strip().split()
                id_list = [int(id_string) for id_string in id_strings]
                if reverse_order:
                    id_list.reverse()
                if append_eos:
                    id_list.append(dict.eos())
                ids = torch.IntTensor(id_list)
            else:
                ids = dict.encode_line(
                    line=line,
                    line_tokenizer=tokenize,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                    **kwargs,
                )
            nseq += 1
            ntok += len(ids)
            consumer(ids)
            line = f.readline()
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }
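# A sketch of wiring binarize() to the indexed-dataset builders used further
# below: the consumer callback simply appends each encoded IntTensor to the
# dataset. The output prefix and the 'mmap' impl are illustrative.
def _example_binarize_chunk(filename, token_dict, offset, end, out_prefix='train.tokens'):
    ds = indexed_dataset.make_builder(f"{out_prefix}.mmap", impl='mmap', vocab_size=len(token_dict))
    stats = binarize(filename, token_dict, consumer=ds.add_item, offset=offset, end=end)
    ds.finalize(f"{out_prefix}.idx")
    return stats  # {"nseq": ..., "nunk": ..., "ntok": ..., "replaced": ...}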
def sbtao_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            ast = json_io.json_loads(line)
            if ast:
                ast = util_ast.value2children(ast)
                padded_ast = util_ast.pad_leaf_node(ast, MAX_SUB_TOKEN_LEN)
                root_idx = util_ast.get_root_idx(padded_ast)
                sbt = util_ast.build_sbtao_tree(padded_ast, idx=root_idx)
            else:
                sbt = None
            print(json_io.json_dumps(sbt), file=writer)
            line = safe_readline(reader)
def path_binarizer(filename, subtoken_dict, consumer, tokenize=None,
                   append_eos=True, reverse_order=False, offset=0, end=-1,
                   type_dict=None, **kwargs):
    nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
    replaced = Counter()  # un-recorded tokens

    def binarization(parts, dict):
        part_sizes = [len(p) for p in parts]
        parts = list(itertools.chain(*parts))
        parts = torch.Tensor([dict.index(token) for token in parts]).long()
        parts = parts.split(part_sizes, dim=0)
        return parts

    def encode_path(line):
        heads, bodies, tails = tokenize(line, max_path_num=kwargs['max_path_num'])
        heads = binarization(heads, subtoken_dict)
        bodies = binarization(bodies, type_dict)
        tails = binarization(tails, subtoken_dict)
        paths, path_sizes = [], []
        for head, body, tail in zip(heads, bodies, tails):
            paths.extend([head, body, tail])
            path_sizes.extend([len(head), len(body), len(tail)])
        paths = torch.cat(paths, dim=0)
        path_sizes = torch.Tensor(path_sizes).long()
        assert len(paths) == path_sizes.sum().item()
        return paths, path_sizes

    with file_io.open(filename, "r", encoding="utf-8") as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = file_io.safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            paths, path_sizes = encode_path(line)
            nseq += 1
            ntok += len(paths)
            consumer(paths, path_sizes)
            line = f.readline()
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }
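# A sketch of undoing the packing done by encode_path() above: path_sizes
# stores the length of every head/body/tail segment in order, so
# Tensor.split() recovers the segments, and consecutive triples regroup
# into (head, body, tail) paths.
def _example_unpack_paths(paths, path_sizes):
    segments = paths.split(path_sizes.tolist(), dim=0)
    return [tuple(segments[i:i + 3]) for i in range(0, len(segments), 3)]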
def binarize(args, in_file: str, out_file: str, vocab, token_dict, offset: int, end: int):
    ds = indexed_dataset.make_builder(f"{out_file}.mmap", impl='mmap', vocab_size=len(vocab))
    with file_io.open(in_file, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            code_tokens = vocab.encode(line, out_type=str)
            code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
            ds.add_item(code_tokens)
            line = reader.readline()
    ds.finalize(f'{out_file}.idx')
def binarize_trav_trans(
        filename,
        dicts,  # (token_dict, mask_dict)
        consumer,  # called with (data, ext, ids, mask)
        tokenize=tokenize_string,
        offset=0,
        end=-1,
):
    nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
    token_dict, mask_dict = dicts
    replaced = Counter()  # un-recorded tokens

    def replaced_consumer(word, idx):
        """save un-recorded token"""
        if idx == token_dict.unk_index and word != token_dict.unk_word:
            replaced.update([word])

    with open(filename, "r", encoding="utf-8") as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            for data, ext, ids, mask in tokenize(line):
                data = token_dict.encode_list(data, add_if_not_exist=False, consumer=replaced_consumer)
                ext = torch.IntTensor([ext])
                if ids:
                    for key, value in ids.items():
                        if len(value) == 0:
                            ids[key] = torch.IntTensor([-1])
                        else:
                            ids[key] = torch.IntTensor(value)
                if mask:
                    mask = mask_dict.encode_list(mask, add_if_not_exist=False)
                consumer(data, ext, ids, mask)
                nseq += 1
                ntok += len(data)
            line = f.readline()
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }
def code_wo_func_fn(filename, dest_filename, idx, start=0, end=-1, *args):
    kwargs = args[0][0]  # cannot feed dict parameters in multi-processing
    func_filename = filename[:str.rfind(filename, '.')] + '.func'
    dest_filename = dest_filename + str(idx)
    with file_io.open(filename, "r") as reader, open(func_filename, 'r') as func_reader, \
            file_io.open(dest_filename, 'w') as writer:
        reader.seek(start)
        line = safe_readline(reader)
        func_line = safe_readline(func_reader)
        while line and func_line:
            if end > 0 and reader.tell() > end:
                break
            code = json_io.json_loads(line)
            func_name = json_io.json_loads(func_line)
            start_idx = str.find(code, func_name)
            if start_idx != -1:
                code_wo_func = code[:start_idx] + code[start_idx + len(func_name):]
            else:
                code_wo_func = None
            print(json_io.json_dumps(code_wo_func), file=writer)
            line = safe_readline(reader)
            func_line = safe_readline(func_reader)
def tokenization(in_file, out_file, lang, attr, start=0, end=-1):
    with file_io.open(in_file, "r") as reader, file_io.open(out_file, 'w') as writer:
        reader.seek(start)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line).strip()
            if lang == 'python' and attr == 'code':
                line = re.sub(r'\s+', ' ', line)
            line = line.strip()
            tokens = tokenizer.encode_as_pieces(line)
            print(json_io.json_dumps(tokens), file=writer)
            line = file_io.safe_readline(reader)
def binarize_alignments(filename, alignment_parser, consumer, offset=0, end=-1):
    nseq = 0
    with file_io.open(filename, "r") as f:
        f.seek(offset)
        line = safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            ids = alignment_parser(line)
            nseq += 1
            consumer(ids)
            line = f.readline()
    return {"nseq": nseq}
def binarize_seperate(
        filename,
        dict,
        consumer,
        tokenize=None,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
):
    nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
    replaced = Counter()  # un-recorded tokens

    def replaced_consumer(word, idx):
        """save un-recorded token"""
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    with file_io.open(filename, "r", encoding="utf-8") as f:
        f.seek(offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = file_io.safe_readline(f)
        while line:
            if end > 0 and f.tell() > end:
                break
            ids_ext = dict.encode_line(
                line=line,
                line_tokenizer=tokenize,
                add_if_not_exist=False,
                consumer=replaced_consumer,
                append_eos=append_eos,
                reverse_order=reverse_order,
            )
            if len(ids_ext) > 0:
                nseq += 1
                for ids, ext in ids_ext:
                    ntok += len(ids)
                    consumer(ids, ext)
            line = f.readline()
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }
def binarize_dfs(args, filename: str, dict, in_file: str, offset: int, end: int):
    """binarize function for multi-processing"""
    ds_file = '{}.mmap'.format(in_file)
    ds = indexed_dataset.make_builder(ds_file, impl=args['preprocess']['dataset_impl'], vocab_size=len(dict))
    with file_io.open(filename, 'r') as reader:
        reader.seek(offset)
        line = file_io.safe_readline(reader)
        while line:
            if end > 0 and reader.tell() > end:
                break
            line = json_io.json_loads(line)
            dfs = torch.IntTensor([dict.index(tok) for tok in line])
            ds.add_item(dfs)
            line = reader.readline()
    ds.finalize('{}.idx'.format(in_file))
def main(args):
    LOGGER.info('mkdir {} for {} task'.format(args['preprocess']['destdir'], args['preprocess']['task']))
    PathManager.mkdir(args['preprocess']['destdir'])
    vocab = spm.SentencePieceProcessor()
    vocab.load(SPM_VOCAB_FILE)

    def save_token_dict():
        src_file = os.path.join(os.path.dirname(SPM_VOCAB_FILE), 'dict.txt')
        tgt_file = os.path.join(args['preprocess']['destdir'], 'dict.jsonl')
        vocab = Dictionary()
        with file_io.open(src_file, 'r') as reader:
            for line in reader:
                token, num = line.strip().split()
                vocab.add_symbol(token, int(num))
        vocab.save(tgt_file)
        return vocab

    token_dict = save_token_dict()

    # 2. ***************build dataset********************
    # transform a language's code into src format and tgt format simultaneously
    num_workers = args['preprocess']['workers']
    src_lang, tgt_lang = args['preprocess']['src_lang'], args['preprocess']['tgt_lang']
    for lang, mode in itertools.product([src_lang, tgt_lang], MODES):
        src_file = args['preprocess'][f'{mode}pref'].replace('*', lang) + ".code"
        dst_file = os.path.join(args['preprocess']['destdir'], lang, f"{mode}.code_tokens")
        PathManager.mkdir(os.path.dirname(dst_file))

        offsets = find_offsets(src_file, num_workers)
        pool = None
        if num_workers > 1:
            # workers 1..N-1 each binarize their own chunk into "{dst_file}{worker_id}"
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(dst_file, worker_id)
                pool.apply_async(
                    binarize,
                    (args, src_file, prefix, vocab, token_dict,
                     offsets[worker_id], offsets[worker_id + 1]),
                )
            pool.close()

        # the main process handles the first chunk
        ds = indexed_dataset.make_builder(f"{dst_file}.mmap", impl='mmap', vocab_size=len(vocab))
        end = offsets[1]
        with file_io.open(src_file, 'r') as reader:
            reader.seek(0)
            line = file_io.safe_readline(reader)
            while line:
                if end > 0 and reader.tell() > end:
                    break
                line = json_io.json_loads(line)
                code_tokens = vocab.encode(line, out_type=str)
                code_tokens = torch.IntTensor([token_dict.index(token) for token in code_tokens])
                ds.add_item(code_tokens)
                line = reader.readline()

        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                temp_file_path = "{}{}".format(dst_file, worker_id)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))
        ds.finalize(f"{dst_file}.idx")
def binarize(
        filename,
        dict,
        consumer,
        tokenize=None,
        use_func=False,
        append_eos=True,
        reverse_order=False,
        offset=0,
        end=-1,
        func_offset=0,
        already_numberized=False,
        **kwargs,
):
    nseq, ntok = 0, 0  # nseq = sentence number, ntok = token number
    replaced = Counter()  # un-recorded tokens

    def replaced_consumer(word, idx):
        """save un-recorded token"""
        if idx == dict.unk_index and word != dict.unk_word:
            replaced.update([word])

    with file_io.open(filename, "r", encoding="utf-8") as f:
        f.seek(offset)
        if use_func:
            func_reader = file_io.open(filename[:str.rfind(filename, '.')] + '.func_name', 'r')
            func_reader.seek(func_offset)
        line = safe_readline(f)
        func_name = safe_readline(func_reader) if use_func else None
        while line:
            if end > 0 and f.tell() > end:
                break
            if already_numberized:
                id_strings = line.strip().split()
                id_list = [int(id_string) for id_string in id_strings]
                if reverse_order:
                    id_list.reverse()
                if append_eos:
                    id_list.append(dict.eos())
                ids = torch.IntTensor(id_list)
            else:
                ids = dict.encode_line(
                    line=line,
                    line_tokenizer=tokenize,
                    func_name=func_name,
                    add_if_not_exist=False,
                    consumer=replaced_consumer,
                    append_eos=append_eos,
                    reverse_order=reverse_order,
                    **kwargs,
                )
            nseq += 1
            ntok += len(ids)
            consumer(ids)
            line = f.readline()
            func_name = safe_readline(func_reader) if use_func else None
    if use_func:
        func_reader.close()
    return {
        "nseq": nseq,
        "nunk": sum(replaced.values()),
        "ntok": ntok,
        "replaced": replaced,
    }