def main(args):
    source, target, output = args.source, args.target, args.output_path
    assert all([file_exists(x) for x in [source, target]])
    src_contents = load_file_contents(source)
    tgt_contents = load_file_contents(target)
    src_cnt, tgt_cnt = len(src_contents), len(tgt_contents)
    assert src_cnt <= tgt_cnt
    it_print('source {} lines, target {} lines'.format(src_cnt, tgt_cnt))
    if output is None:
        output = 'mapping.json'
    src_mapping = OrderedDict({k + 1: src_contents[k] for k in range(src_cnt)})
    tgt_mapping = {k + 1: tgt_contents[k] for k in range(tgt_cnt)}
    mapping = OrderedDict()
    with tqdm(total=src_cnt, disable=not args.verbose) as pb:
        src_keys = list(sorted(src_mapping.keys()))
        for key in src_keys:
            matched, value = None, src_mapping[key]
            for sub in tgt_mapping:
                if value == tgt_mapping[sub]:
                    mapping[key] = matched = sub
                    break
            # only drop the pair when an identical target line was found;
            # relying on the leaked loop variable would also pop target
            # lines that never matched
            if matched is not None:
                it_print('{} -> {}'.format(key, matched))
                src_mapping.pop(key)
                tgt_mapping.pop(matched)
            pb.update(1)
    write_file_contents(output, to_json(mapping, indent=2))
    write_file_contents('source.left.json', to_json(src_mapping, indent=2))
    write_file_contents('target.left.json', to_json(tgt_mapping, indent=2))
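# A minimal argparse sketch for the line-mapping entry point above; the option
# names mirror the attributes main() reads (source, target, output_path,
# verbose) but are assumptions, since the real parse_arguments() lives elsewhere.
def parse_arguments_sketch():
    import argparse
    parser = argparse.ArgumentParser(
        description='map identical lines from a source file to a target file')
    parser.add_argument('source', help='path of the source file')
    parser.add_argument('target', help='path of the target file')
    parser.add_argument('--output-path', default=None,
                        help='where to write the mapping (defaults to mapping.json)')
    parser.add_argument('--verbose', action='store_true',
                        help='show a tqdm progress bar')
    return parser.parse_args()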
def execute(source, target, co_duplicated, ext='.dedup', verbose=False):
    duplicated = co_duplicated.pop('linenos')
    src_contents = load_file_contents(source, strip=False)
    tgt_contents = load_file_contents(target, strip=False)
    total = len(src_contents)
    assert total == len(tgt_contents)
    src_lines, tgt_lines = [], []
    iterator = zip(src_contents, tgt_contents)
    for lineno, (src, tgt) in enumerate(iterator, start=1):
        if verbose and lineno % 10000 == 0:
            it_print('processed {}'.format(lineno))
        if lineno in duplicated:
            duplicated.remove(lineno)
            continue
        # write to file
        src_lines.append(src)
        tgt_lines.append(tgt)
    count = len(src_lines)
    assert count == len(tgt_lines)
    it_print('total {} lines, after filter, {} left'.format(total, count))
    write_file_contents(source + ext, ''.join(src_lines))
    write_file_contents(target + ext, ''.join(tgt_lines))
def analyze(co_duplicated, src_duplicated, tgt_duplicated):
    co_linenos = set()
    for key in src_duplicated.keys():
        # the same line is marked as duplicated in target
        if key not in tgt_duplicated:
            continue
        source, target = src_duplicated[key], tgt_duplicated[key]
        src, src_linenos = source['subject'], source['linenos']
        tgt, tgt_linenos = target['subject'], target['linenos']
        for lineno in sorted(src_linenos):
            if lineno not in tgt_linenos:
                continue
            co_linenos.add(lineno)
            if key in co_duplicated:
                co_duplicated[key]['linenos'].append(lineno)
            else:
                co_duplicated[key] = {
                    'source': src,
                    'target': tgt,
                    'linenos': [lineno]
                }
            tgt_linenos.remove(lineno)
    co_duplicated['linenos'] = co_linenos
    it_print(message.format('common', len(co_linenos)))
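# Illustrative shapes of the structures analyze() consumes and produces
# (values are made up; only the layout follows the code above and
# parse_duplicated() below):
#   src_duplicated = {'3': {'subject': 'a duplicated sentence', 'linenos': [12, 47]}}
#   tgt_duplicated = {'3': {'subject': 'its target side', 'linenos': [12, 90]}}
# after analyze(co_duplicated, src_duplicated, tgt_duplicated):
#   co_duplicated == {
#       '3': {'source': 'a duplicated sentence',
#             'target': 'its target side',
#             'linenos': [12]},
#       'linenos': {12},
#   }
# execute() then pops 'linenos' and skips exactly those lines in both files.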
def main(args):
    if args.subject is None:
        args.subject = ['Neural Machine Translation']
    key = args.data
    if key not in index or args.reload:
        file_name = key[key.rfind('/') + 1:]
        save_path = concat_path(dblp_data_path, file_name)
        # save data
        contents = request(key)
        if python3 and isinstance(contents, bytes):
            contents = str(contents, encoding='utf-8')
        write_file_contents(save_path, contents)
        # update index
        index[key] = save_path
        index_data = to_json(index, indent=2)
        write_file_contents(index_path, index_data)
    # update cache
    if key not in paper_cache['values']:
        paper_cache['values'][key] = retrieve_paper_titles(
            index[key], **{'source': 'dblp'})
        write_json_contents(paper_cache_path, paper_cache)
    filtered, _ = filter_paper_titles(paper_cache['values'][key], args.subject)
    for i, item in enumerate(filtered):
        it_print('{}: {}'.format(i + 1, item))
def setup_env(args):
    if args.scripts_path is None:
        scripts_path = '{}/vendor/subword-nmt'.format(get_project_root())
        setattr(args, 'scripts_path', scripts_path)
    if not os.path.isdir(args.scripts_path):
        it_print('file path {} does not exist'.format(args.scripts_path))
        exit(0)
    if os.path.isdir('{}/subword_nmt'.format(args.scripts_path)):
        scripts_path = '{}/subword_nmt'.format(args.scripts_path)
        setattr(args, 'scripts_path', scripts_path)
def check_file_paths(file_paths):
    line_cnt = file_lines(file_paths[0])
    for file_path in file_paths:
        if not file_exists(file_path):
            it_print('file path [{}] does not exist.'.format(file_path))
            exit(0)
        cnt = file_lines(file_path)
        if line_cnt != cnt:
            it_print('file lines mismatch: {} => {}.'.format(line_cnt, cnt))
            exit(0)
    return line_cnt
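# file_lines() is a project helper; a minimal sketch of the line counting it is
# assumed to perform (the actual helper may differ):
def file_lines_sketch(file_path):
    import io
    # count lines without loading the whole file into memory
    with io.open(file_path, mode='r', encoding='utf-8') as fp:
        return sum(1 for _ in fp)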
def parse_duplicated(dup_path, prefix=''):
    duplicated = OrderedDict()
    dup_lines = load_file_contents(dup_path)
    for line in dup_lines:
        if python2:
            lineno, dno, subject = line.split(' ', 2)
        else:
            lineno, dno, subject = line.split(maxsplit=2)
        lineno = int(lineno)
        if dno in duplicated:
            duplicated[dno]['linenos'].append(lineno)
        else:
            duplicated[dno] = {
                'subject': subject,
                'linenos': [lineno],
            }
    it_print(message.format(prefix, len(duplicated)))
    return duplicated
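# Each line of the duplicated report is expected to look like
# "<lineno> <dno> <subject>", e.g. "12 3 some duplicated sentence"
# (this layout is assumed from the split() call above and from dup2line()
# in the token-statistics script); grouped by the duplicate id, the example
# yields duplicated['3'] == {'subject': 'some duplicated sentence', 'linenos': [12]}.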
def main(args):
    it_print(vars(args), json_fmt=True, indent=2)
    with io.open(args.file_path, mode='r', encoding='utf-8') as rfp:
        lines = [x.strip() for x in rfp.readlines()]
    total_count = len(lines)
    filtered_count, total_length, filtered_items = 0, 0, []
    blank_count, max_seq_length = 0, 0
    for i, line in enumerate(lines):
        length = len(line.split() if args.split else line)
        total_length += length
        if length == 0:
            blank_count += 1
        if length > max_seq_length:
            max_seq_length = length
        criteria = length > args.max_length
        if criteria:
            filtered_count += 1
            filtered_items.append((i, length, line))
    params = {
        'total_lines': total_count,
        'blank_count': blank_count,
        'max_seq_length': max_seq_length,
        'filtered_count': filtered_count,
        'satisfied_count': total_count - filtered_count,
        'max_length': args.max_length,
        'average_length': total_length / total_count
    }
    message = (
        '{total_lines} lines total, '
        '{blank_count} lines are blank, '
        '{filtered_count} lines longer than {max_length} symbols, '
        '{satisfied_count} lines in range, '
        'max sequence length is {max_seq_length}, '
        'average line length {average_length:.2f}'
    )
    it_print(message.format(**params))
    if filtered_count == 0 or not args.verbose:
        return
    it_print('lines are listed below: ')
    for index, length, line in filtered_items:
        it_print('{lineno}/{length}: {line}'.format(**{
            'lineno': index + 1,
            'length': length,
            'line': line
        }))
def main(args):
    src_paths, tgt_paths = args.source_paths, args.target_paths
    src_logic, tgt_logic = args.source_logic_and, args.target_logic_and
    src_constraint = refine_constraint(args.source_constraint)
    tgt_constraint = refine_constraint(args.target_constraint, src_constraint)
    # check paths
    src_cnt = check_file_paths(src_paths)
    tgt_cnt = check_file_paths(tgt_paths)
    if src_cnt != tgt_cnt:
        it_print('file lines mismatch: {} => {}.'.format(src_cnt, tgt_cnt))
        exit(0)
    t_src_paths = [build_suffix(x, src_constraint) for x in src_paths]
    t_tgt_paths = [build_suffix(x, tgt_constraint) for x in tgt_paths]
    with open_files(*src_paths, mode='r') as src_rfps, \
            open_files(*tgt_paths, mode='r') as tgt_rfps, \
            open_files(*t_src_paths, mode='w') as src_wfps, \
            open_files(*t_tgt_paths, mode='w') as tgt_wfps:
        line_cnt, kept_cnt = 0, 0
        while line_cnt < src_cnt:
            line_cnt += 1
            src_lines = [rfp.readline() for rfp in src_rfps]
            tgt_lines = [rfp.readline() for rfp in tgt_rfps]
            if not check_constraint(src_lines, src_constraint, src_logic):
                continue
            if not check_constraint(tgt_lines, tgt_constraint, tgt_logic):
                continue
            for wfp, line in zip(src_wfps, src_lines):
                wfp.write(line)
            for wfp, line in zip(tgt_wfps, tgt_lines):
                wfp.write(line)
            kept_cnt += 1
    it_print('kept lines: {}/{}'.format(kept_cnt, line_cnt))
    it_print('filter job done.')
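# open_files() above is a project helper that opens several files at once and
# yields their handles; a minimal sketch with contextlib.ExitStack, offered as
# an assumption rather than the actual implementation:
import contextlib
import io


@contextlib.contextmanager
def open_files_sketch(*paths, mode='r'):
    # open every path and yield the list of file objects; all of them are
    # closed again when the with-block exits
    with contextlib.ExitStack() as stack:
        yield [stack.enter_context(io.open(p, mode=mode, encoding='utf-8'))
               for p in paths]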
def list_keys(args):
    version = index.pop('version')
    it_print('version: {}'.format(version))
    keys = filter_keys(index, args.sub_key)
    if len(keys) > 0:
        it_print('data keys:')
        for key in keys:
            it_print(key, indent=2)
def main(args):
    it_print(vars(args), json_fmt=True, indent=2)
    if args.codes_path is None:
        codes_path = '{}.bpe.{}.codes'.format(args.file_path, args.operations)
        setattr(args, 'codes_path', codes_path)
    else:
        setattr(args, 'keep_codes', True)
    if args.output_path is None:
        output_path = '{}.bpe.{}'.format(args.file_path, args.operations)
        setattr(args, 'output_path', output_path)
    # set subword-nmt path
    setup_env(args)
    # learn bpe
    if not os.path.exists(args.codes_path):
        learn_bpe(args)
    # apply bpe
    apply_bpe(args)
    if not args.keep_codes:
        os.remove(args.codes_path)
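# learn_bpe()/apply_bpe() are defined elsewhere in the repository; below is a
# hedged sketch of how they could shell out to the subword-nmt scripts located
# by setup_env(). The wrapper names and the use of subprocess are assumptions;
# only the -i/-o/-s/-c flags come from the upstream subword-nmt CLI.
import subprocess
import sys


def learn_bpe_sketch(args):
    # learn a codes file with the requested number of merge operations
    subprocess.check_call([
        sys.executable, '{}/learn_bpe.py'.format(args.scripts_path),
        '-i', args.file_path, '-o', args.codes_path, '-s', str(args.operations),
    ])


def apply_bpe_sketch(args):
    # segment the corpus with the learned codes
    subprocess.check_call([
        sys.executable, '{}/apply_bpe.py'.format(args.scripts_path),
        '-i', args.file_path, '-c', args.codes_path, '-o', args.output_path,
    ])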
def manage_cache(args):
    if args.delete is not None:
        c_key = args.delete
        if c_key in query_cache:
            query_cache.pop(c_key)
            write_json_contents(query_cache_path, query_cache)
    version = query_cache.pop('version')
    it_print('version: {}'.format(version))
    keys = sorted(query_cache.keys())
    if len(keys) > 0:
        it_print('cached keys:')
        for _key in keys:
            it_print(_key, indent=2)
query_cache_path = concat_path(cache_path, 'queries.json')
cache_size = 80
dblp_data_path = concat_path(data_path, 'dblp')
create_if_not_exists(dblp_data_path)

# initialize and load index
if not file_exists(index_path):
    data = {'version': 0.1}
    write_json_contents(index_path, data)
index = parse_json(load_file_contents(index_path, pieces=False))
index = OrderedDict(index)

# initialize and load paper_cache
if not file_exists(paper_cache_path):
    it_print('building papers cache ...')
    data = {'version': 0.1, 'build_time': current_datetime(), 'values': {}}
    # build cache
    for key, value in index.items():
        if not isinstance(value, string_types):
            continue
        kwargs = {'source': None}
        if value.startswith(dblp_data_path):
            kwargs['source'] = 'dblp'
        data['values'][key] = retrieve_paper_titles(value, **kwargs)
    write_json_contents(paper_cache_path, data)
paper_cache = parse_json(load_file_contents(paper_cache_path, pieces=False))

# initialize and load query_cache
if not file_exists(query_cache_path):
def cached_query(args):
    c_key = '{sub_key}.{mode}+{subject}-{exclude_subject}'.format(**{
        'sub_key': 'All' if args.all else args.sub_key,
        'mode': args.mode,
        'subject': '/'.join(args.subject)
        if args.subject is not None else 'All',
        'exclude_subject': '/'.join(args.exclude_subject)
        if args.exclude_subject is not None else 'None'
    })
    if c_key not in query_cache or args.force:
        if c_key in query_cache:
            last = query_cache[c_key]['time']
        else:
            last = None
        if not args.all:
            filtered = filter_keys(index, args.sub_key)
        else:
            filtered = index.keys()
        paper_titles = []
        total = 0
        cached_paper_titles = paper_cache['values']
        for key in filtered:
            if key not in cached_paper_titles:
                continue
            titles_holder = cached_paper_titles[key]
            part, num = filter_paper_titles(titles_holder, args.subject,
                                            args.exclude_subject,
                                            logic_and=args.mode == 'and')
            total += num
            if args.verbose:
                it_print('{:2} => {}'.format(len(part), key))
            if len(part) > 0:
                paper_titles.append({'key': key, 'titles': part})
        # LRU eviction: when the cache is full, drop the oldest entries
        # (by access time, not by cache key) to make room for the new one
        c_keys = list(query_cache.keys())
        if len(c_keys) >= cache_size:
            tmp = dict()
            for k, v in query_cache.items():
                if not isinstance(v, Iterable):
                    continue
                if 'time' not in v:
                    continue
                tmp[v['time']] = k
            n_removed = max(len(tmp) - cache_size + 1, 0)
            for t in sorted(tmp.keys())[:n_removed]:
                k = tmp[t]
                if k in query_cache:
                    query_cache.pop(k)
        query_cache[c_key] = {
            'paper_titles': paper_titles,
            'total': total,
            'time': current_datetime()
        }
    else:
        v = query_cache[c_key]
        paper_titles, total, last = v['paper_titles'], v['total'], v['time']
        v['time'] = current_datetime()
    write_json_contents(query_cache_path, query_cache)
    if last is not None:
        it_print('last accessed: {}'.format(last))
    it_print('total {} papers'.format(total))
    if len(paper_titles) == 0:
        it_print('no paper is found')
        return
    it_print('paper search result:')
    for i, item in enumerate(paper_titles, start=1):
        key, titles = item['key'], item['titles']
        it_print('({}) {} -> {}'.format(i, key, len(titles)), indent=2)
        for j, title in enumerate(titles, start=1):
            it_print('{}: {}'.format(j, title), indent=4)
    it_print('total {} papers'.format(total))
def main(args):
    if args.subject is not None:
        for i, subject in enumerate(args.subject):
            args.subject[i] = subject.lower()
    if args.exclude_subject is not None:
        for i, subject in enumerate(args.exclude_subject):
            args.exclude_subject[i] = subject.lower()
    if args.sub_key is None:
        args.all = True
    if args.list_keys:
        list_keys(args)
    elif args.query:
        cached_query(args)
    elif args.cached:
        manage_cache(args)


if __name__ == '__main__':
    cmd_str = ' '.join(sys.argv[1:])
    if '\'' in cmd_str:
        it_print('please use double quotes instead of single quotes.')
        exit(0)
    main(parse_arguments())
def main(args):
    token_report = '{} tokens with diversity {}'
    sequence_report = '{} sequences with unique {}'
    count, vocab = 0, set()
    total, seqs = 0, set()

    def dup2line(x):
        return '{} {} {}'.format(*x)

    for file_path in args.file_paths:
        _vocab_stat, _seq_stat, duplicated = count_file_tokens(file_path)
        (_count, _vocab) = _vocab_stat
        (_total, _seqs) = _seq_stat
        count += _count
        vocab |= _vocab
        total += _total
        seqs |= _seqs
        filename = basename(file_path)
        if args.verbose:
            it_print('{} statistics: '.format(filename), indent=2)
            it_print(token_report.format(_count, len(_vocab)), indent=4)
            it_print(sequence_report.format(_total, len(_seqs)), indent=4)
        if args.output_duplicated:
            dup_path = '{}.dup'.format(filename)
            write_iterable_contents(dup_path, duplicated,
                                    obj2line_func=dup2line)
    it_print('corpora statistics: ')
    it_print(token_report.format(count, len(vocab)), indent=2)
    it_print(sequence_report.format(total, len(seqs)), indent=2)