Example #1
def main(args):
    source, target, output = args.source, args.target, args.output_path
    assert all([file_exists(x) for x in [source, target]])

    src_contents = load_file_contents(source)
    tgt_contents = load_file_contents(target)
    src_cnt, tgt_cnt = len(src_contents), len(tgt_contents)
    assert src_cnt <= tgt_cnt
    it_print('source {} lines, target {} lines'.format(src_cnt, tgt_cnt))

    if output is None:
        output = 'mapping.json'

    src_mapping = OrderedDict((k + 1, src_contents[k]) for k in range(src_cnt))
    tgt_mapping = OrderedDict((k + 1, tgt_contents[k]) for k in range(tgt_cnt))

    mapping = OrderedDict()
    with tqdm(total=src_cnt, disable=not args.verbose) as pb:
        src_keys = sorted(src_mapping.keys())
        for key in src_keys:
            match, value = None, src_mapping[key]
            for sub in tgt_mapping:
                if value == tgt_mapping[sub]:
                    match = sub
                    break
            # consume the pair only when a matching target line was found;
            # the loop variable alone is non-None even when nothing matched
            if match is not None:
                mapping[key] = match
                it_print('{} -> {}'.format(key, match))
                src_mapping.pop(key)
                tgt_mapping.pop(match)
            pb.update(1)

    write_file_contents(output, to_json(mapping, indent=2))
    write_file_contents('source.left.json', to_json(src_mapping, indent=2))
    write_file_contents('target.left.json', to_json(tgt_mapping, indent=2))
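
# Usage sketch for the alignment recovery above, assuming the project
# helpers (file_exists, load_file_contents, it_print, to_json,
# write_file_contents) are in scope; the corpus paths are hypothetical.
from argparse import Namespace

args = Namespace(source='corpus.src', target='corpus.tgt',
                 output_path=None, verbose=True)
main(args)  # writes mapping.json plus source.left.json / target.left.json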
Example #2
def execute(source, target, co_duplicated, ext='.dedup', verbose=False):
    duplicated = co_duplicated.pop('linenos')
    src_contents = load_file_contents(source, strip=False)
    tgt_contents = load_file_contents(target, strip=False)
    total = len(src_contents)
    assert total == len(tgt_contents)

    src_lines, tgt_lines = [], []
    iterator = zip(src_contents, tgt_contents)
    for lineno, (src, tgt) in enumerate(iterator, start=1):
        if verbose and lineno % 10000 == 0:
            it_print('processed {}'.format(lineno))
        if lineno in duplicated:
            duplicated.remove(lineno)
            continue
        # keep the pair; both sides are written out in one batch below
        src_lines.append(src)
        tgt_lines.append(tgt)

    count = len(src_lines)
    assert count == len(tgt_lines)

    it_print('total {} lines, after filter, {} left'.format(total, count))
    write_file_contents(source + ext, ''.join(src_lines))
    write_file_contents(target + ext, ''.join(tgt_lines))
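
# Usage sketch with a hypothetical co-duplicate report: 'linenos' holds the
# 1-based line numbers to drop from both sides of the parallel corpus.
co_dup = {'linenos': {2, 5}}
execute('corpus.src', 'corpus.tgt', co_dup, verbose=True)
# filtered copies land in corpus.src.dedup / corpus.tgt.dedup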
Example #3
def analyze(co_duplicated, src_duplicated, tgt_duplicated):
    co_linenos = set()
    for key in src_duplicated.keys():
        # only subjects that are also marked as duplicated in the target
        # can contribute co-duplicated lines
        if key not in tgt_duplicated:
            continue

        source, target = src_duplicated[key], tgt_duplicated[key]
        src, src_linenos = source['subject'], source['linenos']
        tgt, tgt_linenos = target['subject'], target['linenos']
        for lineno in sorted(src_linenos):
            if lineno not in tgt_linenos:
                continue

            co_linenos.add(lineno)
            if key in co_duplicated:
                co_duplicated[key]['linenos'].append(lineno)
            else:
                co_duplicated[key] = {
                    'source': src,
                    'target': tgt,
                    'linenos': [lineno]
                }
            tgt_linenos.remove(lineno)
    co_duplicated['linenos'] = co_linenos
    it_print(message.format('common', len(co_linenos)))
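
# A small in-memory sketch of the expected shapes (ids, subjects and line
# numbers are made up; the module-level message template is assumed): both
# inputs map a duplicate id to its subject and the line numbers it occupies.
src_dup = {'d1': {'subject': 'hello world', 'linenos': [3, 7]}}
tgt_dup = {'d1': {'subject': 'hallo welt', 'linenos': [3, 9]}}
co_dup = {}
analyze(co_dup, src_dup, tgt_dup)
# co_dup['d1']['linenos'] == [3] and co_dup['linenos'] == {3}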
Example #4
def main(args):
    if args.subject is None:
        args.subject = ['Neural Machine Translation']

    key = args.data
    if key not in index or args.reload:
        file_name = key[key.rfind('/') + 1:]
        save_path = concat_path(dblp_data_path, file_name)

        # save data
        contents = request(key)
        if python3 and isinstance(contents, bytes):
            contents = str(contents, encoding='utf-8')
        write_file_contents(save_path, contents)

        # update index
        index[key] = save_path
        index_data = to_json(index, indent=2)
        write_file_contents(index_path, index_data)

        # update cache
        if key not in paper_cache['values']:
            paper_cache['values'][key] = retrieve_paper_titles(
                index[key], source='dblp')
        write_json_contents(paper_cache_path, paper_cache)

    filtered, _ = filter_paper_titles(paper_cache['values'][key], args.subject)
    for i, item in enumerate(filtered):
        it_print('{}: {}'.format(i + 1, item))
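
# Usage sketch with a hypothetical DBLP key; the index, paper_cache and
# request helpers defined at module level are assumed to be loaded.
from argparse import Namespace

args = Namespace(data='https://dblp.org/db/conf/acl/acl2020.html',
                 subject=None, reload=False)
main(args)  # lists cached titles matching 'Neural Machine Translation'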
Example #5
def setup_env(args):
    if args.scripts_path is None:
        scripts_path = '{}/vendor/subword-nmt'.format(get_project_root())
        setattr(args, 'scripts_path', scripts_path)
    if not os.path.isdir(args.scripts_path):
        it_print('scripts path {} does not exist'.format(args.scripts_path))
        exit(0)
    if os.path.isdir('{}/subword_nmt'.format(args.scripts_path)):
        scripts_path = '{}/subword_nmt'.format(args.scripts_path)
        setattr(args, 'scripts_path', scripts_path)
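
# Usage sketch: with scripts_path unset, the vendored subword-nmt checkout
# under the project root is used (the directory layout is assumed).
from argparse import Namespace

args = Namespace(scripts_path=None)
setup_env(args)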
Example #6
def check_file_paths(file_paths):
    line_cnt = None
    for file_path in file_paths:
        # check existence before counting lines, also for the first file
        if not file_exists(file_path):
            it_print('file path [{}] does not exist.'.format(file_path))
            exit(0)
        cnt = file_lines(file_path)
        if line_cnt is None:
            line_cnt = cnt
        elif line_cnt != cnt:
            it_print('file lines mismatch: {} => {}.'.format(line_cnt, cnt))
            exit(0)
    return line_cnt
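
# Usage sketch (hypothetical paths): returns the shared line count, or
# exits early if a file is missing or the counts disagree.
line_cnt = check_file_paths(['corpus.src', 'corpus.tgt'])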
Example #7
def parse_duplicated(dup_path, prefix=''):
    duplicated = OrderedDict()
    dup_lines = load_file_contents(dup_path)
    for line in dup_lines:
        # split on any whitespace run into (lineno, dno, subject); passing
        # None as the separator works on both Python 2 and 3
        lineno, dno, subject = line.split(None, 2)
        lineno = int(lineno)
        if dno in duplicated:
            duplicated[dno]['linenos'].append(lineno)
        else:
            duplicated[dno] = {
                'subject': subject,
                'linenos': [lineno],
            }

    it_print(message.format(prefix, len(duplicated)))
    return duplicated
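
# Usage sketch: each line of the .dup report is expected to look like
# '<lineno> <duplicate-id> <subject text ...>' (the path is hypothetical).
src_duplicated = parse_duplicated('corpus.src.dup', prefix='source')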
Example #8
def main(args):
    it_print(vars(args), json_fmt=True, indent=2)

    with io.open(args.file_path, mode='r', encoding='utf-8') as rfp:
        lines = [x.strip() for x in rfp.readlines()]
        total_count = len(lines)

    filtered_count, total_length, filtered_items = 0, 0, []
    blank_count, max_seq_length = 0, 0
    for i, line in enumerate(lines):
        length = len(line.split() if args.split else line)
        total_length += length

        if length == 0:
            blank_count += 1

        if length > max_seq_length:
            max_seq_length = length

        if length > args.max_length:
            filtered_count += 1
            filtered_items.append((i, length, line))
    params = {
        'total_lines': total_count,
        'blank_count': blank_count,
        'max_seq_length': max_seq_length,
        'filtered_count': filtered_count,
        'satisfied_count': total_count - filtered_count,
        'max_length': args.max_length,
        'average_length': total_length / float(max(total_count, 1))
    }
    message = (
        '{total_lines} lines total, '
        '{blank_count} lines are blank, '
        '{filtered_count} lines longer than {max_length} symbols, '
        '{satisfied_count} lines in range, '
        'max sequence length is {max_seq_length}, '
        'average line length {average_length:.2f}'
    )
    it_print(message.format(**params))

    if filtered_count == 0 or not args.verbose:
        return

    it_print('filtered lines are listed below:')
    for index, length, line in filtered_items:
        it_print('{}/{}: {}'.format(index + 1, length, line))
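
# Usage sketch (hypothetical path): report length statistics for one file,
# counting whitespace tokens rather than characters via split=True.
from argparse import Namespace

args = Namespace(file_path='corpus.src', split=True,
                 max_length=100, verbose=False)
main(args)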
Example #9
def main(args):
    src_paths, tgt_paths = args.source_paths, args.target_paths
    src_logic, tgt_logic = args.source_logic_and, args.target_logic_and
    src_constraint = refine_constraint(args.source_constraint)
    tgt_constraint = refine_constraint(args.target_constraint, src_constraint)

    # check paths
    src_cnt = check_file_paths(src_paths)
    tgt_cnt = check_file_paths(tgt_paths)
    if src_cnt != tgt_cnt:
        it_print('file lines mismatch: {} => {}.'.format(src_cnt, tgt_cnt))
        exit(0)

    t_src_paths = [build_suffix(x, src_constraint) for x in src_paths]
    t_tgt_paths = [build_suffix(x, tgt_constraint) for x in tgt_paths]
    with open_files(*src_paths, mode='r') as src_rfps, \
            open_files(*tgt_paths, mode='r') as tgt_rfps, \
            open_files(*t_src_paths, mode='w') as src_wfps, \
            open_files(*t_tgt_paths, mode='w') as tgt_wfps:
        line_cnt, kept_cnt = 0, 0
        while line_cnt < src_cnt:
            line_cnt += 1
            src_lines = [rfp.readline() for rfp in src_rfps]
            tgt_lines = [rfp.readline() for rfp in tgt_rfps]
            if not check_constraint(src_lines, src_constraint, src_logic):
                continue
            if not check_constraint(tgt_lines, tgt_constraint, tgt_logic):
                continue

            for wfp, line in zip(src_wfps, src_lines):
                wfp.write(line)
            for wfp, line in zip(tgt_wfps, tgt_lines):
                wfp.write(line)
            kept_cnt += 1

    it_print('kept lines: {}/{}'.format(kept_cnt, line_cnt))
    it_print('filter job done.')
Example #10
def list_keys(args):
    version = index.pop('version')
    it_print('version: {}'.format(version))
    keys = filter_keys(index, args.sub_key)
    if len(keys) > 0:
        it_print('data keys:')
        for key in keys:
            it_print(key, indent=2)
Example #11
def main(args):
    it_print(vars(args), json_fmt=True, indent=2)
    if args.codes_path is None:
        codes_path = '{}.bpe.{}.codes'.format(args.file_path, args.operations)
        setattr(args, 'codes_path', codes_path)
    else:
        setattr(args, 'keep_codes', True)

    if args.output_path is None:
        output_path = '{}.bpe.{}'.format(args.file_path, args.operations)
        setattr(args, 'output_path', output_path)

    # set subword-nmt path
    setup_env(args)

    # learn bpe
    if not os.path.exists(args.codes_path):
        learn_bpe(args)

    # apply bpe
    apply_bpe(args)

    if not args.keep_codes:
        os.remove(args.codes_path)
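
# Usage sketch with hypothetical values: learn a BPE code table, apply it,
# and drop the codes afterwards since no codes_path was given.
from argparse import Namespace

args = Namespace(file_path='corpus.src', operations=32000,
                 codes_path=None, output_path=None,
                 keep_codes=False, scripts_path=None)
main(args)  # writes corpus.src.bpe.32000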
Example #12
def manage_cache(args):
    if args.delete is not None:
        c_key = args.delete
        if c_key in query_cache:
            query_cache.pop(c_key)
            write_json_contents(query_cache_path, query_cache)

    version = query_cache.pop('version')
    it_print('version: {}'.format(version))
    keys = sorted(query_cache.keys())
    if len(keys) > 0:
        it_print('cached keys:')
        for _key in keys:
            it_print(_key, indent=2)
Example #13
query_cache_path = concat_path(cache_path, 'queries.json')
cache_size = 80

dblp_data_path = concat_path(data_path, 'dblp')
create_if_not_exists(dblp_data_path)

# initialize and load index
if not file_exists(index_path):
    data = {'version': 0.1}
    write_json_contents(index_path, data)
index = parse_json(load_file_contents(index_path, pieces=False))
index = OrderedDict(index)

# initialize and load paper_cache
if not file_exists(paper_cache_path):
    it_print('building papers cache ...')
    data = {'version': 0.1, 'build_time': current_datetime(), 'values': {}}

    # build cache
    for key, value in index.items():
        if not isinstance(value, string_types):
            continue
        kwargs = {'source': None}
        if value.startswith(dblp_data_path):
            kwargs['source'] = 'dblp'
        data['values'][key] = retrieve_paper_titles(value, **kwargs)
    write_json_contents(paper_cache_path, data)
paper_cache = parse_json(load_file_contents(paper_cache_path, pieces=False))

# initialize and load query_cache
if not file_exists(query_cache_path):
    data = {'version': 0.1}
    write_json_contents(query_cache_path, data)
query_cache = parse_json(load_file_contents(query_cache_path, pieces=False))
Example #14
def cached_query(args):
    c_key = '{sub_key}.{mode}+{subject}-{exclude_subject}'.format(
        sub_key='All' if args.all else args.sub_key,
        mode=args.mode,
        subject='/'.join(args.subject) if args.subject is not None else 'All',
        exclude_subject=('/'.join(args.exclude_subject)
                         if args.exclude_subject is not None else 'None'))
    if c_key not in query_cache or args.force:
        if c_key in query_cache:
            last = query_cache[c_key]['time']
        else:
            last = None

        if not args.all:
            filtered = filter_keys(index, args.sub_key)
        else:
            filtered = index.keys()

        paper_titles = []
        total = 0
        cached_paper_titles = paper_cache['values']
        for key in filtered:
            if key not in cached_paper_titles:
                continue
            titles_holder = cached_paper_titles[key]
            part, num = filter_paper_titles(titles_holder,
                                            args.subject,
                                            args.exclude_subject,
                                            logic_and=args.mode == 'and')
            total += num
            if args.verbose:
                it_print('{:2} => {}'.format(len(part), key))
            if len(part) > 0:
                paper_titles.append({'key': key, 'titles': part})

        # LRU eviction: drop the least recently accessed entries so that
        # at most cache_size - 1 remain before the new entry is inserted
        c_keys = list(query_cache.keys())
        if len(c_keys) >= cache_size:
            tmp = dict()
            for k, v in query_cache.items():
                if not isinstance(v, Iterable):
                    continue
                if 'time' not in v:
                    continue
                tmp[v['time']] = k
            evict_cnt = max(len(tmp) - cache_size + 1, 0)
            removed = [tmp[t] for t in sorted(tmp.keys())[:evict_cnt]]
            for k in removed:
                if k in query_cache:
                    query_cache.pop(k)
        query_cache[c_key] = {
            'paper_titles': paper_titles,
            'total': total,
            'time': current_datetime()
        }
    else:
        v = query_cache[c_key]
        paper_titles, total, last = v['paper_titles'], v['total'], v['time']
        v['time'] = current_datetime()
    write_json_contents(query_cache_path, query_cache)

    if last is not None:
        it_print('last accessed: {}'.format(last))
    it_print('total {} papers'.format(total))
    if not paper_titles:
        it_print('no paper is found')
        return

    it_print('paper search result:')
    for i, item in enumerate(paper_titles, start=1):
        key, titles = item['key'], item['titles']
        it_print('({}) {} -> {}'.format(i, key, len(titles)), indent=2)
        for j, title in enumerate(titles, start=1):
            it_print('{}: {}'.format(j, title), indent=4)
    it_print('total {} papers'.format(total))
Example #15
def main(args):
    if args.subject is not None:
        for i, subject in enumerate(args.subject):
            args.subject[i] = subject.lower()
    if args.exclude_subject is not None:
        for i, subject in enumerate(args.exclude_subject):
            args.exclude_subject[i] = subject.lower()
    if args.sub_key is None:
        args.all = True

    if args.list_keys:
        list_keys(args)
    elif args.query:
        cached_query(args)
    elif args.cached:
        manage_cache(args)


if __name__ == '__main__':
    cmd_str = ' '.join(sys.argv[1:])
    if "'" in cmd_str:
        it_print('please use double quotes instead of single quotes.')
        exit(0)
    main(parse_arguments())
Example #16
def main(args):
    token_report = '{} tokens with diversity {}'
    sequence_report = '{} sequences with unique {}'
    count, vocab = 0, set()
    total, seqs = 0, set()

    def dup2line(x):
        return '{} {} {}'.format(*x)

    for file_path in args.file_paths:
        _vocab_stat, _seq_stat, duplicated = count_file_tokens(file_path)
        _count, _vocab = _vocab_stat
        _total, _seqs = _seq_stat
        count += _count
        vocab |= _vocab
        total += _total
        seqs |= _seqs

        filename = basename(file_path)
        if args.verbose:
            it_print('{} statistics: '.format(filename), indent=2)
            it_print(token_report.format(_count, len(_vocab)), indent=4)
            it_print(sequence_report.format(_total, len(_seqs)), indent=4)

        if args.output_duplicated:
            dup_path = '{}.dup'.format(filename)
            write_iterable_contents(dup_path,
                                    duplicated,
                                    obj2line_func=dup2line)

    it_print('corpora statistics: ')
    it_print(token_report.format(count, len(vocab)), indent=2)
    it_print(sequence_report.format(total, len(seqs)), indent=2)
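
# Usage sketch (hypothetical paths): aggregate token and sequence
# statistics over both sides of a corpus and dump per-file .dup reports.
from argparse import Namespace

args = Namespace(file_paths=['corpus.src', 'corpus.tgt'],
                 verbose=True, output_duplicated=True)
main(args)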