Example 1
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s'
    )
    install_mp_handler()

    os.nice(20)
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    files = sorted(collect_inputs(args.inputs))
    logging.info('Found a total of {} input files.'.format(len(files)))

    with closing(
        BatchWriter(args.batch_size, args.output_dir, args.zeroes)
    ) as writer:
        with Pool(args.processes) as pool:
            minhash_fun = minhash_ps if args.unit == 'p' else minhash_docs
            f = partial(minhash_fun, permutations=args.permutations, n=args.n)
            for input_file, results in pool.imap_unordered(f, files):
                logging.debug('Got results for {}: {}'.format(
                    input_file, len(results['minhash'])))
                writer.write_results(input_file, results)

            pool.close()
            pool.join()
        logging.info('Done.')

    logging.info('Hashed in total {} paragraphs.'.format(writer.total_written))
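
Helpers such as parse_arguments, collect_inputs, minhash_ps and BatchWriter are defined elsewhere in the module. Purely as an illustration of the expected interface (a minimal sketch, not the actual implementation), a collect_inputs-style helper that expands files and directories into a flat list of paths might look like this:

import os

def collect_inputs(inputs):
    """Hypothetical sketch: expand file and directory arguments into a flat,
    deterministic list of input file paths."""
    files = []
    for path in inputs:
        if os.path.isdir(path):
            for entry in sorted(os.listdir(path)):
                full_path = os.path.join(path, entry)
                if os.path.isfile(full_path):
                    files.append(full_path)
        else:
            files.append(path)
    return files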
Example 2
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)

    files = collect_inputs(args.inputs)
    logging.info(f'Scheduled {len(files)} files for finding top {args.n} '
                 f'{"lower cased " if args.lower else ""} values in column '
                 f'{args.column}...')

    with Pool(args.processes) as p:
        c_all = Counter()
        f = partial(process_file,
                    column=args.column,
                    n=args.n,
                    lower=args.lower)
        for c in p.imap_unordered(f, files):
            c_all.update(c)
        for key, freq in sorted(c_all.most_common(args.n),
                                key=lambda kv: (-kv[1], kv[0])):
            print(f'{key}\t{freq}')

    logging.info('Done.')
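
process_file is not shown above; as a rough sketch only (assuming tab-separated input, a zero-based column index and no compression handling), a compatible worker that returns a Counter for one file could be:

from collections import Counter

def process_file(file_name, column, n, lower=False):
    """Hypothetical worker: count the values of one column in a TSV file."""
    counter = Counter()
    with open(file_name, encoding='utf-8') as inf:
        for line in inf:
            fields = line.rstrip('\n').split('\t')
            if column < len(fields):
                value = fields[column]
                counter[value.lower() if lower else value] += 1
    # Returning only the n most frequent values keeps the amount of data
    # sent back through the pool small; the parent merges the partial counts.
    return Counter(dict(counter.most_common(n)))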
Example 3
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)

    files = collect_inputs(args.inputs)
    count_fn = count_file if not args.warc else count_warc_file
    logging.info('Scheduled {} files for counting...'.format(len(files)))
    with Pool(args.processes) as p:
        f = partial(count_fn,
                    docs=args.documents,
                    ps=args.paragraphs,
                    words=args.words,
                    chars=args.characters)
        stats = [0, 0, 0, 0]
        for sub_stats in p.map(f, files):
            for i in range(len(stats)):
                stats[i] += sub_stats[i]

        fields = [args.documents, args.paragraphs, args.words, args.characters]
        if args.latex:
            print(' & ' + ' & '.join('{:,d}'.format(stat) if field else ''
                                     for stat, field in zip(stats, fields)) +
                  r' \\')
        else:
            print(' '.join(
                str(stat) for stat, field in zip(stats, fields) if field))
        p.close()
        p.join()
    logging.info('Done.')
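
count_file and count_warc_file are defined elsewhere; the only contract visible here is that they return a 4-tuple of document, paragraph, word and character counts. A minimal sketch under the assumption of plain-text files with blank-line separated paragraphs and one document per file:

def count_file(file_name, docs=True, ps=True, words=True, chars=True):
    """Hypothetical counter returning (documents, paragraphs, words, characters)
    for one plain-text file with blank-line separated paragraphs."""
    n_ps, n_words, n_chars = 0, 0, 0
    in_p = False
    with open(file_name, encoding='utf-8') as inf:
        for line in inf:
            stripped = line.strip()
            if stripped:
                if not in_p:
                    n_ps += 1
                    in_p = True
                n_words += len(stripped.split())
                n_chars += len(stripped)
            else:
                in_p = False
    # Assumption: one document per file; the boolean flags only decide which
    # of the four numbers the caller prints.
    return 1, n_ps, n_words, n_chars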
Example 4
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    files = sorted(collect_inputs(args.inputs))
    logging.info('Scheduled {} files for reparsing.'.format(len(files)))

    with Pool(args.processes) as pool:
        f = partial(reparse,
                    output_dir=args.output_dir,
                    attrs=args.attrs,
                    meta=args.meta,
                    content=args.content,
                    meta_fields={field: True
                                 for field in args.meta_fields})
        consume(
            otqdm(pool.imap_unordered(f, files),
                  desc='Reparsing corpus files...',
                  total=len(files)))

        pool.close()
        pool.join()
    logging.info('Done.')
Example 5
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)
    files = sorted(collect_inputs([args.input_dir]))
    logging.info('Found a total of {} input files.'.format(len(files)))

    out = openall(args.output_file, 'wt') if args.output_file else sys.stdout

    attributes = list(map(str.lower, args.attributes))
    if args.write_headers:
        print('\t'.join(attributes), file=out)

    with Pool(args.processes) as pool:
        f = partial(extract_attrs_fields, attrs=attributes)
        for lines in pool.map(f, files):
            for attrs in lines:
                print('\t'.join(attrs), file=out)
        pool.close()
        pool.join()

    if out is not sys.stdout:
        out.close()

    logging.info('Done.')
Example 6
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)
    if not op.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    input_files = sorted(collect_inputs(args.inputs))
    logging.info('Found a total of {} input files.'.format(len(input_files)))

    output_files = [
        op.join(args.output_dir, output_file_name(f, args.extension))
        for f in input_files
    ]

    with Pool(args.processes,
              initializer=start_emtsv,
              initargs=[args.emtsv_dir, args.tasks]) as pool:

        f = partial(
            analyze_file if args.file_format == 'text' else analyze_tsv_file,
            max_sentence_length=args.max_sentence_length)
        pool.starmap(f, zip(input_files, output_files))
        logging.debug('Joining processes...')
        pool.close()
        pool.join()
        logging.debug('Joined processes.')

    logging.info('Done.')
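
The initializer/initargs pair makes each worker process call start_emtsv exactly once before it receives any task, so the expensive analysis pipeline is built per process rather than per file. As a generic illustration of the pattern (not the actual emtsv code), an initializer usually stores the resource in a module-level global that the task function then reuses:

from multiprocessing import Pool

_pipeline = None  # set once in every worker process

def start_worker(config):
    """Hypothetical initializer: build an expensive resource once per worker."""
    global _pipeline
    _pipeline = {'config': config}  # stand-in for a real NLP pipeline

def analyze(item):
    # Every task executed in this worker reuses the same _pipeline object.
    return item, _pipeline['config']

if __name__ == '__main__':
    with Pool(2, initializer=start_worker, initargs=('demo',)) as pool:
        print(pool.map(analyze, range(4)))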
Example 7
def main():
    args = parse_arguments()
    min_level = getattr(logging, args.log_level.upper())
    create_patterns(args.start, args.end)

    input_files = sorted(collect_inputs(args.inputs))
    for input_file in input_files:
        try:
            for file_name, log_lines in parse_file(input_file, min_level):
                print(f'{file_name}:')
                for line in log_lines:
                    print(line)
                print()
        except:
            print(f'Error in file {input_file}!', file=sys.stderr)
            raise
Example 8
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    input_files = collect_inputs([args.input_dir])
    logging.info('Scheduled {} files for processing.'.format(len(input_files)))

    if args.command == 'statistics':
        f = partial(collect_stats, min_length=args.min_length)
    else:
        f = partial(remove_same_ps,
                    min_length=args.min_length,
                    output_dir=args.output_dir)
        if not op.isdir(args.output_dir):
            os.makedirs(args.output_dir)

    with Pool(args.processes) as pool:
        sum_stats = defaultdict(CollectStats)
        for stats in pool.imap_unordered(f, input_files):
            for domain, stat in stats.items():
                sum_stats[domain] += stat
        pool.close()
        pool.join()

    if args.command == 'statistics':
        for domain, stat in sorted(sum_stats.items()):
            if stat.affected_docs > 0:
                print('{}\t{}\t{}\t{}\t{}\t{}'.format(domain, stat.docs,
                                                      stat.ps,
                                                      stat.affected_docs,
                                                      stat.affected_ps,
                                                      stat.ps_copies))
    else:
        sum_stat = CollectStats()
        for stat in sum_stats.values():
            sum_stat += stat
        logging.info('Filtered {} paragraphs from {} affected documents '
                     '(out of {} in {}).'.format(sum_stat.ps_copies,
                                                 sum_stat.affected_docs,
                                                 sum_stat.ps, sum_stat.docs))

    logging.info('Done.')
Example 9
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')

    os.nice(20)
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    input_files = collect_inputs(args.input_dirs)
    logging.info('Scheduled {} files for renumbering.'.format(
        len(input_files)))

    batch_size = args.documents if not args.keep_sizes else sys.maxsize
    num_docs = 0
    with closing(BatchWriter(batch_size, args.output_dir, args.zeroes)) as bw:
        for input_file in input_files:
            if not args.keep_sizes:
                logging.debug('Reading file {}...'.format(input_file))
                for document in parse_file(input_file):
                    bw.write(document)
                    num_docs += 1
            else:
                logging.debug('Copying file {}...'.format(input_file))
                bw.copy_file(input_file)

    if not args.keep_sizes:
        logging.info('Done. Renumbered {} files to {}, {} documents each; '
                     '{} documents in total.'.format(len(input_files),
                                                     args.output_dir,
                                                     args.documents, num_docs))
    else:
        logging.info('Done. Renumbered {} files to {}.'.format(
            len(input_files), args.output_dir))
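
BatchWriter is defined elsewhere; from its use here and in Example 1 it writes documents into numbered output files of at most batch_size documents each and keeps a total_written counter. A rough sketch under those assumptions (the file naming scheme and document format are guesses, and copy_file is omitted):

import os

class BatchWriter:
    """Hypothetical sketch: write documents into zero-padded, numbered batch
    files, starting a new file after every batch_size documents."""
    def __init__(self, batch_size, output_dir, zeroes):
        self.batch_size = batch_size
        self.output_dir = output_dir
        self.zeroes = zeroes
        self.total_written = 0
        self.batch = 0
        self.outf = None

    def write(self, document):
        if self.outf is None or self.total_written % self.batch_size == 0:
            self._next_file()
        print(document, file=self.outf)
        self.total_written += 1

    def _next_file(self):
        self.close()
        self.batch += 1
        name = '{:0{}d}.txt'.format(self.batch, self.zeroes)
        self.outf = open(os.path.join(self.output_dir, name), 'wt',
                         encoding='utf-8')

    def close(self):  # called by contextlib.closing() in the examples above
        if self.outf is not None:
            self.outf.close()
            self.outf = None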
Example 10
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    input_files = collect_inputs(args.input_dirs)
    logging.info('Scheduled {} files for shuffling.'.format(len(input_files)))
    if not input_files:
        logging.critical('No input files!')
        sys.exit(1)

    output_files = [
        os.path.join(args.output_dir, os.path.basename(f)) for f in input_files
    ]

    with openall(input_files[0]) as inf:
        header = inf.readline().strip()

    with Pool(args.processes) as inpool, Pool(args.processes) as outpool:
        m = Manager()
        queue = m.Queue(maxsize=1000)
        num_readers = m.Value('I', args.processes)
        lock = m.Lock()

        # Each worker gets a chunk of all input / output files
        input_chunks = list(split_into(input_files, args.processes))
        output_chunks = list(split_into(output_files, args.processes))

        producer_f = partial(producer,
                             queue=queue,
                             num_readers=num_readers,
                             lock=lock)
        inresult = inpool.map_async(producer_f, input_chunks)
        consumer_f = partial(consumer,
                             queue=queue,
                             header=header,
                             documents=args.documents,
                             num_readers=num_readers,
                             lock=lock)
        outresult = outpool.map_async(consumer_f, output_chunks)

        docs_read, docs_written = sum(inresult.get()), sum(outresult.get())

        logging.debug('Joining processes...')
        inpool.close()
        outpool.close()
        inpool.join()
        outpool.join()
        logging.debug('Joined processes.')

        if docs_read != docs_written:
            logging.error(f'The number of documents read ({docs_read}) and '
                          f'the number of documents written ({docs_written}) '
                          f'differs!')

    logging.info('Done.')
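
producer and consumer are defined elsewhere; the shared Manager state above (the queue, the num_readers counter and the lock) suggests the usual shutdown protocol: each producer decrements num_readers once it runs out of input, and the consumers keep draining the queue until the counter reaches zero and the queue is empty. A simplified sketch of that protocol (shuffling, the header and the documents batching are omitted):

import queue as queue_module

def producer(input_files, queue, num_readers, lock):
    """Hypothetical producer: push every line of its input files, then sign off."""
    read = 0
    try:
        for input_file in input_files:
            with open(input_file, encoding='utf-8') as inf:
                for line in inf:
                    queue.put(line)
                    read += 1
    finally:
        with lock:
            num_readers.value -= 1  # tell the consumers one reader has finished
    return read

def consumer(output_files, queue, header, documents, num_readers, lock):
    """Hypothetical consumer: drain the queue until all producers have signed
    off and no items remain. (header, documents and output_files are unused
    in this sketch.)"""
    written = 0
    while True:
        try:
            item = queue.get(timeout=1)
            written += 1  # a real consumer would write item into output_files
        except queue_module.Empty:
            with lock:
                no_readers = num_readers.value == 0
            if no_readers and queue.empty():
                break
    return written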