def rewrite_file(input_file, input_dir, output_dir, extension):
    ipath = op.join(input_dir, input_file)
    opath = op.join(output_dir, '{}.{}'.format(
        op.splitext(input_file)[0], extension))
    with openall(ipath, 'rt') as inf, openall(opath, 'wt') as outf:
        for line in inf:
            outf.write(line)
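
A minimal usage sketch: assuming openall picks the compression codec from the target extension, rewrite_file effectively recompresses a file. The directory names and the bz2 target below are made up for illustration.

# Hedged usage sketch: recompress every file in a (hypothetical) input
# directory to bzip2. Only openall's extension-based codec selection is
# assumed; everything else is illustrative.
import os

input_dir, output_dir = 'corpus_gz', 'corpus_bz2'   # made-up paths
os.makedirs(output_dir, exist_ok=True)
for input_file in os.listdir(input_dir):
    # 'foo.txt.gz' -> splitext gives 'foo.txt' -> output file 'foo.txt.bz2'
    rewrite_file(input_file, input_dir, output_dir, 'bz2')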
Example #2
def collect_documents(minhash_prefix, lines):
    block_lines = 0  # The first line that falls in the current document file
    next_line = 0  # The index of the next line in the list lines
    with openall(minhash_prefix + '.files') as filef:
        with openall(minhash_prefix + '.doc_ids') as linef:
            for doc_file, num_lines, _, offset in (l.strip().split() for l in filef):
                num_lines, offset = int(num_lines), int(offset)
                # Let's find the last line that is still in the current file
                last_line = next_line
                while (
                    last_line < len(lines) and
                    block_lines <= lines[last_line] < block_lines + num_lines
                ):
                    last_line += 1
                if last_line != next_line:
                    docs_to_extract = {}
                    # There are such lines. Let's read them!
                    linef.seek(int(offset))
                    for i, url in enumerate(linef, start=block_lines + 1):
                        if i == lines[next_line]:
                            docs_to_extract[url.strip()] = i
                            next_line += 1
                            if next_line == last_line:
                                break
                    yield from extract_documents(docs_to_extract, doc_file)
                block_lines += num_lines
    assert next_line == len(lines)
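
collect_documents reads two auxiliary index files whose layout is not shown in this listing; the sketch below only spells out the shape the code above relies on, with made-up contents.

# Illustrative (made-up) contents for a prefix such as 'minhashes/batch_01':
#
# minhashes/batch_01.files -- one whitespace-separated row per corpus file:
#     corpus/cc_0001.txt.gz 1500 <unused> 0
#     corpus/cc_0002.txt.gz  820 <unused> 73412
# i.e. (doc_file, number of documents, an ignored field, byte offset of the
# file's first document id within the .doc_ids file).
#
# minhashes/batch_01.doc_ids -- one document URL per line, in corpus order.
#
# 'lines' is expected to be a sorted list of global document indices; the
# final assertion checks that every requested index was actually found.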
def filter_file(file_name: str, input_dir: str, output_dir: str,
                allowed_mimes: Set[str], bad_indexp: Pattern):
    input_file = os.path.join(input_dir, file_name)
    output_file = os.path.join(output_dir, file_name)
    with openall(input_file) as inf, openall(output_file, 'wt') as outf:
        it = read_fields(inf)
        it = basic_filter(it)
        it = mime_filter(it, allowed_mimes)
        it = http_filter(it)
        if bad_indexp:
            it = bad_index_filter(it, bad_indexp)
        for fields in sorted(it, key=lambda f: f[:2]):
            print(' '.join(fields), file=outf)
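
The filters chained here (basic_filter, mime_filter, http_filter, bad_index_filter) are not included in this listing. A hedged sketch of the shape such a stage presumably has, i.e. a generator that consumes and yields lists of index fields; the field position used below is an assumption.

# Assumed shape of one filter stage; not the actual implementation.
from typing import Iterator, List, Set

def mime_filter_sketch(it: Iterator[List[str]],
                       allowed_mimes: Set[str]) -> Iterator[List[str]]:
    for fields in it:
        # assumption: one of the whitespace-separated fields is the MIME type
        if fields[2] in allowed_mimes:
            yield fields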
Example #4
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)
    files = sorted(collect_inputs([args.input_dir]))
    logging.info('Found a total of {} input files.'.format(len(files)))

    out = openall(args.output_file, 'wt') if args.output_file else sys.stdout

    attributes = list(map(str.lower, args.attributes))
    if args.write_headers:
        print('\t'.join(attributes), file=out)

    with Pool(args.processes) as pool:
        f = partial(extract_attrs_fields, attrs=attributes)
        for lines in pool.map(f, files):
            for attrs in lines:
                print('\t'.join(attrs), file=out)
        pool.close()
        pool.join()

    if out != sys.stdout:
        out.close()

    logging.info('Done.')
def read_files_into_set(files):
    ret = set()
    for f in files:
        with openall(f, 'rt') as inf:
            for line in map(str.strip, inf):
                ret.add(line)
    return ret
Example #6
def remove_same_ps(input_file: str, min_length: int,
                   output_dir: str) -> Dict[str, CollectStats]:
    """Removes duplicate paragraphs from documents."""
    stats = {}
    with openall(op.join(output_dir, op.basename(input_file)), 'wt') as outf:
        for doc in parse_file(input_file, True, False, True):
            domain = urlsplit(doc.attrs['url']).netloc
            stat = stats.setdefault(domain, CollectStats())
            stat.docs += 1
            stat.ps += len(doc.paragraphs)

            seen_ps, kept_ps = set(), []
            for p in doc.paragraphs:
                if p not in seen_ps:
                    seen_ps.add(p)
                    kept_ps.append(p)
                else:
                    stat.ps_copies += 1

            if len(doc.paragraphs) != len(kept_ps):
                stat.affected_docs += 1
                doc.paragraphs = kept_ps
            print(doc, file=outf)

    return stats
Example #7
def reparse(input_file: str, output_dir: str, attrs: bool, meta: bool,
            content: bool, **meta_fields: bool):
    logging.debug(f'Reparsing file {input_file}...')
    with openall(op.join(output_dir, op.basename(input_file)), 'wt') as outf:
        for doc in parse_file(input_file, attrs, meta, content, **meta_fields):
            print(doc, file=outf)
    logging.debug(f'Reparsed file {input_file}.')
Example #8
def process_file(filename, input_dir, output_dir):
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))

    stats = Counter()

    with notempty(openall(output_file, 'wt')) as outf:
        header, it = headtail(parse_file(input_file, True))
        num_fields = len(header)
        do_wsafter = 'wsafter' not in header
        if do_wsafter:
            header.insert(1, 'wsafter')
            logging.debug('Adding the wsafter field...')
        print('\t'.join(header), file=outf)
        for document in it:
            stats['documents'] += 1
            try:
                stats['token_errors'] += fix_invalid_lines(document, num_fields)
                if do_wsafter:
                    add_wsafter(document)
            except ValueError:
                logging.exception(f'Error in file {input_file}')
                raise
            print(document, file=outf)
    return stats
Example #9
def parse_file(log_file: str,
               min_level: int) -> Generator[Tuple[str, str], None, None]:
    """Filters log messages in *log_file*."""
    files = {}
    pids = defaultdict(list)
    with openall(log_file, 'rb') as inf:
        for raw_line in inf:
            try:
                line = raw_line.decode('utf-8')
            except UnicodeDecodeError as ude:
                # Most likely quntoken errors
                continue
            m = startp.match(line)
            if m:
                files[m.group('filename')] = m.group('pid')
                continue
            m = endp.match(line)
            if m:
                if m.group('pid') in pids:
                    yield m.group('filename'), pids[m.group('pid')]
                    del pids[m.group('pid')]
                del files[m.group('filename')]
                continue
            m = logp.match(line)
            if m and getattr(logging, m.group('level')) >= min_level:
                pids[m.group('pid')].append(line)
                continue
        # Files whose processing could not finish
        for file_name, pid in files.items():
            yield file_name, pids[pid]
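
The module-level patterns startp, endp and logp are not part of this listing. Judging from the group names used above and from the very similar literals in urls_from_log below, they could look roughly like this (an assumption, not the actual definitions):

import re

# Assumed shapes only -- the real patterns may differ.
startp = re.compile(r' - (?P<pid>\d+) - INFO - Processing (?P<filename>.+?)\.\.\.$')
endp = re.compile(r' - (?P<pid>\d+) - INFO - Processed (?P<filename>.+?)\.\.\.$')
logp = re.compile(r' - (?P<pid>\d+) - '
                  r'(?P<level>DEBUG|INFO|WARNING|ERROR|CRITICAL) - ')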
def urls_from_log(log_file, warc_file):
    warc_m = re.match(r'(.+?)_\d+\.warc(\.gz)$', warc_file)
    if warc_m:
        warc_name = warc_m.group(1) + warc_m.group(2)
    else:
        warc_name = warc_file

    urls = []
    start_p = re.compile(r' - (\d+) - INFO - Processing (.+?)...$')
    url_p = re.compile(r" - (\d+) - INFO - Nothing's left of (.+?) after "
                       r"boilerplate removal")
    end_p = re.compile(r' - (\d+) - INFO - Processed (.+?)...$')

    catching = False
    with openall(log_file) as inf:
        for line in inf:
            if not catching:
                ms = start_p.search(line)
                if ms and ms.group(2) == warc_name:
                    catching = ms.group(1)
            else:
                mu = url_p.search(line)
                if mu and mu.group(1) == catching:
                    urls.append(mu.group(2))
                else:
                    me = end_p.search(line)
                    if me and me.group(1) == catching:
                        catching = False
                        break

    return urls
Example #11
def consumer(output_files: List[str], queue: Queue, header: str,
             documents: int, num_readers: Value, lock: Lock) -> int:
    """
    Reads :class:`Document`s from the shared queue and writes them to one of
    the output files at random.

    :param output_files: list of output file names.
    :param queue: the queue shared with all processes.
    :param header: the header of the tsv files. Written to all output files.
    :param documents: the number of documents to write to an output file.
    :param num_readers: a shared variable holding the number of readers that
                        are still active. This function exits if two conditions
                        are met: the queue is empty and *num_readers* is 0.
    :param lock: a lock that regulates access to *num_readers*.
    :returns: the number of documents written.
    """
    logging.info(f'Consumer started with {len(output_files)} files.')
    output_names = [os.path.basename(f) for f in output_files]
    outfs = [notempty(openall(f, 'wt')) for f in output_files]
    written = [0 for _ in outfs]
    docs_written = 0

    # Write the header
    for outf in outfs:
        print(header, file=outf)

    while outfs:
        i = random.randint(0, len(outfs) - 1)
        try:
            print(queue.get(timeout=5), file=outfs[i])
            written[i] += 1
            docs_written += 1
            if docs_written % 1000 == 0:
                logging.debug(
                    f'Consumer has written {docs_written} documents.')
            if written[i] == documents:
                logging.info(f'Written {documents} documents to '
                             f'{output_names[i]}; closing...')
                outfs[i].close()
                del outfs[i]
                del written[i]
                del output_names[i]
        except Empty:
            with lock:
                if num_readers.value == 0:
                    logging.info('Timeout waiting for queue; cleaning up...')
                    break
        except:
            logging.exception(f'Exception writing {output_names[i]}!')
            sys.exit(3)

    # Close any dangling output files
    for i in range(len(outfs)):
        logging.info(f'Written {written[i]} documents to '
                     f'{output_names[i]}; closing...')
        outfs[i].close()

    logging.info(f'Consumer finished; written {docs_written} documents.')
    return docs_written
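
The producer side of this protocol is not shown in this listing. Below is a minimal sketch of how the shared objects in the signature might be wired together; the reader function, queue size and driver name are assumptions, only the Queue/Value/Lock types mirror the consumer's parameters.

from multiprocessing import Lock, Process, Queue, Value

def reader(input_file, queue, num_readers, lock):
    # Hypothetical producer: pushes printable documents onto the queue and
    # decrements the shared reader count when done, so the consumer's
    # timeout branch knows when to stop.
    try:
        for doc in parse_file(input_file):
            queue.put(doc)
    finally:
        with lock:
            num_readers.value -= 1

def run_shuffle(input_files, output_files, header, docs_per_file):
    queue = Queue(maxsize=1000)                 # size is an arbitrary choice
    num_readers = Value('i', len(input_files))
    lock = Lock()
    procs = [Process(target=reader, args=(f, queue, num_readers, lock))
             for f in input_files]
    for p in procs:
        p.start()
    written = consumer(output_files, queue, header, docs_per_file,
                       num_readers, lock)
    for p in procs:
        p.join()
    return written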
Example #12
def read_names(names_file):
    """
    Reads the names of the documents and returns a list of _their hashes_. The
    reason we return a hash instead of the actual URL is to minimize the memory
    consumption; for checking equality, a hash should suffice.
    """
    with openall(names_file) as inf:
        return [hash(line.strip().split('\t', 1)[0]) for line in inf]
def file_to_dict(index_file: str, keep: str, skip_urls: UrlList, url_fn: UrlFn,
                 global_uniqs: UrlIndexDict):
    """
    Collects all URLs from an index file and deduplicates them in two phases:

    1. First, the index lines / URLs are deduplicated inside the file, in case
       an index file contains the same URL twice (not likely, but who knows?)
    2. Then, the URLs are deduplicated across all files / processes. To achieve
       this, we use a shared dictionary kept in-memory.

    :param index_file: the name of the input file
    :param keep: which record should win: the ``latest`` or ``biggest``
    :param skip_urls: the list of URLs to skip (e.g. because we already have them)
    :param url_fn: the URL transformation function to apply to each URL. In the
                   scope of this program, this is either hashing or nothing.
    :param global_uniqs: the shared dictionary of unique URLs
    """
    logging.info('Collecting URLs from {}...'.format(index_file))
    stats = CollectStats()
    try:
        # In-file deduplication
        with openall(index_file, 'rt') as inf:
            uniqs = {}  # type: UrlIndexDict
            file_id = file_name_p.search(index_file).group(1)
            line_no = 0
            for line_no, line in enumerate(map(str.strip, inf), start=1):
                try:
                    # After filtering, the line is prepended with the "domain"
                    # I skip that and extract it myself
                    url, warc, offset, length = line.split()[:7][-6:-2]
                    if url in skip_urls:
                        stats.skipped += 1
                    else:
                        record = IndexRecord(warc, offset, length, file_id)
                        uniq_record(url_fn(url), record, uniqs, keep)
                except:
                    logging.exception('Exception in file {}:{}'.format(
                        index_file, line_no))
                    break

            if line_no == 0:
                logging.info(
                    'File {} is empty; returning...'.format(index_file))
                return
            logging.info(
                'Self-deduplicated {} URLs in {} to {}; skipped {}.'.format(
                    line_no, index_file, len(uniqs), stats.skipped))

        # Global deduplication
        for url, record in uniqs.items():
            stats[uniq_record(url, record, global_uniqs, keep)] += 1

        logging.info('Cross-deduplicated {} URLs in {} to '
                     '{} (overwrote {}; {} new).'.format(
                         len(uniqs), index_file, stats.new + stats.overwrite,
                         stats.overwrite, stats.new))
    except:
        logging.exception('Exception in file {}'.format(index_file))
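
The global phase relies on global_uniqs being a dictionary shared between worker processes, and on the uniq_record helper, which is not shown here. A minimal sketch of how such a shared dictionary could be set up with a Manager; the pool size, defaults and the driver name are hypothetical.

from functools import partial
from multiprocessing import Manager, Pool

def collect_all_urls(index_files, keep='biggest', skip_urls=frozenset(),
                     url_fn=lambda url: url, processes=4):
    # Hedged driver sketch: every worker writes into the same Manager dict,
    # which is what makes the cross-file deduplication in file_to_dict work.
    with Manager() as manager:
        global_uniqs = manager.dict()
        f = partial(file_to_dict, keep=keep, skip_urls=skip_urls,
                    url_fn=url_fn, global_uniqs=global_uniqs)
        with Pool(processes) as pool:
            pool.map(f, index_files)
        return dict(global_uniqs)   # copy out before the manager shuts down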
Example #14
def analyze_tsv_file(input_file: str,
                     output_file: str,
                     max_sentence_length: int = sys.maxsize):
    logging.info('Analyzing tsv {}...'.format(input_file))
    from __init__ import build_pipeline

    # Install xtsv warning & error logging filter, so that we know where the
    # problem happens
    xtsv_filter = XtsvFilter()
    logging.getLogger().handlers[0].addFilter(xtsv_filter)
    # So that we know that everything is filtered
    assert len(logging.getLogger().handlers) == 1

    lemma_col = None
    try:
        with openall(input_file) as inf, openall(output_file, 'wt') as outf:
            xtsv_filter.set(input_file, '<?>', '<?>')
            last_prog = build_pipeline(inf, used_tools, inited_tools, {}, True)
            for rline in last_prog:
                outf.write(rline)
                # Try to identify the lemma column
                if lemma_col is None:
                    try:
                        lemma_col = rline.rstrip('\n').split('\t').index(
                            'lemma')
                    except ValueError:
                        pass
                break
            for rline in last_prog:
                # The other part of the no-lemma handling code
                if lemma_col:
                    fields = rline.rstrip('\n').split('\t')
                    if len(fields) > 1 and not fields[lemma_col]:
                        fields[lemma_col] = fields[0]  # form
                        print('\t'.join(fields), file=outf)
                    else:
                        # Marginally faster without the join
                        outf.write(rline)
                else:
                    outf.write(rline)
        logging.info('Finished {}.'.format(input_file))
    except:
        logging.exception('Error in file {}!'.format(input_file))
Example #15
    def new_file(self):
        """Closes the old file and opens a new one."""
        self.close()

        self.batch += 1
        new_file = os.path.join(
            self.out_dir,
            '{}{{:0{}}}.txt.gz'.format(self.name_prefix,
                                       self.zeroes).format(self.batch))
        logging.debug('Opening file {}...'.format(new_file))
        self.outf = openall(new_file, 'wt')
Example #16
def read_bad_index(bad_index_file: str) -> Pattern:
    """
    Reads the bad index file and returns a regex pattern that encompasses all
    the individual patterns in the file.
    """
    if bad_index_file:
        with openall(bad_index_file) as inf:
            return re.compile('^{}$'.format('|'.join(
                '(?:{})'.format(line.strip()) for line in inf)))
    else:
        return None
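
A short illustration of what the combined pattern looks like; the file contents are made up.

# If bad_index_file contained the two (made-up) lines
#     .*\.example\.com.*
#     .*spam.*
# the function would compile the single pattern
#     ^(?:.*\.example\.com.*)|(?:.*spam.*)$
# so bad_index_filter above only needs one regex test per index line instead
# of looping over the individual patterns.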
Example #17
def process_index_file(s3: Any, filename: str, output_dir: str, retries: int,
                       rotate_info: Tuple):
    """
    Processes an index file: downloads all URLs in it. This function is
    basically a wrapper to :func:`process_stream`. ``filename`` is the name of
    the index file; the rest of the parameters are the same as for
    :func:`process_stream`.
    """
    logging.info('Starting file {}...'.format(filename))
    with openall(filename) as inpfh:
        process_stream(s3, ('{} {}'.format(filename, line) for line in inpfh),
                       output_dir, retries, rotate_info)
    logging.info('Finished file {}.'.format(filename))
Example #18
def parse_file(
        tsv_file: str,
        use_headers: bool = True
) -> Generator[Union[List, Document], None, None]:
    """
    Same as :func:`parse`, but expects the name of a file as the first argument.
    """
    try:
        with openall(tsv_file) as inf:
            yield from parse(inf, use_headers)
    except IllegalStateError as ise:
        ise.args = f'{ise.args[0][:6]}in file {tsv_file} {ise.args[0][6:]}',
        raise ise
def filter_file(input_file, output_file, uniqs, url_fn: UrlFn) -> FilterStats:
    """
    Filters an index file; i.e. drops all duplicate URLs.

    :param input_file: the input index file
    :param output_file: the output index file
    :param uniqs: the shared dictionary of unique URLs
    :param url_fn: the URL transformation function to apply to each URL. In the
                   scope of this program, this is either hashing or nothing.
    """
    logging.info('Filtering file {}...'.format(input_file))
    stats = FilterStats(old_files=1)
    with openall(input_file, 'rt') as inf, notempty(openall(output_file,
                                                            'wt')) as outf:
        line_no = lines_printed = 0
        for line_no, line in enumerate(map(str.strip, inf), start=1):
            try:
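                # As in file_to_dict, the line may start with an extra
                # "domain" field; the slicing skips it and keeps
                # url, warc, offset and length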
                url, warc, offset, length = line.split()[:7][-6:-2]
                record = IndexRecord(warc, offset, length)
                if record == uniqs.get(url_fn(url)):
                    print(line, file=outf)
                    lines_printed += 1
            except:
                logging.exception('Exception in file {}:{}'.format(
                    input_file, line_no))

    if line_no:
        logging.info('Kept {} URLs out of {} in {}.'.format(
            lines_printed, line_no, input_file))
        stats.old_urls = line_no
    else:
        logging.info('File {} was empty.'.format(input_file))

    if lines_printed:
        stats.new_files = 1
        stats.new_urls = lines_printed
    return stats
Example #20
def collect_lines_from_file(line_file, head, *extra_lines):
    """
    Collects approximately the first *head* line numbers from the file.
    Also appends to the returned list all additional line numbers in
    *extra_lines*.
    """
    lines = []
    lines_set = set()
    with openall(line_file) as inf:
        for new_lines in ([int(e) for e in l.strip().split()] for l in inf):
            new_lines = [line for line in new_lines if line not in lines_set]
            lines += new_lines
            lines_set.update(new_lines)
            if len(lines) >= head:
                break
    lines += [int(line) for line in extra_lines if int(line) not in lines_set]
    return lines
Example #21
def read_group_documents(group: Iterator[str]) -> Iterator[Document]:
    """Returns an iterator of the documents in a group."""
    last_file = None
    f = None
    try:
        for line in group:
            _, doc_file, doc_pos, doc_len = line.split('\t')
            if doc_file != last_file:
                if f:
                    f.close()
                f = openall(doc_file, 'rb')
                last_file = doc_file
            f.seek(int(doc_pos))
            yield from parse(f.read(int(doc_len)).decode('utf-8').split('\n'))
    finally:
        if f:
            f.close()
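
The group lines consumed here are the tab-separated index records written by main_index_documents further below; an illustrative, made-up line and the way it is used:

# A made-up index line of the expected shape (URL, file, position, length);
# the URL field is ignored by read_group_documents:
#
#     https://example.com/page1<TAB>corpus/cc_0001.txt.gz<TAB>10240<TAB>3517
#
# Seeking to doc_pos and reading doc_len bytes thus yields exactly one
# document, which parse() turns into a Document object.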
Example #22
def main_distribute(args):
    """The main function for distributing the index file."""
    weights = [weight for _, weight in args.hosts]
    hosts = [
        openall(host_to_path(args.index, host), 'wt') for host, _ in args.hosts
    ]
    lens = [0 for _ in weights]
    try:
        for _, group in group_index(read_index(args.index)):
            i = lens.index(min(lens))  # argmin
            logging.debug('Adding {} items to host {} ({}).'.format(
                len(group), i, hosts[i].name))
            for line in group:
                print(line, file=hosts[i])
            # Higher weight means "I need more documents"
            lens[i] += len(group) / weights[i]
    finally:
        for i, host in enumerate(hosts):
            logging.info('Wrote {} lines to {}.'.format(
                round(lens[i] * weights[i]), host.name))
            host.close()
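
The weighting is easiest to see on a small, made-up example:

# Worked (made-up) example of the balancing above: with two hosts of
# weights 2 and 1, a group of 100 index lines adds 100/2 = 50 to the first
# host's effective length but 100/1 = 100 to the second's, so the argmin
# keeps steering roughly two thirds of the groups to the weight-2 host.
# The log message at the end multiplies back by the weight to report the
# approximate number of lines actually written.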
Example #23
def deduplicate_batch_documents(batch_prefix,
                                output_dir,
                                input_dir=None,
                                ignore_missing_files=False):
    """
    Filters documents not present in the batch and writes the filtered corpus
    files to output_dir. As above, input_dir can be specified if the location
    information in the batch files is outdated.

    Empty files will not be written.
    """
    batch_base = op.basename(batch_prefix)
    logging.info('Filtering batch {}...'.format(batch_base))

    kept, total = 0, 0
    num_files = 0
    for input_file, results in read_batch(batch_prefix):
        file_base = op.basename(input_file)
        url_set = set('_'.join(doc_id) for doc_id in results['id'])
        input_file = op.join(input_dir, file_base) if input_dir else input_file
        if os.path.isfile(input_file):
            with notempty(openall(op.join(output_dir, file_base),
                                  'wt')) as outf:
                doc_no = 0  # so that 'total' is correct even for empty files
                for doc_no, doc in enumerate(parse_file(input_file), start=1):
                    if doc.attrs['url'] in url_set:
                        print(doc, file=outf)
                        kept += 1
                total += doc_no
            num_files += 1
        elif ignore_missing_files:
            logging.debug(
                'Input file {} was not found; ignoring...'.format(input_file))
        else:
            raise FileNotFoundError(
                'Input file {} not found.'.format(input_file))

    logging.info('Filtered batch {} of {} files; '
                 'kept {} documents out of {}.'.format(batch_base, num_files,
                                                       kept, total))
    return kept, total
Example #24
def main_index_documents(args):
    """The main function for indexing documents."""
    input_files = [
        op.join(input_dir, f) for input_dir in args.input_dirs
        for f in os.listdir(input_dir)
    ]

    logging.info('Found a total of {} input files.'.format(len(input_files)))
    index = []
    with Pool(args.processes) as pool:
        f = partial(index_file)
        for input_file, urls_poss_lens in pool.imap(f, input_files):
            for doc_url, doc_pos, doc_len in urls_poss_lens:
                index.append((doc_url, input_file, doc_pos, doc_len))
        pool.close()
        pool.join()

    index.sort(key=index_key)
    with openall(args.index, 'wt') as outf:
        for domain, group in groupby(
                index, lambda record: urlsplit(record[0]).netloc):
            urls_written = set()
            for doc_url, doc_file, doc_pos, doc_len in group:
                # This also filters http:// + https:// variants
                try:
                    pure_url = doc_url[doc_url.find('://') + 3:]
                    if pure_url not in urls_written:
                        urls_written.add(pure_url)
                        print(doc_url,
                              doc_file,
                              doc_pos,
                              doc_len,
                              sep='\t',
                              file=outf)
                        logging.debug('Printed URL {}.'.format(doc_url))
                    else:
                        logging.debug(
                            'Skipped duplicate URL {}.'.format(doc_url))
                except:
                    logging.exception('Error somewhere!!!')
def process_file(filename,
                 input_dir,
                 output_dir,
                 languages,
                 language_unit,
                 min_len_str,
                 keep_urls=None,
                 drop_urls=None):
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))

    stats = Counter()
    it = parse_file(input_file, True, True, True)
    it = each_doc(it, stats)
    if languages:
        if language_unit == 'doc':
            it = filter_languages_doc(it, languages, stats)
        else:
            it = filter_languages_p(it, languages, stats)
    if min_len_str:
        it = filter_length(it, min_len_str, stats)
    if drop_urls:
        # Get the right list: from the Manager or the local one
        url_list = drop_urls if drop_urls.__class__.__name__ == 'DictProxy' \
                             else urls_to_drop  # noqa
        it = filter_urls(it, url_list, stats)
    if keep_urls:
        # Get the right list: from the Manager or the local one
        url_list = keep_urls if keep_urls.__class__.__name__ == 'DictProxy' \
                             else urls_to_keep  # noqa
        it = retain_urls(it, url_list, stats)
    try:
        with notempty(openall(output_file, 'wt')) as outf:
            for doc in it:
                print(doc, file=outf)
    except:
        logging.exception('Got an error.')
    logging.info('Finished processing file {}...'.format(filename))
    return stats
def main():
    args = parse_arguments()

    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    if args.urls:
        urls = urls_from_file(args.urls)
    else:
        urls = urls_from_log(args.log_file, args.warc)

    for record in warc.open(args.warc):
        url = record.header['WARC-Target-URI']
        if url in urls:
            page = record.payload.read().split(b'\r\n\r\n', maxsplit=1)[1]
            file_name = os.path.join(
                args.output_dir,
                url.rstrip(os.path.sep).replace(os.path.sep, '_'))
            if args.output_dir:
                with openall(file_name, 'wb') as outf:
                    outf.write(page)
            else:
                print(page)
def read_urls(urls_file: str, url_fn: UrlFn) -> UrlSet:
    """
    Reads URLs from the file ``urls_file``, one per line. The URLs are
    returned in a set; either as a string or as a hash value,
    depending on what the transformation function ``url_fn`` does.

    Using hashes instead of the full url can conserve memory. In our
    experiments, we have not encountered collisions yet.

    Note: no normalization of URLs for now, as the library that I tried was
    slooooooooow. This also means that versions of the same URL might stay in
    the index, including http / https versions. Hopefully,
    document deduplication will take care of this.
    """
    with openall(urls_file) as inf:
        urls = set()
        for no_urls, url in enumerate(map(str.strip, inf), start=1):
            urls.add(url_fn(url))
            if no_urls % 5000000 == 0:
                logging.debug('Loaded {} urls from {}...'.format(
                    no_urls, urls_file))
        logging.info('Loaded {} urls from {}; {} unique.'.format(
            no_urls, urls_file, len(urls)))
        return urls
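
A hedged sketch of the two UrlFn variants the docstring alludes to (hashing to save memory, or keeping the raw URL); the helper name and the file name in the usage line are hypothetical.

def make_url_fn(use_hashes: bool) -> UrlFn:
    # hash() keeps only an integer fingerprint of each URL; the identity
    # function keeps the full string.
    return hash if use_hashes else (lambda url: url)

# e.g. skip_urls = read_urls('seen_urls.txt.gz', make_url_fn(True))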
Example #28
def read_allowed_mimes(allowed_mimes_file: str) -> Set[str]:
    """Reads the allowed mimes list."""
    with openall(allowed_mimes_file) as inf:
        return set(line.strip() for line in inf)
Example #29
def read_index(index_file: str) -> Iterator[str]:
    """Reads the index file. Not really necessary, but oh well."""
    with openall(index_file) as inf:
        yield from map(str.strip, inf)
Example #30
    def get_url(self, f):
        with openall(f) as inf:
            for line in inf:
                m = self.p.match(line)
                if m:
                    return m.group(1)