def rewrite_file(input_file, input_dir, output_dir, extension):
    ipath = op.join(input_dir, input_file)
    opath = op.join(output_dir, '{}.{}'.format(
        op.splitext(input_file)[0], extension))
    with openall(ipath, 'rt') as inf, openall(opath, 'wt') as outf:
        for line in inf:
            outf.write(line)

def collect_documents(minhash_prefix, lines):
    block_lines = 0  # The first line that falls in the current document file
    next_line = 0  # The index of the next line in the list lines
    with openall(minhash_prefix + '.files') as filef:
        with openall(minhash_prefix + '.doc_ids') as linef:
            for doc_file, num_lines, _, offset in (
                l.strip().split() for l in filef
            ):
                num_lines, offset = int(num_lines), int(offset)
                # Let's find the last line that is still in the current file
                last_line = next_line
                while (
                    last_line < len(lines) and
                    block_lines <= lines[last_line] < block_lines + num_lines
                ):
                    last_line += 1
                if last_line != next_line:
                    docs_to_extract = {}
                    # There are such lines. Let's read them!
                    linef.seek(int(offset))
                    for i, url in enumerate(linef, start=block_lines + 1):
                        if i == lines[next_line]:
                            docs_to_extract[url.strip()] = i
                            next_line += 1
                            if next_line == last_line:
                                break
                    yield from extract_documents(docs_to_extract, doc_file)
                block_lines += num_lines
    assert next_line == len(lines)

def filter_file(file_name: str, input_dir: str, output_dir: str,
                allowed_mimes: Set[str], bad_indexp: Pattern):
    input_file = os.path.join(input_dir, file_name)
    output_file = os.path.join(output_dir, file_name)
    with openall(input_file) as inf, openall(output_file, 'wt') as outf:
        it = read_fields(inf)
        it = basic_filter(it)
        it = mime_filter(it, allowed_mimes)
        it = http_filter(it)
        if bad_indexp:
            it = bad_index_filter(it, bad_indexp)
        for fields in sorted(it, key=lambda f: f[:2]):
            print(' '.join(fields), file=outf)

def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')
    install_mp_handler()

    os.nice(20)

    files = sorted(collect_inputs([args.input_dir]))
    logging.info('Found a total of {} input files.'.format(len(files)))

    out = openall(args.output_file, 'wt') if args.output_file else sys.stdout
    attributes = list(map(str.lower, args.attributes))
    if args.write_headers:
        print('\t'.join(attributes), file=out)

    with Pool(args.processes) as pool:
        f = partial(extract_attrs_fields, attrs=attributes)
        for lines in pool.map(f, files):
            for attrs in lines:
                print('\t'.join(attrs), file=out)
        pool.close()
        pool.join()

    if out != sys.stdout:
        out.close()
    logging.info('Done.')

def read_files_into_set(files):
    ret = set()
    for f in files:
        with openall(f, 'rt') as inf:
            for line in map(str.strip, inf):
                ret.add(line)
    return ret

def remove_same_ps(input_file: str, min_length: int,
                   output_dir: str) -> Dict[str, CollectStats]:
    """Removes duplicate paragraphs from documents."""
    stats = {}
    with openall(op.join(output_dir, op.basename(input_file)), 'wt') as outf:
        for doc in parse_file(input_file, True, False, True):
            domain = urlsplit(doc.attrs['url']).netloc
            stat = stats.setdefault(domain, CollectStats())
            stat.docs += 1
            stat.ps += len(doc.paragraphs)
            seen_ps, kept_ps = set(), []
            for p in doc.paragraphs:
                if p not in seen_ps:
                    seen_ps.add(p)
                    kept_ps.append(p)
                else:
                    stat.ps_copies += 1
            if len(doc.paragraphs) != len(kept_ps):
                stat.affected_docs += 1
                doc.paragraphs = kept_ps
            print(doc, file=outf)
    return stats

def reparse(input_file: str, output_dir: str, attrs: bool, meta: bool,
            content: bool, **meta_fields: bool):
    logging.debug(f'Reparsing file {input_file}...')
    with openall(op.join(output_dir, op.basename(input_file)), 'wt') as outf:
        for doc in parse_file(input_file, attrs, meta, content, **meta_fields):
            print(doc, file=outf)
    logging.debug(f'Reparsed file {input_file}.')

def process_file(filename, input_dir, output_dir):
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))
    stats = Counter()
    with notempty(openall(output_file, 'wt')) as outf:
        header, it = headtail(parse_file(input_file, True))
        num_fields = len(header)
        do_wsafter = 'wsafter' not in header
        if do_wsafter:
            header.insert(1, 'wsafter')
            logging.debug('Adding the wsafter field...')
        print('\t'.join(header), file=outf)
        for document in it:
            stats['documents'] += 1
            try:
                stats['token_errors'] += fix_invalid_lines(document, num_fields)
                if do_wsafter:
                    add_wsafter(document)
            except ValueError:
                logging.exception(f'Error in file {input_file}')
                raise
            print(document, file=outf)
    return stats

def parse_file(log_file: str,
               min_level: int) -> Generator[Tuple[str, List[str]], None, None]:
    """Filters log messages in *log_file*."""
    files = {}
    pids = defaultdict(list)
    with openall(log_file, 'rb') as inf:
        for raw_line in inf:
            try:
                line = raw_line.decode('utf-8')
            except UnicodeDecodeError:
                # Most likely quntoken errors
                continue
            m = startp.match(line)
            if m:
                files[m.group('filename')] = m.group('pid')
                continue
            m = endp.match(line)
            if m:
                if m.group('pid') in pids:
                    yield m.group('filename'), pids[m.group('pid')]
                    del pids[m.group('pid')]
                del files[m.group('filename')]
                continue
            m = logp.match(line)
            if m and getattr(logging, m.group('level')) >= min_level:
                pids[m.group('pid')].append(line)
                continue
    # Files whose processing could not finish
    for file_name, pid in files.items():
        yield file_name, pids[pid]

def urls_from_log(log_file, warc_file):
    warc_m = re.match(r'(.+?)_\d+\.warc(\.gz)$', warc_file)
    if warc_m:
        warc_name = warc_m.group(1) + warc_m.group(2)
    else:
        warc_name = warc_file
    urls = []
    start_p = re.compile(r' - (\d+) - INFO - Processing (.+?)...$')
    url_p = re.compile(r" - (\d+) - INFO - Nothing's left of (.+?) after "
                       r"boilerplate removal")
    end_p = re.compile(r' - (\d+) - INFO - Processed (.+?)...$')
    catching = False
    with openall(log_file) as inf:
        for line in inf:
            if not catching:
                ms = start_p.search(line)
                if ms and ms.group(2) == warc_name:
                    catching = ms.group(1)
            else:
                mu = url_p.search(line)
                if mu and mu.group(1) == catching:
                    urls.append(mu.group(2))
                else:
                    me = end_p.search(line)
                    if me and me.group(1) == catching:
                        catching = False
                        break
    return urls

def consumer(output_files: List[str], queue: Queue, header: str,
             documents: int, num_readers: Value, lock: Lock) -> int:
    """
    Reads :class:`Document`s from the shared queue and writes them to one of
    the output files at random.

    :param output_files: list of output file names.
    :param queue: the queue shared with all processes.
    :param header: the header of the tsv files. Written to all output files.
    :param documents: the number of documents to write to an output file.
    :param num_readers: a shared variable holding the number of readers that
                        are still active. This function exits if two
                        conditions are met: the queue is empty and
                        *num_readers* is 0.
    :param lock: a lock that regulates access to *num_readers*.
    :returns: the number of documents written.
    """
    logging.info(f'Consumer started with {len(output_files)} files.')
    output_names = [os.path.basename(f) for f in output_files]
    outfs = [notempty(openall(f, 'wt')) for f in output_files]
    written = [0 for _ in outfs]
    docs_written = 0
    # Write the header
    for outf in outfs:
        print(header, file=outf)
    while outfs:
        i = random.randint(0, len(outfs) - 1)
        try:
            print(queue.get(timeout=5), file=outfs[i])
            written[i] += 1
            docs_written += 1
            if docs_written % 1000 == 0:
                logging.debug(
                    f'Consumer has written {docs_written} documents.')
            if written[i] == documents:
                logging.info(f'Written {documents} documents to '
                             f'{output_names[i]}; closing...')
                outfs[i].close()
                del outfs[i]
                del written[i]
                del output_names[i]
        except Empty:
            with lock:
                if num_readers.value == 0:
                    logging.info('Timeout waiting for queue; cleaning up...')
                    break
        except:
            logging.exception(f'Exception writing {output_names[i]}!')
            sys.exit(3)
    # Close any dangling output files
    for i in range(len(outfs)):
        logging.info(f'Written {written[i]} documents to '
                     f'{output_names[i]}; closing...')
        outfs[i].close()
    logging.info(f'Consumer finished; written {docs_written} documents.')
    return docs_written

def read_names(names_file):
    """
    Reads the names of the documents and returns a list of *their hashes*.

    The reason we return a hash instead of the actual URL is to minimize the
    memory consumption; for checking equality, a hash should suffice.
    """
    with openall(names_file) as inf:
        return [hash(line.strip().split('\t', 1)[0]) for line in inf]

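# A minimal usage sketch (the file name below is hypothetical; `read_names`
# only needs the path of a tsv whose first column is the document name / URL):
#
#     names = read_names('batch_names.tsv.gz')
#     is_known = hash('https://example.com/page') in names
#
# Note that lookups must use the same built-in `hash()`; string hashes are
# salted per interpreter run unless PYTHONHASHSEED is fixed, so they are only
# comparable within the same process (or runs sharing the seed).
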
def file_to_dict(index_file: str, keep: str, skip_urls: UrlList,
                 url_fn: UrlFn, global_uniqs: UrlIndexDict):
    """
    Collects all URLs from an index file and deduplicates them in two phases:

    1. First, the index lines / URLs are deduplicated inside the file, in case
       an index file contains the same URL twice (not likely, but who knows?)
    2. Then, the URLs are deduplicated across all files / processes. To
       achieve this, we use a shared dictionary kept in memory.

    :param index_file: the name of the input file
    :param keep: which record should win: the ``latest`` or ``biggest``
    :param skip_urls: the list of URLs to skip (e.g. because we already have
                      them)
    :param url_fn: the URL transformation function to apply to each URL. In
                   the scope of this program, this is either hashing or
                   nothing.
    :param global_uniqs: the shared dictionary of unique URLs
    """
    logging.info('Collecting URLs from {}...'.format(index_file))
    stats = CollectStats()
    try:
        # In-file deduplication
        with openall(index_file, 'rt') as inf:
            uniqs = {}  # type: UrlIndexDict
            file_id = file_name_p.search(index_file).group(1)
            line_no = 0
            for line_no, line in enumerate(map(str.strip, inf), start=1):
                try:
                    # After filtering, the line is prepended with the "domain";
                    # I skip that and extract the fields myself
                    url, warc, offset, length = line.split()[:7][-6:-2]
                    if url in skip_urls:
                        stats.skipped += 1
                    else:
                        record = IndexRecord(warc, offset, length, file_id)
                        uniq_record(url_fn(url), record, uniqs, keep)
                except:
                    logging.exception('Exception in file {}:{}'.format(
                        index_file, line_no))
                    break
        if line_no == 0:
            logging.info(
                'File {} is empty; returning...'.format(index_file))
            return
        logging.info(
            'Self-deduplicated {} URLs in {} to {}; skipped {}.'.format(
                line_no, index_file, len(uniqs), stats.skipped))

        # Global deduplication
        for url, record in uniqs.items():
            stats[uniq_record(url, record, global_uniqs, keep)] += 1
        logging.info('Cross-deduplicated {} URLs in {} to '
                     '{} (overwrote {}; {} new).'.format(
                         len(uniqs), index_file,
                         stats.new + stats.overwrite,
                         stats.overwrite, stats.new))
    except:
        logging.exception('Exception in file {}'.format(index_file))

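# An illustration of the field extraction above (a sketch; the exact layout of
# a filtered index line is an assumption here). With the domain prepended and
# seven whitespace-separated fields, `split()[:7][-6:-2]` keeps fields 1-4,
# i.e. URL, WARC file, offset and length:
#
#     line = ('example.com https://example.com/a '
#             'warcs/file-00001.warc.gz 123 456 200 text/html')
#     line.split()[:7][-6:-2]
#     # -> ['https://example.com/a', 'warcs/file-00001.warc.gz', '123', '456']
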
def analyze_tsv_file(input_file: str, output_file: str,
                     max_sentence_length: int = sys.maxsize):
    logging.info('Analyzing tsv {}...'.format(input_file))
    from __init__ import build_pipeline

    # Install xtsv warning & error logging filter, so that we know where the
    # problem happens
    xtsv_filter = XtsvFilter()
    logging.getLogger().handlers[0].addFilter(xtsv_filter)
    # So that we know that everything is filtered
    assert len(logging.getLogger().handlers) == 1

    lemma_col = None
    try:
        with openall(input_file) as inf, openall(output_file, 'wt') as outf:
            xtsv_filter.set(input_file, '<?>', '<?>')
            last_prog = build_pipeline(inf, used_tools, inited_tools, {}, True)
            for rline in last_prog:
                outf.write(rline)
                # Try to identify the lemma column
                if lemma_col is None:
                    try:
                        lemma_col = rline.rstrip('\n').split('\t').index(
                            'lemma')
                    except ValueError:
                        pass
                break
            for rline in last_prog:
                # The other part of the no-lemma handling code
                if lemma_col:
                    fields = rline.rstrip('\n').split('\t')
                    if len(fields) > 1 and not fields[lemma_col]:
                        fields[lemma_col] = fields[0]  # form
                        print('\t'.join(fields), file=outf)
                    else:
                        # Marginally faster without the join
                        outf.write(rline)
                else:
                    outf.write(rline)
        logging.info('Finished {}.'.format(input_file))
    except:
        logging.exception('Error in file {}!'.format(input_file))

def new_file(self):
    """Closes the old file and opens a new one."""
    self.close()
    self.batch += 1
    new_file = os.path.join(
        self.out_dir,
        '{}{{:0{}}}.txt.gz'.format(
            self.name_prefix, self.zeroes).format(self.batch))
    logging.debug('Opening file {}...'.format(new_file))
    self.outf = openall(new_file, 'wt')

def read_bad_index(bad_index_file: str) -> Pattern:
    """
    Reads the bad index file and returns a regex pattern that encompasses all
    the individual patterns in the file.
    """
    if bad_index_file:
        with openall(bad_index_file) as inf:
            return re.compile('^{}$'.format('|'.join(
                '(?:{})'.format(line.strip()) for line in inf)))
    else:
        return None

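# A small illustration of the composed pattern (hypothetical file contents):
# for a bad index file with the two lines
#
#     foo\.example\.org/.*
#     bar\.example\.org/login.*
#
# the function returns
#
#     re.compile(r'^(?:foo\.example\.org/.*)|(?:bar\.example\.org/login.*)$')
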
def process_index_file(s3: Any, filename: str, output_dir: str,
                       retries: int, rotate_info: Tuple):
    """
    Processes an index file: downloads all URLs in it. This function is
    basically a wrapper to :func:`process_stream`. ``filename`` is the name
    of the index file; the rest of the parameters are the same as for
    :func:`process_stream`.
    """
    logging.info('Starting file {}...'.format(filename))
    with openall(filename) as inpfh:
        process_stream(s3, ('{} {}'.format(filename, line) for line in inpfh),
                       output_dir, retries, rotate_info)
    logging.info('Finished file {}.'.format(filename))

def parse_file(
    tsv_file: str, use_headers: bool = True
) -> Generator[Union[List, Document], None, None]:
    """
    Same as :func:`parse`, but expects the name of a file as the first
    argument.
    """
    try:
        with openall(tsv_file) as inf:
            yield from parse(inf, use_headers)
    except IllegalStateError as ise:
        ise.args = f'{ise.args[0][:6]}in file {tsv_file} {ise.args[0][6:]}',
        raise ise

def filter_file(input_file, output_file, uniqs, url_fn: UrlFn) -> FilterStats:
    """
    Filters an index file; i.e. drops all duplicate URLs.

    :param input_file: the input index file
    :param output_file: the output index file
    :param uniqs: the shared dictionary of unique URLs
    :param url_fn: the URL transformation function to apply to each URL. In
                   the scope of this program, this is either hashing or
                   nothing.
    """
    logging.info('Filtering file {}...'.format(input_file))
    stats = FilterStats(old_files=1)
    with openall(input_file, 'rt') as inf, \
            notempty(openall(output_file, 'wt')) as outf:
        line_no = lines_printed = 0
        for line_no, line in enumerate(map(str.strip, inf), start=1):
            try:
                url, warc, offset, length = line.split()[:7][-6:-2]
                record = IndexRecord(warc, offset, length)
                if record == uniqs.get(url_fn(url)):
                    print(line, file=outf)
                    lines_printed += 1
            except:
                logging.exception('Exception in file {}:{}'.format(
                    input_file, line_no))
    if line_no:
        logging.info('Kept {} URLs out of {} in {}.'.format(
            lines_printed, line_no, input_file))
        stats.old_urls = line_no
    else:
        logging.info('File {} was empty.'.format(input_file))
    if lines_printed:
        stats.new_files = 1
        stats.new_urls = lines_printed
    return stats

def collect_lines_from_file(line_file, head, *extra_lines):
    """
    Collects approximately the first *head* line numbers from the file. Also
    appends to the returned list all additional lines in *extra_lines*.
    """
    lines = []
    lines_set = set()
    with openall(line_file) as inf:
        for new_lines in ([int(e) for e in l.strip().split()] for l in inf):
            new_lines = [line for line in new_lines if line not in lines_set]
            lines += new_lines
            lines_set.update(new_lines)
            if len(lines) >= head:
                break
    lines += [int(line) for line in extra_lines if line not in lines_set]
    return lines

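# A worked example (hypothetical file contents): if *line_file* contains the
# two lines "1 5 9" and "12 20", then
#
#     collect_lines_from_file(line_file, 4, 7, 20)
#
# first collects [1, 5, 9] (still below head=4), then the whole next line
# [12, 20] (hence "approximately"), and finally appends the extra line 7 but
# skips the duplicate 20, returning [1, 5, 9, 12, 20, 7].
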
def read_group_documents(group: Iterator[str]) -> Iterator[Document]:
    """Returns an iterator of the documents in a group."""
    last_file = None
    f = None
    try:
        for line in group:
            _, doc_file, doc_pos, doc_len = line.split('\t')
            if doc_file != last_file:
                if f:
                    f.close()
                f = openall(doc_file, 'rb')
                last_file = doc_file
            f.seek(int(doc_pos))
            yield from parse(
                f.read(int(doc_len)).decode('utf-8').split('\n'))
    finally:
        if f:
            f.close()

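# The group lines are assumed to follow the same tab-separated format as the
# index written by main_index_documents below, e.g. (hypothetical values):
#
#     'https://example.com/page\tcorpus/part_0001.tsv.gz\t147213\t5312'
#
# i.e. URL, corpus file, byte offset and byte length; the URL itself is
# ignored here.
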
def main_distribute(args):
    """The main function for distributing the index file."""
    weights = [weight for _, weight in args.hosts]
    hosts = [
        openall(host_to_path(args.index, host), 'wt')
        for host, _ in args.hosts
    ]
    lens = [0 for _ in weights]
    try:
        for _, group in group_index(read_index(args.index)):
            i = lens.index(min(lens))  # argmin
            logging.debug('Adding {} items to host {} ({}).'.format(
                len(group), i, hosts[i].name))
            for line in group:
                print(line, file=hosts[i])
            # Higher weight means "I need more documents"
            lens[i] += len(group) / weights[i]
    finally:
        for i, host in enumerate(hosts):
            logging.info('Wrote {} lines to {}.'.format(
                round(lens[i] * weights[i]), host.name))
            host.close()

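# A sketch of how the weighting above plays out (hypothetical numbers): with
# args.hosts == [('fast', 2), ('slow', 1)], a group of 10 lines written to
# `fast` only adds 5 to its effective length, while the same group written to
# `slow` adds 10. Because every group goes to the host with the smallest
# effective length, `fast` ends up with roughly twice as many lines as `slow`.
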
def deduplicate_batch_documents(batch_prefix, output_dir,
                                input_dir=None, ignore_missing_files=False):
    """
    Filters out documents not present in the batch and writes the filtered
    corpus files to output_dir. As above, input_dir can be specified if the
    location information in the batch files is outdated. Empty files will not
    be written.
    """
    batch_base = op.basename(batch_prefix)
    logging.info('Filtering batch {}...'.format(batch_base))

    kept, total = 0, 0
    num_files = 0
    for input_file, results in read_batch(batch_prefix):
        file_base = op.basename(input_file)
        url_set = set('_'.join(doc_id) for doc_id in results['id'])
        input_file = op.join(input_dir, file_base) if input_dir else input_file
        if os.path.isfile(input_file):
            with notempty(openall(op.join(output_dir, file_base),
                                  'wt')) as outf:
                for doc_no, doc in enumerate(parse_file(input_file), start=1):
                    if doc.attrs['url'] in url_set:
                        print(doc, file=outf)
                        kept += 1
                total += doc_no
            num_files += 1
        elif ignore_missing_files:
            logging.debug(
                'Input file {} was not found; ignoring...'.format(input_file))
        else:
            raise FileNotFoundError(
                'Input file {} not found.'.format(input_file))
    logging.info('Filtered batch {} of {} files; '
                 'kept {} documents out of {}.'.format(
                     batch_base, num_files, kept, total))
    return kept, total

def main_index_documents(args):
    """The main function for indexing documents."""
    input_files = [
        op.join(input_dir, f)
        for input_dir in args.input_dirs
        for f in os.listdir(input_dir)
    ]
    logging.info('Found a total of {} input files.'.format(len(input_files)))

    index = []
    with Pool(args.processes) as pool:
        f = partial(index_file)
        for input_file, urls_poss_lens in pool.imap(f, input_files):
            for doc_url, doc_pos, doc_len in urls_poss_lens:
                index.append((doc_url, input_file, doc_pos, doc_len))
        pool.close()
        pool.join()

    index.sort(key=index_key)
    with openall(args.index, 'wt') as outf:
        for domain, group in groupby(
            index, lambda record: urlsplit(record[0]).netloc
        ):
            urls_written = set()
            for doc_url, doc_file, doc_pos, doc_len in group:
                # This also filters http:// + https:// variants
                try:
                    pure_url = doc_url[doc_url.find('://') + 3:]
                    if pure_url not in urls_written:
                        urls_written.add(pure_url)
                        print(doc_url, doc_file, doc_pos, doc_len,
                              sep='\t', file=outf)
                        logging.debug('Printed URL {}.'.format(doc_url))
                    else:
                        logging.debug(
                            'Skipped duplicate URL {}.'.format(doc_url))
                except:
                    logging.exception('Error somewhere!!!')

def process_file(filename, input_dir, output_dir, languages, language_unit,
                 min_len_str, keep_urls=None, drop_urls=None):
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))
    stats = Counter()

    it = parse_file(input_file, True, True, True)
    it = each_doc(it, stats)
    if languages:
        if language_unit == 'doc':
            it = filter_languages_doc(it, languages, stats)
        else:
            it = filter_languages_p(it, languages, stats)
    if min_len_str:
        it = filter_length(it, min_len_str, stats)
    if drop_urls:
        # Get the right list: from the Manager or the local one
        url_list = drop_urls if drop_urls.__class__.__name__ == 'DictProxy' \
            else urls_to_drop  # noqa
        it = filter_urls(it, url_list, stats)
    if keep_urls:
        # Get the right list: from the Manager or the local one
        url_list = keep_urls if keep_urls.__class__.__name__ == 'DictProxy' \
            else urls_to_keep  # noqa
        it = retain_urls(it, url_list, stats)

    try:
        with notempty(openall(output_file, 'wt')) as outf:
            for doc in it:
                print(doc, file=outf)
    except:
        logging.exception('Got an error.')
    logging.info('Finished processing file {}...'.format(filename))
    return stats

def main():
    args = parse_arguments()
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    if args.urls:
        urls = urls_from_file(args.urls)
    else:
        urls = urls_from_log(args.log_file, args.warc)

    for record in warc.open(args.warc):
        url = record.header['WARC-Target-URI']
        if url in urls:
            page = record.payload.read().split(b'\r\n\r\n', maxsplit=1)[1]
            file_name = os.path.join(
                args.output_dir,
                url.rstrip(os.path.sep).replace(os.path.sep, '_'))
            if args.output_dir:
                with openall(file_name, 'wb') as outf:
                    outf.write(page)
            else:
                print(page)

def read_urls(urls_file: str, url_fn: UrlFn) -> UrlSet:
    """
    Reads URLs from the file ``urls_file``, one per line. The URLs are
    returned in a set, either as strings or as hash values, depending on what
    the transformation function ``url_fn`` does. Using hashes instead of the
    full URL can conserve memory. In our experiments, we have not encountered
    collisions yet.

    Note: no normalization of URLs for now, as the library that I tried was
    slooooooooow. This also means that versions of the same URL might stay in
    the index, including http / https versions. Hopefully, document
    deduplication will take care of this.
    """
    with openall(urls_file) as inf:
        urls = set()
        no_urls = 0
        for no_urls, url in enumerate(map(str.strip, inf), start=1):
            urls.add(url_fn(url))
            if no_urls % 5000000 == 0:
                logging.debug('Loaded {} urls from {}...'.format(
                    no_urls, urls_file))
        logging.info('Loaded {} urls from {}; {} unique.'.format(
            no_urls, urls_file, len(urls)))
        return urls

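# The ``url_fn`` argument decides what ends up in the set (a sketch; the file
# name is hypothetical):
#
#     as_strings = read_urls('skip_urls.gz', lambda url: url)
#     as_hashes = read_urls('skip_urls.gz', hash)
#
# Lookups must of course apply the same ``url_fn`` that was used to build the
# set.
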
def read_allowed_mimes(allowed_mimes_file: str) -> Set[str]:
    """Reads the allowed mimes list."""
    with openall(allowed_mimes_file) as inf:
        return set(line.strip() for line in inf)

def read_index(index_file: str) -> Iterator[str]:
    """Reads the index file. Not really necessary, but oh well."""
    with openall(index_file) as inf:
        yield from map(str.strip, inf)

def get_url(self, f):
    with openall(f) as inf:
        for line in inf:
            m = self.p.match(line)
            if m:
                return m.group(1)