def process_file(filename, input_dir, output_dir):
    """
    Normalizes a single corpus file: fixes invalid (token) lines and adds the
    ``wsafter`` field if the header does not already contain it. Returns the
    per-file statistics as a Counter.
    """
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))
    stats = Counter()
    with notempty(openall(output_file, 'wt')) as outf:
        header, it = headtail(parse_file(input_file, True))
        num_fields = len(header)
        do_wsafter = 'wsafter' not in header
        if do_wsafter:
            header.insert(1, 'wsafter')
            logging.debug('Adding the wsafter field...')
        print('\t'.join(header), file=outf)
        for document in it:
            stats['documents'] += 1
            try:
                stats['token_errors'] += fix_invalid_lines(document, num_fields)
                if do_wsafter:
                    add_wsafter(document)
            except ValueError:
                logging.exception(f'Error in file {input_file}')
                raise
            print(document, file=outf)
    return stats
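
# --- Illustrative sketch (not part of the original module) -----------------
# A minimal driver for a per-file worker such as process_file() above: it
# fans the files in input_dir out to a process pool and merges the per-file
# Counters. The function name and the Pool-based fan-out are assumptions for
# illustration, not the project's actual entry point.
def run_normalization_sketch(input_dir, output_dir, processes=4):
    from functools import partial
    from multiprocessing import Pool

    files = sorted(os.listdir(input_dir))
    with Pool(processes) as pool:
        fn = partial(process_file, input_dir=input_dir, output_dir=output_dir)
        sub_stats = pool.map(fn, files)
    # Merge the per-file statistics into a single Counter
    return sum(sub_stats, Counter())
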
def consumer(output_files: List[str], queue: Queue, header: str,
             documents: int, num_readers: Value, lock: Lock) -> int:
    """
    Reads :class:`Document`s from the shared queue and writes them to one of
    the output files at random.

    :param output_files: list of output file names.
    :param queue: the queue shared with all processes.
    :param header: the header of the tsv files. Written to all output files.
    :param documents: the number of documents to write to an output file.
    :param num_readers: a shared variable holding the number of readers that
                        are still active. This function exits if two
                        conditions are met: the queue is empty and
                        *num_readers* is 0.
    :param lock: a lock that regulates access to *num_readers*.
    :returns: the number of documents written.
    """
    logging.info(f'Consumer started with {len(output_files)} files.')
    output_names = [os.path.basename(f) for f in output_files]
    outfs = [notempty(openall(f, 'wt')) for f in output_files]
    written = [0 for _ in outfs]
    docs_written = 0

    # Write the header
    for outf in outfs:
        print(header, file=outf)

    while outfs:
        i = random.randint(0, len(outfs) - 1)
        try:
            print(queue.get(timeout=5), file=outfs[i])
            written[i] += 1
            docs_written += 1
            if docs_written % 1000 == 0:
                logging.debug(
                    f'Consumer has written {docs_written} documents.')
            if written[i] == documents:
                logging.info(f'Written {documents} documents to '
                             f'{output_names[i]}; closing...')
                outfs[i].close()
                del outfs[i]
                del written[i]
                del output_names[i]
        except Empty:
            with lock:
                if num_readers.value == 0:
                    logging.info('Timeout waiting for queue; cleaning up...')
                    break
        except:
            logging.exception(f'Exception writing {output_names[i]}!')
            sys.exit(3)

    # Close any dangling output files
    for i in range(len(outfs)):
        logging.info(f'Written {written[i]} documents to '
                     f'{output_names[i]}; closing...')
        outfs[i].close()

    logging.info(f'Consumer finished; written {docs_written} documents.')
    return docs_written
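
# --- Illustrative sketch (not part of the original module) -----------------
# How consumer() might be wired up: reader processes push Document strings
# onto the shared queue and decrement num_readers when they are done, while a
# single consumer in the main process drains the queue into the output files.
# The reader body (and the parse_file() call in it) is an assumption for
# illustration; Queue, Value and Lock are assumed to be the multiprocessing
# objects already used in the type annotations above.
def reader_sketch(input_file, queue, num_readers, lock):
    for doc in parse_file(input_file, True):
        queue.put(str(doc))
    with lock:
        num_readers.value -= 1


def run_pipeline_sketch(input_files, output_files, header, docs_per_file):
    from multiprocessing import Process

    queue = Queue(maxsize=1000)
    num_readers = Value('i', len(input_files))
    lock = Lock()
    readers = [Process(target=reader_sketch,
                       args=(f, queue, num_readers, lock))
               for f in input_files]
    for p in readers:
        p.start()
    written = consumer(output_files, queue, header, docs_per_file,
                       num_readers, lock)
    for p in readers:
        p.join()
    return written
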
def process_file(filename, input_dir, output_dir, languages, language_unit,
                 min_len_str, keep_urls=None, drop_urls=None):
    """
    Filters a single corpus file: language filtering (per document or per
    paragraph), minimum length filtering and URL-based dropping / retaining,
    depending on which arguments are set. Returns the statistics Counter.
    """
    input_file = os.path.join(input_dir, filename)
    output_file = os.path.join(output_dir, filename)
    logging.info('Processing file {}...'.format(filename))
    stats = Counter()
    it = parse_file(input_file, True, True, True)
    it = each_doc(it, stats)
    if languages:
        if language_unit == 'doc':
            it = filter_languages_doc(it, languages, stats)
        else:
            it = filter_languages_p(it, languages, stats)
    if min_len_str:
        it = filter_length(it, min_len_str, stats)
    if drop_urls:
        # Get the right list: from the Manager or the local one
        url_list = drop_urls if drop_urls.__class__.__name__ == 'DictProxy' \
            else urls_to_drop  # noqa
        it = filter_urls(it, url_list, stats)
    if keep_urls:
        # Get the right list: from the Manager or the local one
        url_list = keep_urls if keep_urls.__class__.__name__ == 'DictProxy' \
            else urls_to_keep  # noqa
        it = retain_urls(it, url_list, stats)
    try:
        with notempty(openall(output_file, 'wt')) as outf:
            for doc in it:
                print(doc, file=outf)
    except:
        logging.exception('Got an error.')
    logging.info('Finished processing file {}...'.format(filename))
    return stats
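
# --- Illustrative sketch (not part of the original module) -----------------
# The filters above are chained lazily: each takes the document iterator and
# the shared stats Counter and yields only the documents it keeps. A minimal,
# hypothetical filter in that style (the title-based predicate is made up):
def keep_documents_with_title_sketch(it, stats):
    for doc in it:
        if doc.attrs.get('title'):
            stats['kept_with_title'] += 1
            yield doc
        else:
            stats['dropped_without_title'] += 1
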
def deduplicate_batch_documents(batch_prefix, output_dir,
                                input_dir=None, ignore_missing_files=False):
    """
    Filters documents not present in the batch and writes the filtered corpus
    files to output_dir. As above, input_dir can be specified if the location
    information in the batch files is outdated.

    Empty files will not be written.
    """
    batch_base = op.basename(batch_prefix)
    logging.info('Filtering batch {}...'.format(batch_base))

    kept, total = 0, 0
    num_files = 0
    for input_file, results in read_batch(batch_prefix):
        file_base = op.basename(input_file)
        url_set = set('_'.join(doc_id) for doc_id in results['id'])
        input_file = op.join(input_dir, file_base) if input_dir else input_file
        if os.path.isfile(input_file):
            with notempty(openall(op.join(output_dir, file_base), 'wt')) as outf:
                for doc_no, doc in enumerate(parse_file(input_file), start=1):
                    if doc.attrs['url'] in url_set:
                        print(doc, file=outf)
                        kept += 1
                total += doc_no
            num_files += 1
        elif ignore_missing_files:
            logging.debug(
                'Input file {} was not found; ignoring...'.format(input_file))
        else:
            raise FileNotFoundError(
                'Input file {} not found.'.format(input_file))

    logging.info('Filtered batch {} of {} files; '
                 'kept {} documents out of {}.'.format(batch_base, num_files,
                                                       kept, total))
    return kept, total
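
# --- Illustrative usage (not part of the original module) ------------------
# The batch prefix and the directory names below are made up; they only show
# the calling convention of deduplicate_batch_documents() above.
#
#     kept, total = deduplicate_batch_documents(
#         'batches/batch_0001', 'output_dedup',
#         input_dir='corpus_renamed', ignore_missing_files=True)
#     logging.info('Kept {} of {} documents.'.format(kept, total))
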
def filter_file(input_file, output_file, uniqs, url_fn: UrlFn) -> FilterStats:
    """
    Filters an index file; i.e. drops all duplicate URLs.

    :param input_file: the input index file.
    :param output_file: the output index file.
    :param uniqs: the shared dictionary of unique URLs.
    :param url_fn: the URL transformation function to apply to each URL. In
                   the scope of this program, this is either hashing or
                   nothing.
    """
    logging.info('Filtering file {}...'.format(input_file))
    stats = FilterStats(old_files=1)
    with openall(input_file, 'rt') as inf, \
            notempty(openall(output_file, 'wt')) as outf:
        line_no = lines_printed = 0
        for line_no, line in enumerate(map(str.strip, inf), start=1):
            try:
                url, warc, offset, length = line.split()[:7][-6:-2]
                record = IndexRecord(warc, offset, length)
                if record == uniqs.get(url_fn(url)):
                    print(line, file=outf)
                    lines_printed += 1
            except:
                logging.exception('Exception in file {}:{}'.format(
                    input_file, line_no))
        if line_no:
            logging.info('Kept {} URLs out of {} in {}.'.format(
                lines_printed, line_no, input_file))
            stats.old_urls = line_no
        else:
            logging.info('File {} was empty.'.format(input_file))
        if lines_printed:
            stats.new_files = 1
            stats.new_urls = lines_printed
    return stats
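
# --- Illustrative sketch (not part of the original module) -----------------
# Two url_fn callables in the spirit of the docstring above ("either hashing
# or nothing"): an identity function and a hash-based one. The choice of MD5
# is an assumption for illustration, not necessarily the hash the project
# uses as keys into uniqs.
def identity_url_fn_sketch(url: str) -> str:
    return url


def hashing_url_fn_sketch(url: str) -> str:
    import hashlib
    return hashlib.md5(url.encode('utf-8')).hexdigest()
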