def other_main(args):
    """The "real" main function of the "other" mode."""
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    batch_prefixes = find_all_batches(args.input_dir)
    logging.info('Found a total of {} batches in {}.'.format(
        len(batch_prefixes), args.input_dir))
    batches_to_subtract = find_all_batches(args.cross_dir)
    logging.info(
        'Found a total of {} batches in {} to deduplicate against.'.format(
            len(batches_to_subtract), args.cross_dir))

    with ProcessPoolExecutor(max_workers=args.processes) as executor:
        f = partial(deduplicate_other,
                    batches_to_subtract=batches_to_subtract,
                    output_dir=args.output_dir,
                    threshold=args.threshold,
                    permutations=args.permutations)
        original_doc_num, final_doc_num = 0, 0
        for new_num, old_num in executor.map(f, batch_prefixes):
            original_doc_num += old_num
            final_doc_num += new_num

    logging.info('Cross deduplication done; in all, kept '
                 '{} documents out of {}.'.format(final_doc_num,
                                                  original_doc_num))

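# A minimal, self-contained sketch (not used by the pipeline) of the
# partial() + ProcessPoolExecutor.map() counting pattern other_main() relies
# on: each worker returns a (kept, total) tuple for one batch and the parent
# process sums them. The worker below is hypothetical and only stands in for
# deduplicate_other(); it reuses the functools / concurrent.futures imports
# at the top of the module.

def _count_kept_demo(batch, threshold):
    """Hypothetical stand-in worker; returns (kept, total) for one batch."""
    return len(batch) // 2, len(batch)


def _sum_counts_demo(batches, threshold=0.9, processes=2):
    f = partial(_count_kept_demo, threshold=threshold)
    kept, total = 0, 0
    with ProcessPoolExecutor(max_workers=processes) as executor:
        # map() yields results in the order of `batches`, one tuple per batch
        for batch_kept, batch_total in executor.map(f, batches):
            kept += batch_kept
            total += batch_total
    return kept, total
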
def self_main(args):
    """The "real" main function of the "self" mode."""
    working_dir = op.join(args.output_dir, 'self')
    if not os.path.isdir(working_dir):
        os.makedirs(working_dir)

    batch_prefixes = find_all_batches(args.input_dir)
    logging.info('Found a total of {} batches in {}.'.format(
        len(batch_prefixes), args.input_dir))

    # First, deduplicate documents _within_ the same batch
    original_doc_num, self_doc_num, final_doc_num = 0, 0, 0
    with Pool(args.processes) as pool:
        f = partial(deduplicate_self, output_dir=working_dir,
                    threshold=args.threshold,
                    permutations=args.permutations)
        for new_num, old_num in pool.map(f, batch_prefixes):
            original_doc_num += old_num
            self_doc_num += new_num
        pool.close()
        pool.join()
    logging.info('Self deduplication done; in all, kept '
                 '{} documents out of {}.'.format(self_doc_num,
                                                  original_doc_num))

    # Now, we need to do the deduplication between batches. The idea here is
    # to load one batch into memory, and delete all documents from it that
    # are also present in any of the other batches (more precisely, we only
    # need to do the upper triangle matrix: batch b_i is deduplicated with
    # batches b_j, where j > i).
    # At this point, we do all work in output_dir.
    # Yes, there is no need to send the last batch through this round, except
    # for counting final_doc_num.
    batch_prefixes = find_all_batches(working_dir)
    batches_to_subtract = [
        find_all_batches(working_dir, int(op.basename(file_prefix)))
        for file_prefix in batch_prefixes
    ]
    with ProcessPoolExecutor(max_workers=args.processes) as executor:
        f = partial(deduplicate_other, output_dir=args.output_dir,
                    threshold=args.threshold,
                    permutations=args.permutations)
        final_doc_num = sum(
            num for num, _ in executor.map(f, batch_prefixes,
                                           batches_to_subtract))
    logging.info('Full deduplication done; in all, kept '
                 '{} documents out of {}.'.format(final_doc_num,
                                                  original_doc_num))

    # Let's delete the intermediate directory.
    shutil.rmtree(working_dir)

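# A minimal sketch (illustrative only, assuming batches are simply named by
# their number) of the upper-triangle pairing built above: batch b_i is only
# matched against batches b_j with j > i, so every pair of batches is
# compared exactly once, and executor.map() zips the two lists together.

def _upper_triangle_pairs_demo(batch_names):
    """Yields (batch, batches_to_subtract) pairs; later batches only."""
    for i, batch in enumerate(batch_names):
        yield batch, batch_names[i + 1:]

# Example:
#   list(_upper_triangle_pairs_demo(['1', '2', '3']))
#   -> [('1', ['2', '3']), ('2', ['3']), ('3', [])]
# which mirrors calling deduplicate_other('1', ['2', '3']), etc. above; the
# last batch is only run through to count its documents.
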
def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')

    os.nice(20)
    if not op.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    input_batches = [
        batch_prefix for input_dir in args.input_dirs
        for batch_prefix in find_all_batches(input_dir)
    ]
    logging.info('Found a total of {} input batches.'.format(
        len(input_batches)))
    logging.info('Writing files to {}...'.format(args.output_dir))

    batch_size = args.batch_size if not args.keep_sizes else sys.maxsize
    with closing(BatchWriter(batch_size, args.output_dir, args.zeroes)) as bw:
        for input_batch in input_batches:
            if not args.keep_sizes:
                logging.info('Reading batch {}...'.format(input_batch))
                for input_file, results in read_batch(input_batch):
                    bw.write_results(input_file, results)
            else:
                logging.info('Copying batch {}...'.format(input_batch))
                bw.copy_file(input_batch)

    logging.info('Done; renumbered {} documents.'.format(bw.total_written))

def main():
    args = parse_arguments()

    logging.basicConfig(
        level=getattr(logging, args.log_level.upper()),
        format='%(asctime)s - %(process)s - %(levelname)s - %(message)s')

    os.nice(20)
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)

    batch_prefixes = find_all_batches(args.minhash_dir)
    logging.info('Found a total of {} batches.'.format(len(batch_prefixes)))

    with Pool(args.processes) as pool:
        f = partial(deduplicate_batch_documents,
                    output_dir=args.output_dir,
                    input_dir=args.input_dir,
                    ignore_missing_files=args.ignore_missing_files)
        kept, total = 0, 0
        for batch_kept, batch_total in pool.imap(f, batch_prefixes):
            kept += batch_kept
            total += batch_total
        pool.close()
        pool.join()

    logging.info('Done.')
    logging.info('Kept {} documents out of {} in total.'.format(kept, total))

def deduplicate_other_old(file_prefix, input_dir, output_dir,
                          threshold, permutations):
    """
    Removes all documents from a set of minhashed documents (3 files with
    the same minhash prefix) that occur in other batches in input_dir.

    Only batches whose number is higher than that of the batch in question
    are considered (i.e. the upper triangular matrix).

    Warning: only works for full documents at this point!
    """
    lsh = MinHashLSH(threshold=threshold, num_perm=permutations)
    file_base = op.basename(file_prefix)
    logging.info('Processing batch {}...'.format(file_base))

    # First, load the (already deduplicated) batch...
    for input_file, results in read_batch(file_prefix):
        for doc_id, minhash in zip(results['id'], results['minhash']):
            lsh.insert('\t'.join(doc_id), minhash)
    initial_len = len(lsh.keys)

    to_match_with = find_all_batches(
        input_dir, int(file_prefix.rpartition(os.sep)[-1]))

    # Now, remove all documents in it that are contained in other batches
    # to the "right" of it (with greater batch numbers)
    for batch in to_match_with:
        initial_batch_len = len(lsh.keys)
        for _, results in read_batch(batch):
            for minhash in results['minhash']:
                for duplicate in lsh.query(minhash):
                    lsh.remove(duplicate)
        logging.info(
            'Cross-deduplicated batch {} with batch {}: '
            '{} -> {} documents.'.format(file_base, op.basename(batch),
                                         initial_batch_len, len(lsh.keys)))

    # Finally, we print the documents left. Unfortunately, in order to
    # keep the format, we have to read the original batch again.
    with closing(BatchWriter(sys.maxsize, output_dir,
                             len(file_base), int(file_base))) as bw:
        # OK, we need to re-read the batch, unfortunately
        for input_file, results in read_batch(file_prefix):
            doc_ids, minhashes = [], []
            for doc_id, minhash in zip(results['id'], results['minhash']):
                if '\t'.join(doc_id) in lsh:
                    doc_ids.append(doc_id)
                    minhashes.append(minhash)
            bw.write_results(input_file,
                             {'id': doc_ids, 'minhash': minhashes})

    logging.info('Processed batch {}; kept {} out of {} documents.'.format(
        file_base, len(lsh.keys), initial_len))
    return len(lsh.keys), initial_len

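# A minimal, self-contained sketch of the datasketch primitives the function
# above relies on (insert / query / remove and key-based membership testing).
# The tokenisation and the 0.9 threshold are purely illustrative, not the
# pipeline's actual settings.

def _lsh_demo(permutations=256):
    from datasketch import MinHash, MinHashLSH

    def minhash_of(text):
        m = MinHash(num_perm=permutations)
        for token in text.split():
            m.update(token.encode('utf-8'))
        return m

    lsh = MinHashLSH(threshold=0.9, num_perm=permutations)
    lsh.insert('doc-1', minhash_of('the quick brown fox'))
    # query() returns the keys of all indexed documents whose estimated
    # Jaccard similarity with the argument exceeds the threshold.
    for duplicate in lsh.query(minhash_of('the quick brown fox')):
        lsh.remove(duplicate)     # removal is by key, as in the loop above
    assert 'doc-1' not in lsh     # membership is also checked by key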