def _compare(config, delay_output, pair):
    """
    Compare one pair of files and decide whether the difference is reportable.

    :param config: settings object; ``compute_distance`` and ``min_dist``
        are read here, and the object is forwarded to ``output_change``
    :param delay_output: when false, the change is printed immediately
        instead of being left for the caller to print later
    :param pair: two-element iterable holding the file objects to compare
    :return: ``None`` if the files are equal or their distance is below
        ``config.min_dist``, otherwise the tuple ``(file1, file2, distance)``
        where ``distance`` is ``None`` if distances are not computed
    """
    first, second = pair
    Logger.progress("Comparing {} and {}...".format(
        first.relative_path, second.relative_path))

    # Identical files are never reported
    if FileComparator.are_equal(first, second):
        return None

    # The distance is optional; None means "not computed"
    distance = (FilesetComparator.compute_distance(first, second)
                if config.compute_distance else None)

    # Drop pairs that are too similar to be worth reporting
    too_similar = distance is not None and distance < config.min_dist
    if too_similar:
        return None

    change = (first, second, distance)
    # Print right away when allowed, so the user doesn't have to wait for
    # every comparison to finish
    if not delay_output:
        output_change(change, config)
    return change
def compare_files(file_set1, file_set2, data_folder_1, data_folder_2, config):
    """
    Compare two file sets and print the changed, added and removed files.

    The pairs returned by the comparator are compared in a process pool of
    ``config.jobs`` workers. When ``config.sort_order`` is "distance" or
    "path" (case-insensitive), printing of changed files is postponed until
    all pairs have been compared so the results can be sorted first;
    otherwise each change is printed by ``_compare`` as soon as it is found.

    :param file_set1: first file set, handed to ``FilesetComparator``
    :param file_set2: second file set, handed to ``FilesetComparator``
    :param data_folder_1: first directory, only used in the printed header
    :param data_folder_2: second directory, only used in the printed header
    :param config: settings object; ``sort_order`` and ``jobs`` are read
        here, and the object is forwarded to the comparator, ``_compare``
        and ``output_change``
    :return: None; all results are written through ``Logger``
    """
    # Print info about the compared files
    Logger.output("Directory1: {}\nDirectory2: {}".format(
        data_folder_1, data_folder_2))
    # Make sure this is flushed before multiprocessing starts, otherwise
    # it may be written multiple times
    # Passing "flush=True" is not enough if the output target is a file
    Logger.flush_output()

    # Use the file sets passed in by the caller (the previous code referred
    # to undefined names here)
    comparator = FilesetComparator(file_set1, file_set2, config)
    pairs = comparator.get_files_to_compare()

    # When sorting, every value has to be computed before starting printing
    sort_order = config.sort_order.lower()
    delay_output = sort_order in ("distance", "path")

    # Build a partial func by passing config and delay output, so the result
    # can be used by pool.map
    func = partial(_compare, config, delay_output)
    # Use lock so lines aren't mixed up in output
    lock = multiprocessing.Lock()
    with multiprocessing.Pool(config.jobs, initializer=_init_process,
                              initargs=(lock, )) as pool:
        edits = pool.map(func, pairs)
    # _compare returns None for pairs that are equal or too similar
    edits = [edit for edit in edits if edit is not None]

    # If necessary, sort and then output the result
    Logger.progress("Generating output...")
    if sort_order == "distance":
        # NOTE(review): distances are None when config.compute_distance is
        # false, and None values cannot be ordered - confirm that this
        # combination is rejected before reaching this point
        edits.sort(key=operator.itemgetter(2), reverse=True)
    elif sort_order == "path":
        # Sorts on the first file object of each edit; presumably these
        # objects order by path - TODO confirm
        edits.sort(key=operator.itemgetter(0), reverse=True)
    if delay_output:
        for edit in edits:
            output_change(edit, config)

    # Print info about the added and removed files
    added_count = 0
    for added in comparator.added_files:
        added_count += 1
        mime = "{} ({})".format(added.type["full"], added.type["mime"])
        Logger.output("\nAdded: {}\nMime: {}".format(added.path, mime))

    removed_count = 0
    for removed in comparator.removed_files:
        removed_count += 1
        mime = "{} ({})".format(removed.type["full"], removed.type["mime"])
        Logger.output("\nRemoved: {}\nMime: {}".format(removed.path, mime))

    # Print overall statistics
    Logger.info(
        "\nFound {} added files, {} removed files "
        "and {} changed files ({} files in total)"
        .format(added_count, removed_count, len(edits),
                added_count + removed_count + len(edits)))
def make_comparator(config, path1, path2):
    """
    Build a ``FilesetComparator`` for the files found under two paths.

    :param config: settings object, forwarded to both ``get_files`` and
        ``FilesetComparator``
    :param path1: first location handed to ``get_files``
    :param path2: second location handed to ``get_files``
    :return: a ``FilesetComparator`` built from the two file collections
        returned by ``get_files``
    """
    # get_files must yield exactly two file collections, one per path
    file_sets = get_files(config, path1, path2)
    first_set, second_set = file_sets
    return FilesetComparator(first_set, second_set, config)