def dupgroups(dirs, hashfunc):
    dups = defaultdict(list)

    logging.info("Collecting files")
    for size, path in progress(iter_size_path(dirs)):
        dups[size].append(path)
    logging.info("Found %s size groups", len(dups))

    logging.info("Filtering files based on size")
    dups = {k: v for k, v in dups.items() if len(v) > 1}
    logging.info("Found %s duplicate size groups", len(dups))

    logging.info("Calculating hash groups")
    for size, hashes in progress(iter_size_hashes(dups, hashfunc), length=len(dups)):
        dups[size] = hashes

    logging.info("Filtering files based on hashes")
    newdups = {}
    for size, sizegroup in dups.items():
        for hash, hashgroup in sizegroup.items():
            if len(hashgroup) > 1:
                newdups[(size, hash)] = hashgroup
    return newdups
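# A minimal usage sketch. It assumes `hashfunc` takes a file path and
# returns a hashable digest, matching how `dupgroups` hands it to
# `iter_size_hashes`; `sha256_file` is a hypothetical helper, not part
# of the module.
import hashlib

def sha256_file(path, blocksize=2**20):
    # Stream the file in 1 MiB blocks so large files need not fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as fr:
        for block in iter(lambda: fr.read(blocksize), b""):
            h.update(block)
    return h.digest()

# for (size, digest), paths in dupgroups(["/some/dir"], sha256_file).items():
#     print(size, digest.hex(), paths)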
def dupgroups_no_size(dirs, hashfunc):
    dups = defaultdict(list)

    logging.info("Calculating hash groups")
    for size, path in progress(iter_size_path(dirs)):
        try:
            hash = hashfunc(path)
        except PermissionError:
            logging.warning("Permission denied: %s", path)
        except FileNotFoundError:
            logging.warning("File not found: %s", path)
        except Skip:
            pass
        else:
            dups[hash].append((path, size))
    logging.info("Found %s hash groups", len(dups))

    logging.info("Filtering files based on hash")
    dups = {k: v for k, v in dups.items() if len(v) > 1}
    logging.info("Found %s duplicate hash groups", len(dups))
    return dups
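# `dupgroups_no_size` lets `hashfunc` raise `Skip` to exclude a file from
# grouping. A sketch of a hash function using that hook; `Skip` is assumed
# to be a plain module-level exception and is redefined here only to keep
# the sketch self-contained, and `head_hash` is hypothetical.
import hashlib

class Skip(Exception):
    """Raised by a hash function to exclude a file from grouping."""

def head_hash(path):
    # Hash only the first KiB as a cheap pre-filter, and skip empty files.
    with open(path, "rb") as fr:
        head = fr.read(1024)
    if not head:
        raise Skip(path)
    return hashlib.sha256(head).digest()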
def test_progress(self):
    r = range(1000)
    with open(devnull, "w") as fw:
        self.assertIterEqual(r, progress(r, file=fw))
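# The test relies on an `assertIterEqual` helper that is not shown here.
# A plausible sketch, assuming it compares two iterables element-wise
# without materializing either one; the mixin name is hypothetical.
import unittest
from itertools import zip_longest

class IterAssertsMixin(unittest.TestCase):
    def assertIterEqual(self, it_a, it_b):
        # Compare lazily; the fill-value sentinel catches length mismatches,
        # since a sentinel never compares equal to a real element.
        sentinel = object()
        for a, b in zip_longest(it_a, it_b, fillvalue=sentinel):
            self.assertEqual(a, b)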
help="Directory to store temporary benchmarking databases") parser.add_argument("--version", action="version", version=__version__) parser.add_argument( "--sizes", nargs="+", type=int, metavar="N", default=[10, 100, 10**3, 10**4, 10**5, 10**6], help="Number of records to read/write", ) parser.add_argument("--bestof", type=int, metavar="N", default=3, help="Run N benchmarks") parser.add_argument("--outfile", default="benchmarks.md", help="Benchmark results") args = parser.parse_args() with open(args.outfile, "wt", encoding="utf-8") as fw: results: List[ResultsDict] = [] for _ in progress(range(args.bestof)): results.append(bench(args.outpath, args.sizes)) best_results = merge_results(results) write_markdown_table(fw, best_results, "write") write_markdown_table(fw, best_results, "read")
parser.add_argument(
    "--type",
    nargs="+",
    help="Limit output to the given atom types, or, in combination with --errors-only, only log errors if the last atom does not have one of these types.",
)
parser.add_argument("--search", type=bytes_from_ascii)
parser.add_argument("--no-parse-atoms", action="store_false")
args = parser.parse_args()

logging.basicConfig(level=logging.INFO)

unparsed_data = args.search or args.type
errors_count = 0
total_count = 0
for path in progress(scandir_ext(args.path, args.extensions, rec=args.recursive)):
    if args.errors_only:
        total_count += 1
        exc, res = list_except(enumerate_atoms(fspath(path), parse_atoms=args.no_parse_atoms))
        if exc:
            if args.type is None or (res and res[-1][2] not in args.type):
                for depth, pos, type, size, _, _ in res:
                    print("--" * depth, pos, type, size, file=stderr)
                logging.exception("Enumerating atoms of %s failed", path, exc_info=exc)
                errors_count += 1
    else:
        print(path)
        for depth, pos, type, size, content, leaf in enumerate_atoms(
            fspath(path), parse_atoms=args.no_parse_atoms, unparsed_data=unparsed_data
        ):
            if args.type and type in args.type:
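# `list_except` is used above to capture both the partial result and the
# exception that interrupted iteration. A minimal sketch of that idea;
# the real helper may differ in details such as which exceptions it catches.
def list_except(it):
    # Drain an iterator into a list; on failure, return the exception
    # together with whatever items were produced before it was raised.
    out = []
    try:
        for item in it:
            out.append(item)
    except Exception as exc:
        return exc, out
    return None, out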