Exemple #1
0
def dupgroups(dirs, hashfunc):
	"""Find groups of duplicate files under *dirs*.

	Files are first bucketed by size (cheap), singleton buckets are
	discarded, and the remaining buckets are sub-grouped by content
	hash computed with *hashfunc*.

	Args:
		dirs: Directories to scan; forwarded to ``iter_size_path``.
		hashfunc: Callable mapping a file path to its content hash.

	Returns:
		Dict mapping ``(size, digest)`` to a list of duplicate paths
		(every list has more than one entry).
	"""
	dups = defaultdict(list)

	logging.info("Collecting files")
	for size, path in progress(iter_size_path(dirs)):
		dups[size].append(path)
	logging.info("Found %s size groups", len(dups))

	logging.info("Filtering files based on size")
	# Only same-size files can be duplicates; drop singleton size groups
	# before paying for any hashing.
	dups = {k: v for k, v in dups.items() if len(v) > 1}
	logging.info("Found %s duplicate size groups", len(dups))

	logging.info("Calculating hash groups")
	for size, hashes in progress(iter_size_hashes(dups, hashfunc), length=len(dups)):
		dups[size] = hashes

	logging.info("Filtering files based on hashes")
	# Keep only hash groups that actually contain duplicates.
	# `digest` avoids shadowing the builtin `hash`.
	newdups = {
		(size, digest): hashgroup
		for size, sizegroup in dups.items()
		for digest, hashgroup in sizegroup.items()
		if len(hashgroup) > 1
	}

	return newdups
Exemple #2
0
def dupgroups_no_size(dirs, hashfunc):
	"""Find groups of duplicate files under *dirs* by hashing every file.

	Unlike ``dupgroups`` this skips the size pre-filter and hashes each
	file directly, so it is simpler but does more hashing work.

	Args:
		dirs: Directories to scan; forwarded to ``iter_size_path``.
		hashfunc: Callable mapping a file path to its content hash.
			May raise ``Skip`` to exclude a file silently.

	Returns:
		Dict mapping content hash to a list of ``(path, size)`` tuples
		(every list has more than one entry).
	"""
	dups = defaultdict(list)

	logging.info("Calculating hash groups")
	for size, path in progress(iter_size_path(dirs)):
		try:
			# `digest` avoids shadowing the builtin `hash`.
			digest = hashfunc(path)
		except PermissionError:
			# Unreadable files are logged and skipped, not fatal.
			logging.warning("Permission denied: %s", path)
		except FileNotFoundError:
			# File vanished between listing and hashing.
			logging.warning("File not found: %s", path)
		except Skip:
			# hashfunc deliberately excluded this file.
			pass
		else:
			dups[digest].append((path, size))
	logging.info("Found %s hash groups", len(dups))

	logging.info("Filtering files based on hash")
	dups = {k: v for k, v in dups.items() if len(v) > 1}
	logging.info("Found %s duplicate hash groups", len(dups))

	return dups
Exemple #3
0
 def test_progress(self):
     """progress() must yield the wrapped iterable's items unchanged."""
     values = range(1000)
     with open(devnull, "w") as sink:
         self.assertIterEqual(values, progress(values, file=sink))
Exemple #4
0
        help="Directory to store temporary benchmarking databases")
    parser.add_argument("--version", action="version", version=__version__)
    # Record counts to benchmark; each size produces one measurement.
    parser.add_argument(
        "--sizes",
        nargs="+",
        type=int,
        metavar="N",
        default=[10, 100, 10**3, 10**4, 10**5, 10**6],
        help="Number of records to read/write",
    )
    # Repeat the whole benchmark N times; merge_results below picks the best.
    parser.add_argument("--bestof",
                        type=int,
                        metavar="N",
                        default=3,
                        help="Run N benchmarks")
    parser.add_argument("--outfile",
                        default="benchmarks.md",
                        help="Benchmark results")
    args = parser.parse_args()

    # Write the merged results as a markdown report.
    with open(args.outfile, "wt", encoding="utf-8") as fw:
        results: List[ResultsDict] = []

        # NOTE(review): args.outpath is presumably the --outpath argument
        # defined earlier in this function (temporary database directory) —
        # confirm it exists; only --outfile is visible here.
        for _ in progress(range(args.bestof)):
            results.append(bench(args.outpath, args.sizes))

        # Collapse the repeated runs into the best observed numbers.
        best_results = merge_results(results)

        write_markdown_table(fw, best_results, "write")
        write_markdown_table(fw, best_results, "read")
Exemple #5
0
    # Restrict output to the given atom types; combined with --errors-only,
    # only errors whose last tag is not one of these types are logged.
    # (Help text fixed: "comination" typo and garbled grammar.)
    parser.add_argument(
        "--type",
        nargs="+",
        help="Limit output to the following types, or in combination with --errors-only, only log errors if the last tag doesn't have this type.",
    )
    # ASCII pattern to search for inside unparsed atom data.
    parser.add_argument("--search", type=bytes_from_ascii)
    # store_false: args.no_parse_atoms defaults to True; passing the flag
    # sets it to False, disabling atom parsing downstream.
    parser.add_argument("--no-parse-atoms", action="store_false")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    # Truthy when either --search or --type was given; used to decide
    # whether unparsed atom data must be retained.
    unparsed_data = args.search or args.type

    # Counters for the --errors-only summary.
    errors_count = 0
    total_count = 0
    for path in progress(scandir_ext(args.path, args.extensions, rec=args.recursive)):
        if args.errors_only:
            total_count += 1
            exc, res = list_except(enumerate_atoms(fspath(path), parse_atoms=args.no_parse_atoms))
            if exc:
                if args.type is None or (res and res[-1][2] not in args.type):
                    for depth, pos, type, size, _, _ in res:
                        print("--" * depth, pos, type, size, file=stderr)
                    logging.exception("Enumerating atoms of %s failed", path, exc_info=exc)
                    errors_count += 1
        else:
            print(path)
            for depth, pos, type, size, content, leaf in enumerate_atoms(
                fspath(path), parse_atoms=args.no_parse_atoms, unparsed_data=unparsed_data
            ):
                if args.type and type in args.type: