def main():
    """Compute image signatures for the files in the database.

    Parses the command line, runs ``ImageIndexer`` for the requested
    signature types and afterwards asks an ``ImageComperator`` per
    signature type to make sure the differences are calculated for the
    requested minimum similarity.

    Command line options (in addition to the common ones):
      -T/--threads         thread configuration, passed to
                           ``eval_thread_config``.
      -s/--min-similarity  minimum image similarity (float, default 0.8).
      -S/--signatures      comma separated signature numbers or ``all``.

    Exits through ``parser.error`` when the signature list cannot be
    parsed.
    """
    parser = argparse.ArgumentParser(
        description='Generate image signatures for files in database.')
    add_common_command_line_arguments(parser)
    parser.add_argument('-T', '--threads', metavar='THREADS', nargs=1,
                        default=[DEFAULT_THREADS],
                        help=("Specify the number of threads to use. 'C' is " +
                              "substitued for the number of cores. " +
                              "Default is %r. Examples: '1', '10' or '1.5C'.")
                        % DEFAULT_THREADS)
    # type=float lets argparse reject non-numeric values with a usage error.
    parser.add_argument('-s', '--min-similarity', default=0.8, type=float,
                        help='require at least this image similarity')
    # nargs=1 stores a one-element list, so the default has to be a list as
    # well; with a plain string, [0] below would only pick its first char.
    parser.add_argument('-S', '--signatures', nargs=1, default=["3"],
                        help=("Which signatures should be computed. " +
                              "Default is %r. Examples: '1,2', '3' or 'all'.")
                        % "3")
    args = parser.parse_args()

    # Validate the signature list before touching the database so that bad
    # input fails early and with a proper usage message.
    signature_spec = args.signatures[0]
    if signature_spec == "all":
        signatures = [1, 2, 3, 4, 5]
    else:
        try:
            signatures = [int(x) for x in signature_spec.split(",")]
        except ValueError:
            parser.error("invalid signature list: %r" % signature_spec)

    repo = dfr.db.Database(args.db[0])
    threads = eval_thread_config(args.threads[0])

    indexer = ImageIndexer(repo, signatures, parallel_threads=threads)
    indexer.run()

    sim = args.min_similarity
    for iht in signatures:
        comperator = ImageComperator(repo, iht)
        comperator.ensure_that_differences_are_calculated(sim)
def main():
    """Index the directories given on the command line recursively.

    The positional ``roots`` default to the current directory.  The
    ``-x``/``-X`` exclude options are passed through ``cleanup`` and
    ``globs_to_regexp`` before they are handed to ``BitIndexer``, which
    is then run over the roots.
    """
    cli = argparse.ArgumentParser(description='Index directories recursive.')
    cli.add_argument(
        'roots', metavar='DIR', nargs='*', default=["."],
        help="a directory to index (if not given '.' will be used)")
    add_common_command_line_arguments(cli)

    # Both exclude options have the same shape and differ only in their
    # flags, the noun used in the help text and the default pattern.
    exclude_options = (
        ('-x', '--exclude-files', "files", DEFAULT_FILE_EXCLUDE),
        ('-X', '--exclude-dirs', "directories", DEFAULT_DIR_EXCLUDE),
    )
    for short_flag, long_flag, noun, default_globs in exclude_options:
        cli.add_argument(
            short_flag, long_flag, metavar='GLOBPATTERNS', nargs=1,
            action="append", default=[default_globs],
            help="Exclude %s based on comma separated glob patterns. "
                 "Default is %r." % (noun, default_globs))

    options = cli.parse_args()
    database = dfr.db.Database(options.db[0])

    file_globs = cleanup(options.exclude_files, DEFAULT_FILE_EXCLUDE)
    dir_globs = cleanup(options.exclude_dirs, DEFAULT_DIR_EXCLUDE)
    file_patterns = globs_to_regexp(file_globs)
    dir_patterns = globs_to_regexp(dir_globs)

    BitIndexer(database, file_patterns, dir_patterns).run(options.roots)
def main():
    """Report statistics about the different image signatures.

    Reads the stored image feedback from the database, prints how many
    pairs are classified and how many of them as similar, prints a table
    header and then, for every signature type, collects the pairs found by
    ``ImageSimilarFinder`` and passes them to ``report`` together with the
    feedback.

    NOTE(review): ``--min-similarity`` is accepted but never read here; the
    similarity threshold per signature is hard-coded in the loop below.
    """
    parser = argparse.ArgumentParser(
        description='Reports statistics about the different image signatures.')
    parser.add_argument('roots', metavar='DIR', nargs='*', default=["."],
                        help="a directory to scan for duplicate files " +
                             "(if not given '.' will be used)")
    add_common_command_line_arguments(parser)
    parser.add_argument('-s', '--min-similarity', default=0.9,
                        help='require at least this image similarity')
    args = parser.parse_args()

    repo = dfr.db.Database(args.db[0])
    known = repo.imagefeedback.find()
    positive = [x for x in known if x.aresimilar == 1]

    # Without any feedback there is nothing to divide by; report 0.0%
    # instead of dying with a ZeroDivisionError.
    total = len(known)
    if total:
        positive_percent = (100.0 * len(positive)) / total
    else:
        positive_percent = 0.0

    # print((...) % (...)) formats first and prints second, which behaves the
    # same as a Python 2 print statement and as the Python 3 print function.
    print(("There are %d classified image pairs. %d (%.1f%%) " +
           "are classifized as similar.") %
          (total, len(positive), positive_percent))
    print(("%10s | %12s | %10s | %12s | %10s | %10s | %10s " +
           "| %10s | %10s | %10s") %
          ("Signature", "Description", "Detected", "Classified",
           "TP", "FP", "FN", "Precision", "Recall", "F-Measure"))

    # (signature type, similarity threshold used for that signature)
    for sig, sim in [(1, 0.95), (2, 0.999), (3, 0.9), (4, 0.95), (5, 0.8)]:
        finder = ImageSimilarFinder(repo, args.roots, sig, 0)
        pairs = list(finder.find(sim))
        report(sig, pairs, known)
def test_add_common_command_line_arguments(self):
    """Check the default database path and both ``--db-file`` spellings."""
    cli = argparse.ArgumentParser()
    add_common_command_line_arguments(cli)

    # Without any option the database path must be absolute and must end
    # with the expected file name.
    default_path = cli.parse_args([]).db[0]
    self.assertTrue(default_path.startswith("/"))
    self.assertTrue(default_path.endswith("files.sdb"))

    # The option value may be given as a separate word or joined with '='.
    for argv in (["--db-file", "foo"], ["--db-file=foo"]):
        self.assertEqual(cli.parse_args(argv).db[0], "foo")
def main():
    """Find files with equal or similar content and resolve them.

    Depending on ``--what`` a finder for bit-equal files, truncated files
    or similar images is created; depending on ``--output-type`` a matching
    resolver is created.  Every item yielded by the finder is passed to
    ``resolver.resolve`` and ``resolver.finished`` is called at the end.

    Command line options (in addition to the common ones):
      roots                 directories to scan (default: ``.``).
      -t/--output-type      ``interactive``, ``csv`` or ``json``.
      -o/--output           output file name, ``-`` for stdout.
      -w/--what             ``bitequal``, ``truncated`` or ``image``.
      -s/--min-similarity   minimum image similarity (float).
      -S/--image-signature  image signature number (int).
      -n/--dry-run          passed to the interactive/GUI resolvers.

    Invalid values for ``--what``, ``--output-type`` and the numeric
    options are rejected by argparse with a usage error.
    """
    parser = argparse.ArgumentParser(
        description='Find files with equal or similar content.')
    parser.add_argument('roots', metavar='DIR', nargs='*', default=["."],
                        help="a directory to scan for duplicate files " +
                             "(if not given '.' will be used)")
    add_common_command_line_arguments(parser)
    # choices= makes a mistyped value an error instead of silently falling
    # through to the interactive resolver.
    parser.add_argument('-t', '--output-type', default="interactive",
                        choices=("interactive", "csv", "json"),
                        help='determine the output type. Valid values are ' +
                             '"interactive", "csv" and "json". ' +
                             'Default is "interactive".')
    parser.add_argument('-o', '--output', default="-",
                        help='The output file name. "-" stands for stdout. ' +
                             'Default is "-".')
    # Same reasoning: a typo must not silently select the bit-equal search.
    parser.add_argument('-w', '--what', default="bitequal",
                        choices=("bitequal", "truncated", "image"),
                        help='determine what is searched. Valid values are ' +
                             '"bitequal" for files which are equal ' +
                             'bit-wise, "truncated" for files which are ' +
                             'truncated (the larger files consists of the ' +
                             'smaller file and some extra content ' +
                             'afterwards) and "image" to search for similar ' +
                             'images. Default is "bitequal".')
    parser.add_argument('-s', '--min-similarity', default=0.9, type=float,
                        help='require at least this image similarity. ' +
                             'Default is "0.9".')
    parser.add_argument('-S', '--image-signature', default=3, type=int,
                        help='Image signature to use. Valid is 1, 2, 3, 4 ' +
                             'and 5. Default is "3".')
    parser.add_argument('-n', '--dry-run', action="store_true",
                        dest='dry_run', help='do not delete any files')
    args = parser.parse_args()

    repo = dfr.db.Database(args.db[0])

    if args.what == "image":
        # JSON output works on the bucket finder; build only the finder
        # that is actually used.
        if args.output_type == "json":
            resolver = JsonImageSimilarResolver(args.output)
            finder = ImageSimilarBucketFinder(repo, args.roots,
                                              args.image_signature)
        else:
            if args.output_type == "csv":
                resolver = CsvImageSimilarResolver(args.output)
            else:
                resolver = GuiImageSimilarResolver(args.dry_run)
            finder = ImageSimilarFinder(repo, args.roots,
                                        args.image_signature)
        found_items = finder.find(args.min_similarity)
    elif args.what == "truncated":
        # NOTE(review): there is no JSON branch for truncated files, so
        # "json" ends up with the interactive resolver - confirm intended.
        if args.output_type == "csv":
            resolver = CsvBitTruncatedResolver(args.output)
        else:
            resolver = InteractiveBitTruncatedResolver(args.dry_run)
        finder = BitTruncatedFinder(repo, args.roots)
        found_items = finder.find()
    else:
        if args.output_type == "json":
            resolver = JsonBitEqualResolver(args.output)
            finder = BitEqualBucketFinder(repo, args.roots)
        else:
            if args.output_type == "csv":
                resolver = CsvBitEqualResolver(args.output)
            else:
                resolver = InteractiveBitEqualResolver(args.dry_run)
            finder = BitEqualFinder(repo, args.roots)
        found_items = finder.find()

    for item in found_items:
        resolver.resolve(item)
    resolver.finished()