import os
import shutil
import tempfile

import pytest
from imgaug import augmenters as iaa

from perception import benchmarking, hashers, testing, tools


def test_deduplicate():
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, 'image1.jpg')
    shutil.copy(original, duplicate)
    pairs = tools.deduplicate(
        files=[
            testing.DEFAULT_TEST_IMAGES[0],
            testing.DEFAULT_TEST_IMAGES[1],
            duplicate
        ],
        hashers=[(hashers.PHash(hash_size=16), 0.25)])
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    # The pair may come back in either order.
    assert (((file1 == duplicate) and (file2 == original))
            or ((file1 == original) and (file2 == duplicate)))
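
# For intuition: conceptually, tools.deduplicate hashes every file and keeps
# the pairs whose hash distance is at or below the threshold paired with each
# hasher. A minimal sketch of that idea (not the library's implementation;
# `naive_deduplicate` is a hypothetical helper):
def naive_deduplicate(files, hasher, threshold):
    import itertools
    hashes = {f: hasher.compute(f) for f in files}
    return [
        (f1, f2) for f1, f2 in itertools.combinations(files, 2)
        if hasher.compute_distance(hashes[f1], hashes[f2]) <= threshold
    ]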
def test_benchmark_transforms():
    # `dataset` and `files` are assumed to be module-level fixtures defined
    # earlier in this file.
    transformed = dataset.transform(
        transforms={
            'blur0.05': iaa.GaussianBlur(0.05),
            'noop': iaa.Resize(size=(256, 256))
        },
        storage_dir='/tmp/transforms')
    assert len(transformed._df) == len(files) * 2
    hashes = transformed.compute_hashes(hashers={'pdna': hashers.PHash()})
    tr = hashes.compute_threshold_recall().reset_index()
    # Invalidate the cached metrics and blank out one hash to verify that
    # recomputing warns about invalid / empty hashes.
    hashes._metrics = None
    hashes._df.at[0, 'hash'] = None
    with pytest.warns(UserWarning, match='invalid / empty hashes'):
        hashes.compute_threshold_recall()
    assert (tr[tr['transform_name'] == 'noop']['recall'] == 100.0).all()
    # This is a charting function but we execute it just to make sure
    # it runs without error.
    hashes.show_histograms()
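
# The threshold/recall table is typically used to pick an operating distance
# threshold per transform. A hedged sketch, assuming the reset table also
# carries a 'threshold' column alongside 'transform_name' and 'recall' (an
# assumption about compute_threshold_recall's output, not confirmed here):
def example_pick_threshold(tr, transform_name='noop'):
    rows = tr[tr['transform_name'] == transform_name]
    # Largest threshold that still achieves perfect recall for this transform.
    return rows[rows['recall'] == 100.0]['threshold'].max()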
def test_synchronized_hashing():
    video_hashers = {
        'phashframewise': hashers.FramewiseHasher(
            frame_hasher=hashers.PHash(hash_size=16),
            frames_per_second=1,
            interframe_threshold=0.2),
        'tmkl2': hashers.TMKL2(frames_per_second=15),
        'tmkl1': hashers.TMKL1(frames_per_second=15)
    }

    for filepath in [
            'perception/testing/videos/v1.m4v',
            'perception/testing/videos/v2.m4v'
    ]:
        # Ensure synchronized hashing yields the same results as
        # computing each hash separately.
        hashes1 = {
            hasher_name: hasher.compute(filepath)
            for hasher_name, hasher in video_hashers.items()
        }
        hashes2 = hashers.tools.compute_synchronized_video_hashes(
            filepath=filepath, hashers=video_hashers)
        assert hashes1 == hashes2
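
# Hedged usage sketch: the synchronized call returns a dict keyed by hasher
# name (the test above relies on this when comparing it to `hashes1`); the
# intent, per the "synchronized" name, appears to be decoding the file once
# for all hashers rather than once per hasher. Paths reuse the test fixtures.
def example_synchronized_usage():
    video_hashers = {'tmkl1': hashers.TMKL1(frames_per_second=15)}
    hashes = hashers.tools.compute_synchronized_video_hashes(
        filepath='perception/testing/videos/v1.m4v', hashers=video_hashers)
    return hashes['tmkl1']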
def test_video_hashing_common():
    testing.test_video_hasher_integrity(
        hasher=hashers.FramewiseHasher(
            frame_hasher=hashers.PHash(hash_size=16),
            interframe_threshold=0.1,
            frames_per_second=1))
def test_video_benchmark_dataset():
    video_dataset = benchmarking.BenchmarkVideoDataset.from_tuples(
        files=[('perception/testing/videos/v1.m4v', 'category1'),
               ('perception/testing/videos/v2.m4v', 'category1'),
               ('perception/testing/videos/v1.m4v', 'category2'),
               ('perception/testing/videos/v2.m4v', 'category2')])
    transforms = {
        'noop': benchmarking.video_transforms.get_simple_transform(
            width=128, sar='1/1'),
        'gif': benchmarking.video_transforms.get_simple_transform(
            codec='gif', output_ext='.gif'),
        'clip1s': benchmarking.video_transforms.get_simple_transform(
            clip_s=(1, None)),
        'blackpad': benchmarking.video_transforms.get_black_frame_padding_transform(
            duration_s=1),
        'slideshow': benchmarking.video_transforms.get_slideshow_transform(
            frame_input_rate=1, frame_output_rate=1),
    }
    transformed = video_dataset.transform(
        storage_dir=tempfile.TemporaryDirectory().name, transforms=transforms)
    assert len(transformed._df) == len(transforms) * len(video_dataset._df)
    assert transformed._df['filepath'].isnull().sum() == 0

    # We will compute hashes for each of the transformed
    # videos and check the results for correctness.
    phash_framewise_hasher = hashers.FramewiseHasher(
        frame_hasher=hashers.PHash(),
        interframe_threshold=-1,
        frames_per_second=2)
    hashes = transformed.compute_hashes(
        hashers={'phashframewise': phash_framewise_hasher})
    guid = hashes._df.guid.iloc[0]
    df = hashes._df[hashes._df['guid'] == guid]
    clip1s = df[df.transform_name == 'clip1s']
    noop = df[df.transform_name == 'noop']
    blackpad = df[df.transform_name == 'blackpad']
    slideshow = df[df.transform_name == 'slideshow']

    # We should have dropped two hashes from the beginning of the
    # clipped video (1 s clipped at 2 frames per second).
    assert len(clip1s) == len(noop) - 2

    # The first hash from the clipped video should be the
    # same as the third hash from the original.
    assert clip1s.hash.iloc[0] == noop.hash.iloc[2]

    # The black padding adds four hashes (two on either side).
    assert len(blackpad) == len(noop) + 4

    # A black frame should yield all zeros for PHash.
    assert phash_framewise_hasher.string_to_vector(
        blackpad.iloc[0].hash).sum() == 0

    # The slideshow hashes should be the same as the noop
    # hashes for every other hash.
    assert (noop.hash.values[::2] == slideshow.hash.values[::2]).all()

    # Every second hash in the slideshow should be the same as the
    # previous one.
    for n in range(0, 10, 2):
        assert slideshow.hash.values[n] == slideshow.hash.values[n + 1]
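
# Hedged sketch: the black-frame assertion above can be reproduced directly on
# a synthetic frame. This assumes PHash.compute accepts RGB numpy arrays and
# supports hash_format='vector' (assumptions about the API, not taken from
# this file):
def example_black_frame_hashes_to_zero():
    import numpy as np
    black = np.zeros((256, 256, 3), dtype=np.uint8)
    vector = hashers.PHash().compute(black, hash_format='vector')
    # A constant image has no non-DC DCT energy, so the perceptual hash
    # should be the all-zero vector.
    assert vector.sum() == 0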
def is_img_suspicious(img, filepaths):
    # Flag `img` as suspicious if it duplicates any file in the known set.
    # The original version referenced an undefined `filepaths` and only
    # printed the pairs; here the known files are passed in explicitly and
    # the result is returned as a boolean.
    duplicate_pairs = tools.deduplicate(
        files=[img] + list(filepaths),
        hashers=[(hashers.PHash(hash_size=16), 0.2)])
    return any(img in pair for pair in duplicate_pairs)
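
# Example usage, with hypothetical paths:
# known_files = ['known1.jpg', 'known2.jpg']
# print(is_img_suspicious('query.jpg', known_files))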