Example #1
import os
import shutil
import tempfile

from perception import hashers, testing, tools

def test_deduplicate():
    directory = tempfile.TemporaryDirectory()
    original = testing.DEFAULT_TEST_IMAGES[0]
    duplicate = os.path.join(directory.name, 'image1.jpg')
    shutil.copy(original, duplicate)
    # Only the copied file should match; the second test image is distinct.
    pairs = tools.deduplicate(
        files=[original, testing.DEFAULT_TEST_IMAGES[1], duplicate],
        hashers=[(hashers.PHash(hash_size=16), 0.25)])
    assert len(pairs) == 1
    file1, file2 = pairs[0]
    # The pair may be returned in either order.
    assert {file1, file2} == {original, duplicate}
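For context, deduplicate pairs up any files whose hash distance falls below the threshold supplied alongside each hasher (0.25 above). A minimal sketch of the same comparison done by hand, assuming perception's documented compute and compute_distance methods (the image paths are placeholders):

from perception import hashers

hasher = hashers.PHash(hash_size=16)
hash1 = hasher.compute('image1.jpg')  # hypothetical file paths
hash2 = hasher.compute('image2.jpg')
# compute_distance returns a normalized distance; values below the
# threshold are treated as duplicates.
if hasher.compute_distance(hash1, hash2) < 0.25:
    print('near-duplicate pair')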
Example #2
import imgaug.augmenters as iaa
import pytest

from perception import hashers

# Assumes `dataset` (a BenchmarkImageDataset) and `files` (its source
# image paths) are defined elsewhere in the test module.
def test_benchmark_transforms():
    transformed = dataset.transform(
        transforms={
            'blur0.05': iaa.GaussianBlur(0.05),
            'noop': iaa.Resize(size=(256, 256))
        },
        storage_dir='/tmp/transforms')

    assert len(transformed._df) == len(files) * 2

    hashes = transformed.compute_hashes(hashers={'pdna': hashers.PHash()})
    tr = hashes.compute_threshold_recall().reset_index()

    # Clearing the cached metrics and blanking out a hash should trigger
    # a warning about invalid / empty hashes.
    hashes._metrics = None
    hashes._df.at[0, 'hash'] = None
    with pytest.warns(UserWarning, match='invalid / empty hashes'):
        hashes.compute_threshold_recall()

    # The no-op transform should be recalled perfectly at every threshold.
    assert (tr[tr['transform_name'] == 'noop']['recall'] == 100.0).all()

    # This is a charting function but we execute it just to make sure
    # it runs without error.
    hashes.show_histograms()
Example #3
from perception import hashers

def test_synchronized_hashing():
    video_hashers = {
        'phashframewise': hashers.FramewiseHasher(
            frame_hasher=hashers.PHash(hash_size=16),
            frames_per_second=1,
            interframe_threshold=0.2),
        'tmkl2': hashers.TMKL2(frames_per_second=15),
        'tmkl1': hashers.TMKL1(frames_per_second=15)
    }

    for filepath in [
            'perception/testing/videos/v1.m4v',
            'perception/testing/videos/v2.m4v'
    ]:
        # Hashing each video with each hasher separately should give the
        # same result as computing all hashes in one synchronized pass.
        hashes1 = {
            hasher_name: hasher.compute(filepath)
            for hasher_name, hasher in video_hashers.items()
        }
        hashes2 = hashers.tools.compute_synchronized_video_hashes(
            filepath=filepath, hashers=video_hashers)
        assert hashes1 == hashes2
Example #4
from perception import hashers, testing

def test_video_hashing_common():
    # Run the library's built-in integrity checks for a framewise hasher.
    testing.test_video_hasher_integrity(
        hasher=hashers.FramewiseHasher(
            frame_hasher=hashers.PHash(hash_size=16),
            interframe_threshold=0.1,
            frames_per_second=1))
Example #5
import tempfile

from perception import benchmarking, hashers

def test_video_benchmark_dataset():
    video_dataset = benchmarking.BenchmarkVideoDataset.from_tuples(files=[
        ('perception/testing/videos/v1.m4v', 'category1'),
        ('perception/testing/videos/v2.m4v', 'category1'),
        ('perception/testing/videos/v1.m4v', 'category2'),
        ('perception/testing/videos/v2.m4v', 'category2'),
    ])
    transforms = {
        'noop': benchmarking.video_transforms.get_simple_transform(
            width=128, sar='1/1'),
        'gif': benchmarking.video_transforms.get_simple_transform(
            codec='gif', output_ext='.gif'),
        'clip1s': benchmarking.video_transforms.get_simple_transform(
            clip_s=(1, None)),
        'blackpad': benchmarking.video_transforms.get_black_frame_padding_transform(
            duration_s=1),
        'slideshow': benchmarking.video_transforms.get_slideshow_transform(
            frame_input_rate=1, frame_output_rate=1),
    }
    transformed = video_dataset.transform(
        storage_dir=tempfile.TemporaryDirectory().name, transforms=transforms)
    assert len(transformed._df) == len(transforms) * len(video_dataset._df)
    assert transformed._df['filepath'].isnull().sum() == 0

    # We will compute hashes for each of the transformed
    # videos and check the results for correctness.
    phash_framewise_hasher = hashers.FramewiseHasher(
        frame_hasher=hashers.PHash(),
        interframe_threshold=-1,
        frames_per_second=2)
    hashes = transformed.compute_hashes(
        hashers={'phashframewise': phash_framewise_hasher})

    guid = hashes._df.guid.iloc[0]
    df = hashes._df[hashes._df['guid'] == guid]
    clip1s = df[(df.transform_name == 'clip1s')]
    noop = df[(df.transform_name == 'noop')]
    blackpad = df[(df.transform_name == 'blackpad')]
    slideshow = df[(df.transform_name == 'slideshow')]

    # We should have dropped two hashes from the beginning
    # on the clipped video.
    assert len(clip1s) == len(noop) - 2

    # The first hash from the clipped video should be the
    # same as the third hash from the original.
    assert clip1s.hash.iloc[0] == noop.hash.iloc[2]

    # The black padding adds four hashes (two on either side).
    assert len(blackpad) == len(noop) + 4

    # A black frame should yield all zeros for PHash.
    assert phash_framewise_hasher.string_to_vector(
        blackpad.iloc[0].hash).sum() == 0

    # The slideshow hashes should be the same as the noop
    # hashes for every other hash.
    assert (noop.hash.values[::2] == slideshow.hash.values[::2]).all()

    # Every second hash in the slideshow should be the same as the
    # previous one.
    for n in range(0, 10, 2):
        assert slideshow.hash.values[n] == slideshow.hash.values[n + 1]
Example #6
from perception import hashers, tools

def is_img_suspicious(img, filepaths):
    # Treat `img` as suspicious if it is a near-duplicate of any of the
    # known files, using a 16-bit PHash with a distance threshold of 0.2.
    duplicate_pairs = tools.deduplicate(
        files=[img] + list(filepaths),
        hashers=[(hashers.PHash(hash_size=16), 0.2)])
    print(duplicate_pairs)
    return any(img in pair for pair in duplicate_pairs)
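A hypothetical invocation, with placeholder paths for the incoming image and the set of previously seen files:

known_files = ['photos/a.jpg', 'photos/b.jpg']
if is_img_suspicious('incoming/upload.jpg', known_files):
    print('possible duplicate detected')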