Example #1
0
 def test_imagehash_time(self):
     """Time ``ImageHash`` subtraction from the reference ``imagehash`` package.

     Two hashes are built from the first 50 binary digits of
     ``self.timed_example[0]`` and ``self.timed_example[1]``; they are then
     subtracted ``iterations`` times and the mean duration of one subtraction
     is printed in microseconds.
     """
     iterations = 10000

     def first_bits(value):
         # np.binary_repr returns a string; list() yields one element per digit.
         return np.array(list(np.binary_repr(value)))[:50]

     timea = imagehash.ImageHash(first_bits(self.timed_example[0]))
     timeb = imagehash.ImageHash(first_bits(self.timed_example[1]))
     start = time.perf_counter()
     for _ in range(iterations):
         timea - timeb
     elapsed = time.perf_counter() - start
     # Seconds scaled by 10**6 are microseconds, so the label must say "us",
     # not "ms" as it previously did.
     print((elapsed / iterations) * 10**6, " us average open source imagehash hamming time")
Example #2
0
def cmp_hash(first, second, delta=5):
    """Tell whether two hashes differ by at most ``delta``.

    Either argument may be given as a hexadecimal string.  Such a string is
    parsed as an integer and expanded into a 256-element boolean array (most
    significant bit first) wrapped in an ``imagehash.ImageHash``.  Arguments
    that are not strings are used unchanged.

    Returns the value of ``first - second <= delta``.
    """

    def as_hash(value):
        # Only strings are converted; everything else passes straight through.
        if not isinstance(value, str):
            return value
        number = int(value, 16)
        bits = [bool((number >> shift) & 1) for shift in reversed(range(64 * 4))]
        return imagehash.ImageHash(np.array(bits))

    lhs = as_hash(first)
    rhs = as_hash(second)
    return lhs - rhs <= delta
Example #3
0
def average_hash(image: np.ndarray, hash_size=8, mean=np.mean):
    """
    Compute the average hash of an image.

    The algorithm follows http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html

    Step by step explanation: https://web.archive.org/web/20171112054354/https://www.safaribooksonline.com/blog/2013/11/26/image-hashing-with-python/

    @image array passed to cv2.cvtColor with cv2.COLOR_BGR2GRAY (so presumably a BGR colour image; confirm against callers).
    @hash_size side length of the down-scaled image; values below 2 raise ValueError.
    @mean how to determine the average luminescence. can try numpy.median instead.
    """
    if hash_size < 2:
        raise ValueError("Hash size must be greater than or equal to 2")

    # Drop the colour information first, then shrink to hash_size x hash_size.
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    pixels = np.asarray(cv2.resize(grayscale, (hash_size, hash_size)))

    # Every bit of the hash records whether a pixel is above the average value.
    threshold = mean(pixels)
    return imagehash.ImageHash(pixels > threshold)
Example #4
0
def hex_to_hash(hexstr):
    """Turn a 16-character hexadecimal string into an 8x8 ``ImageHash``.

    Each consecutive pair of hex digits is parsed as one byte and becomes one
    row of the matrix; within a row, column ``k`` is True when bit ``k`` of
    that byte (least significant bit first) is set.

    Raises ValueError when ``hexstr`` is not exactly 16 characters long, or
    when a pair of characters cannot be parsed as hexadecimal.
    """
    if len(hexstr) != 16:
        raise ValueError('The hex string has the wrong length')

    rows = []
    for offset in range(0, 16, 2):
        byte = int("0x" + hexstr[offset:offset + 2], 16)
        rows.append([(byte & (1 << bit)) > 0 for bit in range(8)])
    return imagehash.ImageHash(np.array(rows))
Example #5
0
def test_dupe_json_command(capsys) -> None:  # type: ignore
    """
    Verify that the dupe command calls ``Dupe.find`` with the parsed arguments.

    The command is run with the JSON format, so the captured stdout must parse as a JSON document
    that carries the algorithm name and exactly one dupe entry with the expected keys.
    """
    algorithm = Algorithm.DHASH
    find_kwargs = dict(
        search_paths=[Path("foo/bar"), Path("lol/123")],
        algorithm=algorithm,
        algo_params={"hash_size": 4},
    )
    namespace = Namespace(format=Format.JSON, **find_kwargs)

    with patch("imagesearch.cli.command.Dupe") as mock_dupe:
        # setup: find() yields a single dupe shared by two image paths
        bits = numpy.array([True, False, True, False])
        mock_dupe.find.return_value = iter([
            Dupe(
                image_hash=imagehash.ImageHash(bits),
                algorithm=algorithm,
                paths={Path("foo/bar/img.jpg"), Path("lol/123/img.jpg")},
            ),
        ])

        # run
        DupeCommand.run(namespace)

        # verify
        mock_dupe.find.assert_called_once_with(**find_kwargs)

    # verify json structure
    document = json.loads(capsys.readouterr().out)

    assert document["algorithm"] == algorithm.algo_name  # pylint: disable=no-member
    dupes = document["dupes"]
    assert len(dupes) == 1
    assert set(dupes[0].keys()) == {"image_hash", "paths"}
Example #6
0
def test_dupe_text_output(capsys) -> None:  # type: ignore
    """
    Verify that the dupe command produces text output.

    The text format is not meant to be machine-readable, so the layout itself is not checked:
    the test only requires every duplicate image path to appear somewhere in stdout.
    """
    algorithm = Algorithm.DHASH
    find_kwargs = dict(
        search_paths=[Path("foo/bar"), Path("lol/123")],
        algorithm=algorithm,
        algo_params={"hash_size": 4},
    )
    namespace = Namespace(format=Format.TEXT, **find_kwargs)

    dupe_image_paths = [Path("foo/bar/img.jpg"), Path("lol/123/img.jpg")]

    with patch("imagesearch.cli.command.Dupe") as mock_dupe:
        # setup: find() yields a single dupe covering both image paths
        bits = numpy.array([True, False, True, False])
        mock_dupe.find.return_value = iter([
            Dupe(
                image_hash=imagehash.ImageHash(bits),
                algorithm=algorithm,
                paths=set(dupe_image_paths),
            ),
        ])

        # run
        DupeCommand.run(namespace)

        # verify
        mock_dupe.find.assert_called_once_with(**find_kwargs)

    capout = capsys.readouterr().out

    for expected in map(str, dupe_image_paths):
        assert expected in capout
Example #7
0
    def phash_org(self, image, hash_size=8, highfreq_factor=4):
        """Compute a DCT-based perceptual hash of ``image``.

        The image is converted to mode "L", resized to a square whose side is
        ``hash_size * highfreq_factor``, and transformed with a DCT along both
        axes.  Each coefficient of a low-frequency block is then compared with
        the median of that block to produce the hash bits.

        image: object exposing PIL's ``convert`` / ``resize`` API.
        hash_size: side length of the low-frequency block; must be >= 2.
        highfreq_factor: multiplier applied to ``hash_size`` to get the side
            of the resized image.

        Returns an ``imagehash.ImageHash`` wrapping the boolean block.
        Raises ValueError when ``hash_size`` is smaller than 2.
        """
        if hash_size < 2:
            raise ValueError("Hash size must be greater than or equal to 2")

        import scipy.fftpack

        img_size = hash_size * highfreq_factor
        # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
        # resampling filter under its current name.
        image = image.convert("L").resize((img_size, img_size), Image.LANCZOS)
        pixels = numpy.asarray(image)
        dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
        # Use only the low-frequency DCT values, starting at index 1 on both
        # axes: this excludes the first term, since the DC coefficient can be
        # significantly different from the other values and would throw off
        # the average.
        dctlowfreq = dct[1:hash_size + 1, 1:hash_size + 1]
        med = numpy.median(dctlowfreq)
        diff = dctlowfreq > med
        return imagehash.ImageHash(diff)
Example #8
0
    def __init__(self, binary_array, dim=8):
        """Store the full hash together with hashes of cropped windows.

        binary_array: array indexable as ``arr[a:b, c:d]``, or an
            ``imagehash.ImageHash`` (in which case its ``.hash`` is used).
        dim: side length used to compute the window bounds (presumably the
            side of ``binary_array``; this is not checked here).

        Sets ``self.full`` (hash of the whole array), ``self.center`` (window
        from (1, 1) to (dim - 1, dim - 1)) and ``self.corners`` (nine windows
        of side ``dim - 2`` offset by 0, 1 or 2 along each axis).
        """
        if isinstance(binary_array, imagehash.ImageHash):
            binary_array = binary_array.hash

        self.full = imagehash.ImageHash(binary_array)

        upper = dim - 1
        self.center = self._imagehash_slice(binary_array, (1, 1), (upper, upper))

        # Row-major sweep of the 3x3 grid of offsets; the window side is dim - 2.
        side = dim - 2
        self.corners = [
            self._imagehash_slice(binary_array, (row, col), (side + row, side + col))
            for row in range(3)
            for col in range(3)
        ]
Example #9
0
 def _imagehash_slice(self, binary_array, start, end):
     """Wrap a rectangular window of ``binary_array`` in an ``ImageHash``.

     start: pair of inclusive lower bounds, one per axis.
     end: pair of exclusive upper bounds, one per axis.

     Returns ``imagehash.ImageHash`` of
     ``binary_array[start[0]:end[0], start[1]:end[1]]``.
     """
     (first_from, second_from), (first_to, second_to) = start, end
     window = binary_array[first_from:first_to, second_from:second_to]
     return imagehash.ImageHash(window)
Example #10
0
 def calc_rotation(self, image: im.ImageHash) -> im.ImageHash:
     """Return a new hash built from ``image.hash`` with its rows reversed and then transposed.

     For a 2-D bit matrix this is a quarter turn clockwise.
     """
     flipped_rows = image.hash[::-1]
     # zip(*rows) transposes: output row k gathers entry k of every input row.
     transposed = list(zip(*flipped_rows))
     return im.ImageHash(np.array(transposed))
Example #11
0
def whash_(image):
    """Return ``imagehash.whash(image)``, or an all-False 8x8 hash on AssertionError.

    The fallback bypasses the assertion that ``imagehash.whash`` trips on
    small images.
    """
    try:
        result = imagehash.whash(image)
    except AssertionError:
        result = imagehash.ImageHash(np.zeros((8, 8), dtype=bool))
    return result
Example #12
0
 def imghash(self):
     """Expand the hexadecimal string ``self.hash`` into an ``ImageHash``.

     The string is parsed as an integer and unpacked into a 256-element
     boolean array, most significant bit first.
     """
     number = int(self.hash, 16)
     bits = [bool((number >> shift) & 1) for shift in reversed(range(64 * 4))]
     return imagehash.ImageHash(np.array(bits))
Example #13
0
 def setUp(self):
     """Create three constant hashes of shape (3, 2, 2) filled with 0, 1 and 255."""
     shape = (3, 2, 2)
     self.black = imagehash.ImageHash(numpy.zeros(shape))
     self.gray = imagehash.ImageHash(numpy.ones(shape))
     self.white = imagehash.ImageHash(numpy.full(shape, 255))
Example #14
0
def go(algorithm):
    """
    Compute hashes (and store them), and then run clustering for a particular hash algorithm.

    Hashes are loaded from ``<OUTPUT>/<algorithm>/hashes.json`` when that file exists; otherwise
    they are computed in a process pool for every entry of ``IMAGEDIR``.  They are then written
    back to ``hashes.json``.  For each threshold in ``range(0, max_threshold, threshold_step)``
    a directory ``thr_<threshold>_unclustered_<count>`` is (re)created, holding one sub-directory
    per cluster plus one for the unclustered files, each filled with symlinks to the images.

    :param algorithm: the key for the algorithm to run (@see ALGORITHMS)
    """
    start_time = time.time()
    output = os.path.join(OUTPUT, algorithm)
    print('Running for', algorithm, ', writing to ', output)

    # The hash function itself is not needed here: the workers receive the algorithm key.
    _, max_threshold, threshold_step = ALGORITHMS[algorithm]

    hashes_json_path = os.path.join(output, 'hashes.json')
    if os.path.exists(hashes_json_path):
        print('Loading from ', hashes_json_path)
        with open(hashes_json_path) as inf:
            data = json.load(inf)
        hashes = {f: imagehash.ImageHash(np.array(h)) for f, h in data.items()}
    else:
        jobs = [(i, f, algorithm)
                for i, f in enumerate(sorted(os.listdir(IMAGEDIR)))]
        # os.cpu_count() may return None; always keep at least one worker.
        workers = max(1, (os.cpu_count() or 1) - 1)
        # The context manager shuts the pool down; imap_unordered is lazy, so the
        # results must be fully consumed before leaving the block.
        with multiprocessing.Pool(processes=workers) as pool:
            hashes = {f: imagehash.ImageHash(np.array(h))
                      for f, h in pool.imap_unordered(_hash, jobs)}

    files = set(hashes.keys())
    hashes_sorted = sorted(hashes.items(), key=lambda h: str(h[1]))

    print('Hashes computed ', time.time() - start_time)

    os.makedirs(output, exist_ok=True)
    hashes_to_write = {f: h.hash.tolist() for f, h in hashes_sorted}
    with open(hashes_json_path, 'w') as outf:
        json.dump(hashes_to_write, outf)

    print('Hashes persisted ', time.time() - start_time)

    diffs = []
    for i1, (f1, h1) in enumerate(hashes_sorted):
        if i1 % 100 == 0:
            print('diff', i1, ' time ', time.time() - start_time)
        # Compare only with later entries so every unordered pair is visited once.
        for f2, h2 in hashes_sorted[i1 + 1:]:
            diff = h1 - h2
            if diff < max_threshold:
                diffs.append((diff, f1, f2))
    # diffs = sorted(diffs, key=lambda d: d[0])
    print('Diffs computed ', time.time() - start_time)

    # with open(os.path.join(output, 'diffs.txt'), 'w') as outf:
    #     for d in diffs:
    #         print(d, file=outf)

    # print('Min diffs:\n', '\n'.join(str(d) for d in diffs[:20]))
    # print('Max diffs:\n', '\n'.join(str(d) for d in diffs[-20:]))
    # print('Avg diff: ', sum(d[0] for d in diffs) / len(diffs))

    for threshold in range(0, max_threshold, threshold_step):
        print('Copying for threshold {}'.format(threshold),
              time.time() - start_time)

        # Undirected graph: two files are neighbours when their distance is within the threshold.
        neighbors = {}
        for diff, f1, f2 in diffs:
            if diff <= threshold:
                neighbors.setdefault(f1, set()).add(f2)
                neighbors.setdefault(f2, set()).add(f1)

        # Largest clusters first.
        clusters = sorted(connected_components(neighbors),
                          key=len,
                          reverse=True)

        # print(clusters)
        in_clusters = set().union(*clusters)
        unclustered = files - in_clusters

        destdir = os.path.join(
            output, 'thr_{}_unclustered_{}'.format(
                str(threshold).zfill(3), len(unclustered)))
        shutil.rmtree(destdir, ignore_errors=True)
        os.makedirs(destdir)

        # The unclustered files go last, into their own directory.
        for cnt, cluster in enumerate(clusters + [unclustered]):
            if cnt == len(clusters):
                name = 'unclustered'
            else:
                name = str(cnt + 1).zfill(3)
            cdir = os.path.join(destdir, '{}_{}'.format(name, len(cluster)))
            os.makedirs(cdir)
            for f in cluster:
                fname, ext = os.path.splitext(f)
                # Embed the hash in the link name so it is visible when browsing.
                filename = '{}_{}{}'.format(fname, str(hashes[f]), ext)
                os.symlink(os.path.abspath(os.path.join(IMAGEDIR, f)),
                           os.path.join(cdir, filename))

    end_time = time.time()

    print('Time taken: ', round(end_time - start_time, 2), 'sec\n\n')