def test_imagehash_time(self):
    # Build two hashes from the first 50 bits of the binary representations
    # of the timed example values.
    timea = imagehash.ImageHash(
        np.array(list(np.binary_repr(self.timed_example[0])))[:50])
    timeb = imagehash.ImageHash(
        np.array(list(np.binary_repr(self.timed_example[1])))[:50])
    start = time.perf_counter()
    for j in range(10000):
        timea - timeb
    # (elapsed / iterations) seconds * 10**6 gives microseconds per call.
    print(((time.perf_counter() - start) / 10000) * 10**6,
          " us average open source imagehash hamming time")
def cmp_hash(first, second, delta=5):
    # Accept either ImageHash objects or 64-hex-digit (256-bit) hash strings.
    if isinstance(first, str):
        _hash = int(first, 16)
        first = imagehash.ImageHash(
            np.array(
                [bool((_hash >> i) & 1) for i in range(64 * 4 - 1, -1, -1)]))
    if isinstance(second, str):
        _hash = int(second, 16)
        second = imagehash.ImageHash(
            np.array(
                [bool((_hash >> i) & 1) for i in range(64 * 4 - 1, -1, -1)]))
    # ImageHash.__sub__ returns the Hamming distance between the two hashes.
    return first - second <= delta
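# A minimal usage sketch for cmp_hash; the hex strings below are made up
# for illustration. Two hashes "match" when their Hamming distance is at
# most `delta` bits.
a = "f" * 64                        # 256-bit hash, all ones
b = "f" * 63 + "e"                  # differs from `a` in exactly one bit
assert cmp_hash(a, b)               # distance 1 <= default delta of 5
assert not cmp_hash(a, b, delta=0)  # exact match required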
def average_hash(image: np.ndarray, hash_size=8, mean=np.mean):
    """
    Average Hash computation

    Implementation follows
    http://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html

    Step by step explanation:
    https://web.archive.org/web/20171112054354/https://www.safaribooksonline.com/blog/2013/11/26/image-hashing-with-python/

    @image must be an OpenCV BGR image (numpy.ndarray).
    @mean how to determine the average luminance; numpy.median can be used instead.
    """
    if hash_size < 2:
        raise ValueError("Hash size must be greater than or equal to 2")

    # Reduce size and complexity, then convert to grayscale.
    # PIL equivalent:
    # image = image.convert("L").resize((hash_size, hash_size), Image.ANTIALIAS)
    image = cv2.resize(
        cv2.cvtColor(image, cv2.COLOR_BGR2GRAY), (hash_size, hash_size))

    # Find the average pixel value; 'pixels' is an array of the pixel
    # values, ranging from 0 (black) to 255 (white).
    pixels = np.asarray(image)
    avg = mean(pixels)

    # Create the bit matrix: True where a pixel is brighter than average.
    diff = pixels > avg

    # Make a hash.
    return imagehash.ImageHash(diff)
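# Usage sketch for the OpenCV-based average_hash above; "example.jpg" is a
# hypothetical path. cv2.imread returns a BGR ndarray, which is exactly
# what the function expects.
img = cv2.imread("example.jpg")
print(average_hash(img))                  # default 8x8, mean-based hash
print(average_hash(img, mean=np.median))  # median variant from the docstring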
def hex_to_hash(hexstr):
    """Convert a 16-hex-digit string back into an 8x8 ImageHash."""
    if len(hexstr) != 16:
        raise ValueError('The hex string has the wrong length')
    bits = []
    for i in range(8):
        h = hexstr[i * 2:i * 2 + 2]
        v = int("0x" + h, 16)
        # Expand each byte into its 8 bits, least significant bit first.
        bits.append([v & 2**j > 0 for j in range(8)])
    return imagehash.ImageHash(np.array(bits))
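# Usage sketch: rebuild an ImageHash from its 16-hex-digit string form (the
# hex value below is made up). Note that this decoder assumes the legacy
# (pre-4.0) imagehash bit order, least significant bit first within each
# byte; newer imagehash releases changed the encoding and ship their own
# imagehash.hex_to_hash.
restored = hex_to_hash("d1d1939b271f0f15")
print(restored.hash.shape)  # (8, 8)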
def test_dupe_json_command(capsys) -> None:  # type: ignore
    """
    Test that the dupe command is called correctly.

    The call is also made with the JSON format, so ensure the output is a
    JSON document with the right structure.
    """
    search_paths = [Path("foo/bar"), Path("lol/123")]
    algorithm = Algorithm.DHASH
    format_ = Format.JSON
    algo_params = {"hash_size": 4}
    namespace = Namespace(
        search_paths=search_paths,
        algorithm=algorithm,
        format=format_,
        algo_params=algo_params,
    )

    with patch("imagesearch.cli.command.Dupe") as mock_dupe:
        # setup
        mock_dupe.find.return_value = iter([
            Dupe(
                image_hash=imagehash.ImageHash(
                    numpy.array([True, False, True, False])),
                algorithm=algorithm,
                paths=set([
                    Path("foo/bar/img.jpg"),
                    Path("lol/123/img.jpg"),
                ]),
            ),
        ])

        # run
        DupeCommand.run(namespace)

        # verify
        mock_dupe.find.assert_called_once_with(
            search_paths=search_paths,
            algorithm=algorithm,
            algo_params=algo_params,
        )

        # verify JSON structure
        capjson = json.loads(capsys.readouterr().out)
        assert capjson["algorithm"] == algorithm.algo_name  # pylint: disable=no-member
        assert len(capjson["dupes"]) == 1
        json_dupe = capjson["dupes"][0]
        assert set(json_dupe.keys()) == set(["image_hash", "paths"])
def test_dupe_text_output(capsys) -> None:  # type: ignore
    """
    Tests that dupe text output is built.

    Since the text format is not designed to be machine-readable, this test
    doesn't test the format.
    """
    search_paths = [Path("foo/bar"), Path("lol/123")]
    algorithm = Algorithm.DHASH
    algo_params = {"hash_size": 4}
    format_ = Format.TEXT
    namespace = Namespace(
        search_paths=search_paths,
        algorithm=algorithm,
        format=format_,
        algo_params=algo_params,
    )
    dupe_image_paths = [
        Path("foo/bar/img.jpg"),
        Path("lol/123/img.jpg"),
    ]

    with patch("imagesearch.cli.command.Dupe") as mock_dupe:
        # setup
        mock_dupe.find.return_value = iter([
            Dupe(
                image_hash=imagehash.ImageHash(
                    numpy.array([True, False, True, False])),
                algorithm=algorithm,
                paths=set(dupe_image_paths),
            ),
        ])

        # run
        DupeCommand.run(namespace)

        # verify
        mock_dupe.find.assert_called_once_with(
            search_paths=search_paths,
            algorithm=algorithm,
            algo_params=algo_params,
        )

        capout = capsys.readouterr().out
        for path in dupe_image_paths:
            assert str(path) in capout
def phash_org(self, image, hash_size=8, highfreq_factor=4):
    if hash_size < 2:
        raise ValueError("Hash size must be greater than or equal to 2")

    import scipy.fftpack

    img_size = hash_size * highfreq_factor
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same filter.
    image = image.convert("L").resize((img_size, img_size), Image.LANCZOS)
    pixels = numpy.asarray(image)
    dct = scipy.fftpack.dct(scipy.fftpack.dct(pixels, axis=0), axis=1)
    # Use only the hash_size x hash_size low-frequency DCT values, excluding
    # the first row and column since the DC coefficient can be significantly
    # different from the other values and would throw off the average.
    dctlowfreq = dct[1:hash_size + 1, 1:hash_size + 1]
    med = numpy.median(dctlowfreq)
    diff = dctlowfreq > med
    return imagehash.ImageHash(diff)
def __init__(self, binary_array, dim=8):
    if isinstance(binary_array, imagehash.ImageHash):
        binary_array = binary_array.hash
    self.full = imagehash.ImageHash(binary_array)
    # Central (dim-2) x (dim-2) window of the bit matrix.
    self.center = self._imagehash_slice(binary_array, (1, 1), (dim - 1, dim - 1))
    # All nine (dim-2) x (dim-2) windows, i.e. the center window shifted by
    # at most one bit in each direction.
    corners = [
        [(0, 0), (dim - 2, dim - 2)],
        [(0, 1), (dim - 2, dim - 1)],
        [(0, 2), (dim - 2, dim)],
        [(1, 0), (dim - 1, dim - 2)],
        [(1, 1), (dim - 1, dim - 1)],
        [(1, 2), (dim - 1, dim)],
        [(2, 0), (dim, dim - 2)],
        [(2, 1), (dim, dim - 1)],
        [(2, 2), (dim, dim)],
    ]
    self.corners = [
        self._imagehash_slice(binary_array, *c) for c in corners
    ]
def _imagehash_slice(self, binary_array, start, end):
    startX, startY = start
    endX, endY = end
    res = binary_array[startX:endX, startY:endY]
    return imagehash.ImageHash(res)
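# Sketch of one way the slices above can be used for shift-tolerant
# matching: treat two hashes as similar if the center window of one is
# close to any one-bit-shifted window of the other. `FrameHash` is a
# hypothetical name for the class defining __init__ and _imagehash_slice
# above; every window has the same (dim-2) x (dim-2) shape, so the
# Hamming-distance subtraction is well defined.
def shifted_match(a: "FrameHash", b: "FrameHash", delta: int = 4) -> bool:
    return any(a.center - corner <= delta for corner in b.corners)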
def calc_rotation(self, image: im.ImageHash) -> im.ImageHash:
    # zip(*matrix[::-1]) rotates the bit matrix 90 degrees clockwise.
    return im.ImageHash(np.array(list(zip(*image.hash[::-1]))))
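# Sketch of rotation-tolerant matching built on calc_rotation: compare a
# candidate hash against all four 90-degree rotations of a reference hash
# and keep the smallest distance. `matcher` stands for an instance of the
# class defining calc_rotation; for average-style hashes, rotating the bit
# matrix approximates hashing the rotated image.
def min_rotated_distance(matcher, ref: im.ImageHash, cand: im.ImageHash) -> int:
    best = ref - cand
    rotated = ref
    for _ in range(3):
        rotated = matcher.calc_rotation(rotated)
        best = min(best, rotated - cand)
    return best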
def whash_(image):
    '''Bypass the internal size assertion in imagehash.whash for small images.'''
    try:
        return imagehash.whash(image)
    except AssertionError:
        # whash asserts the image is large enough; fall back to an all-zero
        # 8x8 hash when it is not.
        return imagehash.ImageHash(np.zeros((8, 8), dtype=bool))
def imghash(self):
    # Decode the stored 64-hex-digit (256-bit) hash string into an
    # ImageHash, most significant bit first.
    _hash = int(self.hash, 16)
    return imagehash.ImageHash(
        np.array(
            [bool((_hash >> i) & 1) for i in range(64 * 4 - 1, -1, -1)]))
def setUp(self):
    self.black = imagehash.ImageHash(numpy.zeros((3, 2, 2)))
    self.gray = imagehash.ImageHash(numpy.ones((3, 2, 2)))
    self.white = imagehash.ImageHash(numpy.full((3, 2, 2), 255))
def go(algorithm):
    """
    Compute (and store) hashes, then run clustering for a particular hash
    algorithm.

    :param algorithm: the key for the algorithm to run (@see ALGORITHMS)
    """
    start_time = time.time()
    output = os.path.join(OUTPUT, algorithm)
    print('Running for', algorithm, ', writing to ', output)
    hashfn, max_threshold, threshold_step = ALGORITHMS[algorithm]

    # Load cached hashes if present, otherwise compute them in parallel.
    hashes = {}
    hashes_json_path = os.path.join(output, 'hashes.json')
    if os.path.exists(hashes_json_path):
        print('Loading from ', hashes_json_path)
        with open(hashes_json_path) as inf:
            data = json.load(inf)
        for f, h in data.items():
            hashes[f] = imagehash.ImageHash(np.array(h))
    else:
        pool = multiprocessing.Pool(processes=max(1, os.cpu_count() - 1))
        jobs = [(i, f, algorithm)
                for i, f in enumerate(sorted(os.listdir(IMAGEDIR)))]
        result = pool.imap_unordered(_hash, jobs)
        hashes = {f: imagehash.ImageHash(np.array(h)) for f, h in result}

    files = set(hashes.keys())
    hashes_sorted = sorted(hashes.items(), key=lambda h: str(h[1]))
    print('Hashes computed ', time.time() - start_time)

    # Persist the hashes as JSON so later runs can skip recomputation.
    os.makedirs(output, exist_ok=True)
    hashes_to_write = {}
    for f, h in hashes_sorted:
        hashes_to_write[f] = h.hash.tolist()
    with open(hashes_json_path, 'w') as outf:
        json.dump(hashes_to_write, outf)
    print('Hashes persisted ', time.time() - start_time)

    # Compute pairwise Hamming distances, keeping only pairs below the
    # maximum threshold.
    diffs = []
    for i1, h1 in enumerate(hashes_sorted):
        if i1 % 100 == 0:
            print('diff', i1, ' time ', time.time() - start_time)
        for i2, h2 in enumerate(hashes_sorted):
            if i1 < i2:
                diff = h1[1] - h2[1]
                if diff < max_threshold:
                    diffs.append((diff, h1[0], h2[0]))
    # diffs = sorted(diffs, key=lambda d: d[0])
    print('Diffs computed ', time.time() - start_time)
    # with open(os.path.join(output, 'diffs.txt'), 'w') as outf:
    #     for d in diffs:
    #         print(d, file=outf)
    # print('Min diffs:\n', '\n'.join(str(d) for d in diffs[:20]))
    # print('Max diffs:\n', '\n'.join(str(d) for d in diffs[-20:]))
    # print('Avg diff: ', sum(d[0] for d in diffs) / len(diffs))

    # For each threshold, build a neighbor graph, cluster it into connected
    # components, and symlink the images of each cluster into its own
    # directory.
    for threshold in range(0, max_threshold, threshold_step):
        print('Copying for threshold {}'.format(threshold),
              time.time() - start_time)
        neighbors = {}
        for d in diffs:
            if d[0] <= threshold:
                neighbors.setdefault(d[1], set()).add(d[2])
                neighbors.setdefault(d[2], set()).add(d[1])
        clusters = list(
            sorted(connected_components(neighbors),
                   key=lambda c: len(c),
                   reverse=True))
        # print(clusters)
        in_clusters = set().union(*clusters)
        unclustered = files - in_clusters
        destdir = os.path.join(
            output,
            'thr_{}_unclustered_{}'.format(
                str(threshold).zfill(3), len(unclustered)))
        shutil.rmtree(destdir, ignore_errors=True)
        os.makedirs(destdir)
        for cnt, cluster in enumerate(clusters + [unclustered]):
            if cnt == len(clusters):
                name = 'unclustered'
            else:
                name = str(cnt + 1).zfill(3)
            cdir = os.path.join(destdir, '{}_{}'.format(name, len(cluster)))
            os.makedirs(cdir)
            for f in cluster:
                fname, ext = os.path.splitext(f)
                filename = '{}_{}{}'.format(fname, str(hashes[f]), ext)
                os.symlink(os.path.abspath(os.path.join(IMAGEDIR, f)),
                           os.path.join(cdir, filename))

    end_time = time.time()
    print('Time taken: ', round(end_time - start_time, 2), 'sec\n\n')
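# `go` relies on a connected_components helper that is not shown in this
# section. A minimal BFS sketch over the neighbors dict it builds
# (filename -> set of filenames within the distance threshold):
def connected_components(neighbors):
    seen = set()
    for node in neighbors:
        if node in seen:
            continue
        component, queue = set(), [node]
        while queue:
            cur = queue.pop()
            if cur in component:
                continue
            component.add(cur)
            queue.extend(neighbors.get(cur, ()))
        seen |= component
        yield component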