-
Notifications
You must be signed in to change notification settings - Fork 0
/
findSim.py
executable file
·107 lines (90 loc) · 3.36 KB
/
findSim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from PIL import Image
import imagehash
import distance
import urllib2 as urllib
import io
import json
import traceback
import sys
"""
Demo of image hashing: find images in a local directory that are near-duplicates of an input image.
"""
# Process-wide memo of image hashes, filled in by find_similar_images so that
# images in the searched directory are not re-hashed on every call.
cache = {}
def find_similar_images_by_url(inputUrl):
    """Look up images similar to the image served at inputUrl.

    Searches the "localS3Images" directory using the difference hash
    (imagehash.dhash) and returns the result of find_similar_images
    unchanged.
    """
    return find_similar_images(
        userpath="localS3Images",
        hashfunc=imagehash.dhash,
        inputUrl=inputUrl,
    )
def find_similar_images_by_file_path(path):
    """Look up images similar to the image stored at the local file path.

    Searches the "localS3Images" directory using the difference hash
    (imagehash.dhash). The URL argument is passed as an empty string so that
    the file path is the input source. Returns the result of
    find_similar_images unchanged.
    """
    return find_similar_images(
        userpath="localS3Images",
        hashfunc=imagehash.dhash,
        inputUrl="",
        inputFilePath=path,
    )
def _is_image(filename):
    """Return True if filename ends with a supported image extension."""
    return filename.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif"))


def _load_detail_page_mapping(mappingPath):
    """Read a file of "imageUrl,detailPageUrl" lines into a dict.

    Both fields are URL-unquoted. Lines that do not contain at least two
    comma-separated fields (for example a trailing blank line) are skipped.
    Raises IOError if the file cannot be opened.
    """
    mapping = {}
    with open(mappingPath) as mappingFile:
        for line in mappingFile:
            fields = line.rstrip("\r\n").split(",")
            if len(fields) < 2:
                continue
            # Keep the unquoted byte strings as they are: round-tripping them
            # through unicode and back with str() fails on non-ASCII URLs.
            mapping[urllib.unquote(fields[0])] = urllib.unquote(fields[1])
    return mapping


def find_similar_images(
    userpath = "localS3Images",
    hashfunc = imagehash.dhash,
    inputUrl = "https://s3.amazonaws.com/treblalee.images/watches7.jpg",
    inputFilePath = "",
):
    """Find images in userpath whose hash is close to the input image's hash.

    Parameters:
        userpath: directory holding the images to compare against.
        hashfunc: callable taking a PIL image and returning a hash whose
            str() form is compared (e.g. imagehash.dhash).
        inputUrl: URL of the input image; used when inputFilePath is empty.
        inputFilePath: local path of the input image; takes precedence over
            inputUrl when non-empty.

    Returns:
        A JSON string of the form
        {"input": <path or URL>, "output": [{"imageUrl": ..., "detailPageUrl": ...}, ...]}.
        "detailPageUrl" is null for an image with no entry in
        imageUrlToDetailPageMapping.txt. If the input image cannot be fetched,
        opened or hashed, the traceback is printed to stdout and "{}" is
        returned.

    Raises:
        IOError: if imageUrlToDetailPageMapping.txt cannot be opened.
        OSError: if userpath cannot be listed.
    """
    import os
    global cache

    s3BaseUrl = 'https://s3.amazonaws.com/treblalee.images/'
    # A match differs from the input in 1..5 positions of the hash string;
    # distance 0 (identical hash) is deliberately excluded.
    maxDistance = 6

    # get image url to detail page mapping
    imageToDetailPageMapping = _load_detail_page_mapping("imageUrlToDetailPageMapping.txt")

    # compute hash of input image
    try:
        if inputFilePath:
            image_file = inputFilePath
            inputAsString = inputFilePath
        else:
            fd = urllib.urlopen(inputUrl)
            try:
                image_file = io.BytesIO(fd.read())
            finally:
                fd.close()
            inputAsString = inputUrl
        inputHash = str(hashfunc(Image.open(image_file)))
    except Exception:
        # Best effort: any failure to load or hash the input yields an empty
        # result, but KeyboardInterrupt/SystemExit are no longer swallowed.
        traceback.print_exc(file=sys.stdout)
        return json.dumps({}, sort_keys=True, indent=4, separators=(',', ': '))

    # compute hashes of all images in DB (currently just a directory)
    image_filenames = [
        os.path.join(userpath, name)
        for name in os.listdir(userpath)
        if _is_image(name)
    ]
    simList = []
    for img in sorted(image_filenames):
        # Key on the hash function as well as the path so a call with a
        # different hashfunc never reuses a hash computed by another one.
        cacheKey = (hashfunc, img)
        if cacheKey not in cache:
            cache[cacheKey] = str(hashfunc(Image.open(img)))
        imgHash = cache[cacheKey]

        # NOTE(review): the hashes are compared as strings, so this presumably
        # counts differing characters rather than differing bits - confirm
        # against the `distance` package.
        dist = distance.hamming(inputHash, imgHash)
        # Debug trace of every comparison, written to stdout.
        print(inputHash + " " + imgHash + " " + str(dist))

        if 0 < dist < maxDistance:
            # Build the public URL from the file name alone so it is correct
            # for any userpath, not only "localS3Images".
            imageUrl = s3BaseUrl + os.path.basename(img)
            simList.append({
                "imageUrl": imageUrl,
                "detailPageUrl": imageToDetailPageMapping.get(imageUrl),
            })

    result = {"input": inputAsString, "output": simList}
    return json.dumps(result, sort_keys=True, indent=4, separators=(',', ': '))
if __name__ == '__main__':
    # Command-line entry point: <script> [ahash|phash|dhash] [<directory>]
    import sys
    import os

    def usage():
        """Write the command-line synopsis to stderr and exit with status 1."""
        synopsis = """SYNOPSIS: %s [ahash|phash|dhash] [<directory>]
Identifies similar images in the directory.
Method:
ahash: Average hash
phash: Perceptual hash
dhash: Difference hash
(C) Johannes Buchner, 2013
"""
        sys.stderr.write(synopsis % sys.argv[0])
        sys.exit(1)

    # The hash method is mandatory; usage() does not return.
    if len(sys.argv) <= 1:
        usage()
    hashmethod = sys.argv[1]

    # Command-line method name -> attribute of the imagehash module.
    hashAttrByMethod = {
        'ahash': 'average_hash',
        'phash': 'phash',
        'dhash': 'dhash',
    }
    if hashmethod not in hashAttrByMethod:
        usage()
    hashfunc = getattr(imagehash, hashAttrByMethod[hashmethod])

    # The directory is optional and defaults to the current directory.
    userpath = "."
    if len(sys.argv) > 2:
        userpath = sys.argv[2]

    find_similar_images(userpath=userpath, hashfunc=hashfunc)