Beispiel #1
0
class Hasher:
    """Turn keyed rows into LSH hash records.

    Each row is signed with ``self.signer`` and the signature is passed to
    ``self.hasher``; one ``(lsh_value, (key, signature_or_None))`` record is
    produced per value returned by the hasher.
    """

    def __init__(self, **options):
        """Build the signer and hasher from keyword options.

        Options read (with defaults): ``numHashes`` (100),
        ``numItemsInBand`` (10) and ``saveMinhashes`` (False).
        """
        num_hashes = options.get("numHashes", 100)
        num_items_in_band = options.get("numItemsInBand", 10)
        self.signer = MinHashSignature(num_hashes)
        self.hasher = LSH(num_hashes, num_items_in_band, None)
        self.save_min_hash = options.get("saveMinhashes", False)
        # Single-argument print calls behave the same on Python 2 and 3.
        print('in hashing')
        # Report the values actually in use; the previous message looked up
        # the misspelled key "numsItemsInBand" and so always showed 10.
        print('%s %s %s' % (num_hashes, num_items_in_band, self.save_min_hash))

    def perform(self, rdd):
        """JSON-encode the value of every ``(key, value)`` pair, then hash it.

        Returns whatever ``compute_hashes`` returns for the encoded data.
        """
        # Index instead of a tuple parameter: ``lambda (x, y)`` is a syntax
        # error on Python 3.
        encoded = rdd.map(lambda pair: (pair[0], json.dumps(pair[1])))
        return self.compute_hashes(encoded)

    def compute_hashes(self, data):
        """Flat-map ``compute_row_lsh`` over ``(key, row)`` pairs in ``data``."""
        return data.flatMap(lambda pair: self.compute_row_lsh(*pair))

    def compute_row_lsh(self, key, row):
        """Yield ``(lsh_value, (key, signature))`` records for one row.

        Nothing is yielded when ``row`` is empty or when the signer returns
        ``None``. The signature slot is ``None`` whenever
        ``self.save_min_hash`` is exactly ``False``.
        """
        if len(row) == 0:
            return
        min_hash_sig = self.signer.sign(row)
        if min_hash_sig is None:
            return
        # Materialise the hash values before the signature is discarded.
        lsh_sig = list(self.hasher.hash(min_hash_sig))
        if self.save_min_hash is False:
            min_hash_sig = None
        for lsh_val in lsh_sig:
            yield lsh_val, (key, min_hash_sig)
class Hasher:
    """Turn keyed rows into LSH hash records.

    Each row is signed with ``self.signer`` and the signature is passed to
    ``self.hasher``; one ``(lsh_value, (key, signature_or_None))`` record is
    produced per value returned by the hasher.
    """

    def __init__(self, num_hashes, num_items_in_band, save_min_hash):
        """Create the signer and hasher.

        ``num_hashes`` is passed to both ``MinHashSignature`` and ``LSH``;
        ``num_items_in_band`` is passed to ``LSH``. ``save_min_hash``
        controls whether emitted records keep the signature.
        """
        self.signer = MinHashSignature(num_hashes)
        self.hasher = LSH(num_hashes, num_items_in_band, None)
        self.save_min_hash = save_min_hash

    def compute_hashes(self, data):
        """Flat-map ``compute_row_lsh`` over ``(key, row)`` pairs in ``data``."""
        # Unpack with ``*`` instead of a tuple parameter: ``lambda (x, y)``
        # is a syntax error on Python 3.
        return data.flatMap(lambda pair: self.compute_row_lsh(*pair))

    def compute_row_lsh(self, key, row):
        """Yield ``(lsh_value, (key, signature))`` records for one row.

        Nothing is yielded when ``row`` is empty or when the signer returns
        ``None``. The signature slot is ``None`` whenever
        ``self.save_min_hash`` is exactly ``False``.
        """
        if len(row) == 0:
            return
        min_hash_sig = self.signer.sign(row)
        if min_hash_sig is None:
            return
        # Materialise the hash values before the signature is discarded.
        lsh_sig = list(self.hasher.hash(min_hash_sig))
        if self.save_min_hash is False:
            min_hash_sig = None
        for lsh_val in lsh_sig:
            yield lsh_val, (key, min_hash_sig)
 def __init__(self, num_hashes, num_items_in_band, save_min_hash):
     """Create the signer and hasher and remember the save flag.

     ``num_hashes`` is handed to both ``MinHashSignature`` and ``LSH``;
     ``num_items_in_band`` is handed to ``LSH`` only. ``save_min_hash`` is
     stored unchanged on the instance.
     """
     signer = MinHashSignature(num_hashes)
     self.signer = signer
     hasher = LSH(num_hashes, num_items_in_band, None)
     self.hasher = hasher
     self.save_min_hash = save_min_hash
Beispiel #4
0
 def __init__(self, **options):
     """Build the signer and hasher from keyword options.

     Options read (with defaults): ``numHashes`` (100),
     ``numItemsInBand`` (10) and ``saveMinhashes`` (False).
     """
     num_hashes = options.get("numHashes", 100)
     num_items_in_band = options.get("numItemsInBand", 10)
     self.signer = MinHashSignature(num_hashes)
     self.hasher = LSH(num_hashes, num_items_in_band, None)
     self.save_min_hash = options.get("saveMinhashes", False)
     # Single-argument print calls behave the same on Python 2 and 3.
     print('in hashing')
     # Report the values actually in use; the previous message looked up
     # the misspelled key "numsItemsInBand" and so always showed 10.
     print('%s %s %s' % (num_hashes, num_items_in_band, self.save_min_hash))
Beispiel #5
0
]

# Load the data for every name listed in imageNames.
images = [getImageData(imgName) for imgName in imageNames]

# Keep only indices 0..2 of the third axis
# (presumably dropping an alpha channel -- confirm against getImageData).
alphaIgnoredImages = [img[:, :, 0:3] for img in images]

# Flatten each image into a single row of shape (1, n).
reshapedImages = [img.reshape(1, -1) for img in alphaIgnoredImages]

print("reshapedImages", reshapedImages, "dimension", reshapedImages[0].shape[1])

lshModel = LSH(noOfHashers=25, noOfHash=10, dimension=reshapedImages[0].shape[1])

# Register every flattened image together with its source name.
for imgName, vector in zip(imageNames, reshapedImages):
    lshModel.train(vector, { "name": imgName })

# Compare the same index pairs as before, in the same order.
for first, second in ((0, 1), (0, 2), (1, 2), (2, 3)):
    print(lshModel.isSimilar(reshapedImages[first], reshapedImages[second]))
from sklearn.feature_extraction.text import CountVectorizer
from lsh.lsh import LSH
import numpy as np

texts = [
    'Jack went to the market to buy some fruits',
    'Jane went to the market to buy some fruits today',
    'Robert and his team played hockey today'
]

# Token counts per text, reshaped so that X[i] is a single (1, n) row.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(texts).toarray()
X = counts.reshape(len(texts), 1, -1)

lshModel = LSH(noOfHashers=25, noOfHash=3, dimension=X.shape[2])

# Register every row together with the text it was built from.
for row, text in zip(X, texts):
    lshModel.train(row, {"name": text})

# Compare the same index pairs as before, in the same order.
for first, second in ((0, 1), (0, 2), (1, 2)):
    print(lshModel.isSimilar(X[first], X[second]))