Beispiel #1
0
class Hasher:
    """Turn keyed rows of an RDD-like dataset into LSH band hashes.

    Each row is signed with ``MinHashSignature`` and the signature is then
    split into band hashes by ``LSH``.  Both helpers are defined elsewhere
    in the project; this class only wires them together.
    """

    def __init__(self, **options):
        """Build the signer and the LSH hasher from keyword options.

        Recognised options (all optional):
            numHashes:      passed to ``MinHashSignature`` and ``LSH``
                            (default 100).
            numItemsInBand: passed to ``LSH`` (default 10).
            saveMinhashes:  when ``False`` (the default) the min-hash
                            signature is dropped from the emitted records.
        """
        # Read every option exactly once so the values that are printed are
        # guaranteed to be the values that were actually used.
        num_hashes = options.get("numHashes", 100)
        num_items_in_band = options.get("numItemsInBand", 10)
        save_min_hash = options.get("saveMinhashes", False)

        self.signer = MinHashSignature(num_hashes)
        # NOTE(review): the meaning of the third LSH argument (None) is not
        # visible from this file -- confirm against the LSH definition.
        self.hasher = LSH(num_hashes, num_items_in_band, None)
        self.save_min_hash = save_min_hash

        # Debug output; formatted so it is identical on Python 2 and 3.
        print('in hashing')
        print('%s %s %s' % (num_hashes, num_items_in_band, save_min_hash))

    def perform(self, rdd):
        """JSON-encode every value of ``rdd`` and return its LSH hashes.

        ``rdd`` must yield ``(key, value)`` pairs and provide ``map`` and
        ``flatMap``.  The result is whatever ``compute_hashes`` returns.
        """
        def _encode(pair):
            key, value = pair
            return key, json.dumps(value)

        return self.compute_hashes(rdd.map(_encode))

    def compute_hashes(self, data):
        """Flat-map ``compute_row_lsh`` over the ``(key, row)`` pairs in ``data``."""
        def _expand(pair):
            key, row = pair
            return self.compute_row_lsh(key, row)

        return data.flatMap(_expand)

    def compute_row_lsh(self, key, row):
        """Yield ``(lsh_value, (key, min_hash_sig))`` for one row.

        Nothing is yielded when ``row`` is empty or when the signer returns
        ``None``.  ``min_hash_sig`` is replaced by ``None`` in the output
        when ``self.save_min_hash`` is ``False``.
        """
        if len(row) == 0:
            return

        signature = self.signer.sign(row)
        if signature is None:
            return

        # Materialise the band hashes before the signature is possibly
        # discarded below.
        band_hashes = list(self.hasher.hash(signature))
        if self.save_min_hash is False:
            signature = None

        for band_hash in band_hashes:
            yield band_hash, (key, signature)
class Hasher:
    """Turn keyed rows of an RDD-like dataset into LSH band hashes.

    Each row is signed with ``MinHashSignature`` and the signature is then
    split into band hashes by ``LSH``.  Both helpers are defined elsewhere
    in the project; this class only wires them together.
    """

    def __init__(self, num_hashes, num_items_in_band, save_min_hash):
        """Create the signer and the LSH hasher.

        Args:
            num_hashes: passed to both ``MinHashSignature`` and ``LSH``.
            num_items_in_band: passed to ``LSH``.
            save_min_hash: when ``False`` the min-hash signature is dropped
                from the records emitted by ``compute_row_lsh``.
        """
        self.signer = MinHashSignature(num_hashes)
        # NOTE(review): the meaning of the third LSH argument (None) is not
        # visible from this file -- confirm against the LSH definition.
        self.hasher = LSH(num_hashes, num_items_in_band, None)
        self.save_min_hash = save_min_hash

    def compute_hashes(self, data):
        """Flat-map ``compute_row_lsh`` over the ``(key, row)`` pairs in ``data``.

        ``data`` must provide ``flatMap``; its result is returned unchanged.
        """
        # Explicit unpacking replaces the Python-2-only ``lambda (x, y)``
        # form, which is a SyntaxError on Python 3.
        def _expand(pair):
            key, row = pair
            return self.compute_row_lsh(key, row)

        return data.flatMap(_expand)

    def compute_row_lsh(self, key, row):
        """Yield ``(lsh_value, (key, min_hash_sig))`` for one row.

        Nothing is yielded when ``row`` is empty or when the signer returns
        ``None``.  ``min_hash_sig`` is replaced by ``None`` in the output
        when ``self.save_min_hash`` is ``False``.
        """
        if len(row) == 0:
            return

        signature = self.signer.sign(row)
        if signature is None:
            return

        # Materialise the band hashes before the signature is possibly
        # discarded below.
        band_hashes = list(self.hasher.hash(signature))
        if self.save_min_hash is False:
            signature = None

        for band_hash in band_hashes:
            yield band_hash, (key, signature)