def bucketize(self, minhashes):
        """Compute one bucket id per band from a flat list of minhashes.

        Each bucket id packs the band index into the high bits and a
        truncated MD5 digest of that band's minhashes (joined with '-')
        into the low bits. The bit layout is derived from ``dataset.bands``
        and ``settings.max_bits`` on first use and cached as attributes on
        ``self.dataset``.

        ``minhashes`` must hold at least ``bands * self.rows`` entries,
        otherwise indexing raises IndexError.

        Returns a list with one ``DbInt.to_db``-converted bucket per band,
        in band order.
        """
        dataset = self.dataset
        try:
            dataset.band_bits  # probe: AttributeError until layout is cached
        except AttributeError:
            # First call for this dataset: derive the bit layout and cache it.
            width = int(math.ceil(math.log(dataset.bands, 2)))
            dataset.band_bits = width
            dataset.band_mask = 2 ** width - 1
            dataset.hash_mask = 2 ** (settings.max_bits - width) - 1

        rows = self.rows
        band_mask = dataset.band_mask
        hash_mask = dataset.hash_mask
        # Multiplying by (hash_mask + 1) moves the band index above the
        # bits reserved for the digest.
        band_stride = hash_mask + 1

        result = []
        for band_idx in xrange(dataset.bands):
            start = band_idx * rows
            signature = '-'.join(
                str(minhashes[start + offset]) for offset in xrange(rows))
            digest = int(hashlib.md5(signature).hexdigest(), 16)
            prefix = (band_mask & band_idx) * band_stride
            result.append(DbInt.to_db(prefix | (hash_mask & digest)))
        return result
 def get_nns(self, doc_id):
     """Estimate Jaccard distances from ``doc_id`` to its bucket neighbours.

     Looks up the document via ``self.get_doc`` and runs
     ``self.nns_select`` once per bucket of that document, collecting the
     minhashes of every document returned. The distance to each neighbour
     is ``1 - (matching minhash positions / len(doc.minhashes))``.

     Returns ``[]`` when the document is not found, otherwise a dict
     mapping neighbour doc id to estimated Jaccard distance (the document
     itself is excluded).
     """
     doc = self.get_doc(doc_id)
     if not doc:
         return []
     bkts = [DbInt.fm_db(bkt) for bkt in doc.buckets]
     mhs = {}
     for bkt in bkts:
         bkt_docs = session.execute(
             self.nns_select, [self.ds_key, DbInt.to_db(bkt)])
         for bkt_doc in bkt_docs:
             mhs[bkt_doc['doc_id']] = bkt_doc['minhashes']
     # The document normally appears in its own buckets; tolerate its
     # absence (no buckets, or the query did not return it) instead of
     # raising KeyError.
     mhs.pop(doc_id, None)

     doc_mhs = doc.minhashes
     jac = {}
     for doc_id2, other_mhs in mhs.items():
         matches = sum(a == b for a, b in zip(doc_mhs, other_mhs))
         jac[doc_id2] = 1.0 - matches / float(len(doc_mhs))
         # Cheap time-based sampling: log roughly 1% of the comparisons.
         if 0 == int(1000 * time.time()) % 100:
             logging.info('Sampling (1%%) Jaccard distance %s | %s: %6.2f',
                          doc_id, doc_id2, jac[doc_id2])
     return jac
    def bucketize(self, minhashes):
        """Compute one bucket id per band from a flat list of minhashes.

        Each bucket id packs the band index into the high bits and a
        truncated MD5 digest of that band's minhashes (joined with '-')
        into the low bits. The bit layout is derived from ``dataset.bands``
        and ``settings.max_bits`` on first use and cached as attributes on
        ``self.dataset``.

        ``minhashes`` must hold at least ``bands * self.rows`` entries,
        otherwise indexing raises IndexError.

        Returns a list with one ``DbInt.to_db``-converted bucket per band,
        in band order.
        """
        dataset = self.dataset
        try:
            dataset.band_bits  # probe: AttributeError until layout is cached
        except AttributeError:
            # First call for this dataset: derive the bit layout and cache it.
            width = int(math.ceil(math.log(dataset.bands, 2)))
            dataset.band_bits = width
            dataset.band_mask = 2 ** width - 1
            dataset.hash_mask = 2 ** (settings.max_bits - width) - 1

        rows = self.rows
        band_mask = dataset.band_mask
        hash_mask = dataset.hash_mask
        # Multiplying by (hash_mask + 1) moves the band index above the
        # bits reserved for the digest.
        band_stride = hash_mask + 1

        result = []
        for band_idx in xrange(dataset.bands):
            start = band_idx * rows
            signature = '-'.join(
                str(minhashes[start + offset]) for offset in xrange(rows))
            digest = int(hashlib.md5(signature).hexdigest(), 16)
            prefix = (band_mask & band_idx) * band_stride
            result.append(DbInt.to_db(prefix | (hash_mask & digest)))
        return result
 def get_nns(self, doc_id):
     """Estimate Jaccard distances from ``doc_id`` to its bucket neighbours.

     Looks up the document via ``self.get_doc`` and runs
     ``self.nns_select`` once per bucket of that document, collecting the
     minhashes of every document returned. The distance to each neighbour
     is ``1 - (matching minhash positions / len(doc.minhashes))``.

     Returns ``[]`` when the document is not found, otherwise a dict
     mapping neighbour doc id to estimated Jaccard distance (the document
     itself is excluded).
     """
     doc = self.get_doc(doc_id)
     if not doc:
         return []
     bkts = [DbInt.fm_db(bkt) for bkt in doc.buckets]
     mhs = {}
     for bkt in bkts:
         bkt_docs = session.execute(
             self.nns_select, [self.ds_key, DbInt.to_db(bkt)])
         for bkt_doc in bkt_docs:
             mhs[bkt_doc['doc_id']] = bkt_doc['minhashes']
     # The document normally appears in its own buckets; tolerate its
     # absence (no buckets, or the query did not return it) instead of
     # raising KeyError.
     mhs.pop(doc_id, None)

     doc_mhs = doc.minhashes
     jac = {}
     for doc_id2, other_mhs in mhs.items():
         matches = sum(a == b for a, b in zip(doc_mhs, other_mhs))
         jac[doc_id2] = 1.0 - matches / float(len(doc_mhs))
         # Cheap time-based sampling: log roughly 1% of the comparisons.
         if 0 == int(1000 * time.time()) % 100:
             logging.info('Sampling (1%%) Jaccard distance %s | %s: %6.2f',
                          doc_id, doc_id2, jac[doc_id2])
     return jac