def bucketize(self, minhashes):
    """Compute one bucket id per band from a flat minhash signature.

    The signature is read as ``self.dataset.bands`` consecutive groups of
    ``self.rows`` values. Each group is joined into a ``-``-separated
    string, hashed with MD5 and truncated to the low hash bits; the band
    index is stored in the bits above them, so equal groups that occur in
    different bands yield different bucket ids.

    The bit layout (``band_bits``, ``band_mask``, ``hash_mask``) is derived
    from ``self.dataset.bands`` and ``settings.max_bits`` on first use and
    cached as attributes on ``self.dataset``.

    Args:
        minhashes: Indexable sequence holding at least
            ``self.dataset.bands * self.rows`` values.

    Returns:
        A list with one ``DbInt.to_db``-converted bucket id per band, in
        band order.

    Raises:
        IndexError: Propagated from indexing when a list-like
            ``minhashes`` is shorter than ``bands * rows``.
    """
    dataset = self.dataset
    try:
        dataset.band_bits  # present only once the layout has been cached
    except AttributeError:
        # Work out the whole layout before caching any of it, so a failure
        # part-way (e.g. while reading settings.max_bits) cannot leave the
        # dataset with band_bits set but the masks missing.
        band_bits = int(math.ceil(math.log(dataset.bands, 2)))
        band_mask = 2 ** band_bits - 1
        hash_mask = 2 ** (settings.max_bits - band_bits) - 1
        dataset.band_mask = band_mask
        dataset.hash_mask = hash_mask
        # band_bits doubles as the "cache is populated" marker: set it last.
        dataset.band_bits = band_bits
    band_mask = dataset.band_mask
    hash_mask = dataset.hash_mask

    rows = self.rows
    buckets = []
    # range() rather than xrange() so the method runs on Python 2 and 3.
    for band in range(dataset.bands):
        # hash_mask + 1 is 2 ** (number of hash bits), so the multiplication
        # moves the band index into the bits above the hash.
        band_hash = (band_mask & band) * (hash_mask + 1)
        offset = band * rows
        band_key = '-'.join(
            str(minhashes[offset + row]) for row in range(rows))
        # hashlib needs bytes on Python 3; on Python 2 a str already is
        # bytes and is passed through untouched, so digests are unchanged.
        if not isinstance(band_key, bytes):
            band_key = band_key.encode('utf-8')
        digest = int(hashlib.md5(band_key).hexdigest(), 16)
        buckets.append(DbInt.to_db(band_hash | (hash_mask & digest)))
    return buckets
def get_nns(self, doc_id):
    """Return estimated Jaccard distances to documents sharing a bucket.

    Every bucket of the document is looked up with ``self.nns_select``;
    each document found that way (other than ``doc_id`` itself) is compared
    position by position against the document's own minhashes. The distance
    is ``1.0`` minus the fraction of positions whose minhashes are equal,
    taken over the length of the document's own signature.

    Args:
        doc_id: Identifier of the document whose neighbours are wanted.

    Returns:
        A dict mapping neighbour doc id to distance (float). When
        ``self.get_doc`` yields nothing for ``doc_id`` an empty *list* is
        returned instead; that historical quirk is kept for callers.

    Raises:
        ZeroDivisionError: If neighbours exist but the document's own
            minhash signature is empty.
    """
    doc = self.get_doc(doc_id)
    if not doc:
        return []

    bkts = [DbInt.fm_db(bkt) for bkt in doc.buckets]
    candidates = {}
    for bkt in bkts:
        bkt_docs = session.execute(
            self.nns_select, [self.ds_key, DbInt.to_db(bkt)])
        for bkt_doc in bkt_docs:
            candidates[bkt_doc['doc_id']] = bkt_doc['minhashes']
    # The document normally shows up in its own buckets, but it need not
    # (e.g. it has no buckets at all), so do not insist that it is there.
    candidates.pop(doc_id, None)

    own = doc.minhashes
    jac = {}
    for doc_id2, other in candidates.items():
        # sum() over zip() replaces the builtin reduce()/two-iterable map(),
        # which only behave this way on Python 2.
        matches = sum(1 for a, b in zip(own, other) if a == b)
        jac[doc_id2] = 1.0 - matches / float(len(own))
        # Log only when the current millisecond count is a multiple of
        # 100, i.e. roughly one comparison in a hundred.
        if 0 == int(1000 * time.time()) % 100:
            logging.info('Sampling (1%%) Jaccard distance %s | %s: %6.2f',
                         doc_id, doc_id2, jac[doc_id2])
    return jac
def get_nns(self, doc_id):
    """Return estimated Jaccard distances to documents sharing a bucket.

    Every bucket of the document is looked up with ``self.nns_select``;
    each document found that way (other than ``doc_id`` itself) is compared
    position by position against the document's own minhashes. The distance
    is ``1.0`` minus the fraction of positions whose minhashes are equal,
    taken over the length of the document's own signature.

    Args:
        doc_id: Identifier of the document whose neighbours are wanted.

    Returns:
        A dict mapping neighbour doc id to distance (float). When
        ``self.get_doc`` yields nothing for ``doc_id`` an empty *list* is
        returned instead; that historical quirk is kept for callers.

    Raises:
        ZeroDivisionError: If neighbours exist but the document's own
            minhash signature is empty.
    """
    doc = self.get_doc(doc_id)
    if not doc:
        return []

    bkts = [DbInt.fm_db(bkt) for bkt in doc.buckets]
    candidates = {}
    for bkt in bkts:
        bkt_docs = session.execute(
            self.nns_select, [self.ds_key, DbInt.to_db(bkt)])
        for bkt_doc in bkt_docs:
            candidates[bkt_doc['doc_id']] = bkt_doc['minhashes']
    # The document normally shows up in its own buckets, but it need not
    # (e.g. it has no buckets at all), so do not insist that it is there.
    candidates.pop(doc_id, None)

    own = doc.minhashes
    jac = {}
    for doc_id2, other in candidates.items():
        # sum() over zip() replaces the builtin reduce()/two-iterable map(),
        # which only behave this way on Python 2.
        matches = sum(1 for a, b in zip(own, other) if a == b)
        jac[doc_id2] = 1.0 - matches / float(len(own))
        # Log only when the current millisecond count is a multiple of
        # 100, i.e. roughly one comparison in a hundred.
        if 0 == int(1000 * time.time()) % 100:
            logging.info('Sampling (1%%) Jaccard distance %s | %s: %6.2f',
                         doc_id, doc_id2, jac[doc_id2])
    return jac
def bucketize(self, minhashes):
    """Compute one bucket id per band from a flat minhash signature.

    The signature is read as ``self.dataset.bands`` consecutive groups of
    ``self.rows`` values. Each group is joined into a ``-``-separated
    string, hashed with MD5 and truncated to the low hash bits; the band
    index is stored in the bits above them, so equal groups that occur in
    different bands yield different bucket ids.

    The bit layout (``band_bits``, ``band_mask``, ``hash_mask``) is derived
    from ``self.dataset.bands`` and ``settings.max_bits`` on first use and
    cached as attributes on ``self.dataset``.

    Args:
        minhashes: Indexable sequence holding at least
            ``self.dataset.bands * self.rows`` values.

    Returns:
        A list with one ``DbInt.to_db``-converted bucket id per band, in
        band order.

    Raises:
        IndexError: Propagated from indexing when a list-like
            ``minhashes`` is shorter than ``bands * rows``.
    """
    dataset = self.dataset
    try:
        dataset.band_bits  # present only once the layout has been cached
    except AttributeError:
        # Work out the whole layout before caching any of it, so a failure
        # part-way (e.g. while reading settings.max_bits) cannot leave the
        # dataset with band_bits set but the masks missing.
        band_bits = int(math.ceil(math.log(dataset.bands, 2)))
        band_mask = 2 ** band_bits - 1
        hash_mask = 2 ** (settings.max_bits - band_bits) - 1
        dataset.band_mask = band_mask
        dataset.hash_mask = hash_mask
        # band_bits doubles as the "cache is populated" marker: set it last.
        dataset.band_bits = band_bits
    band_mask = dataset.band_mask
    hash_mask = dataset.hash_mask

    rows = self.rows
    buckets = []
    # range() rather than xrange() so the method runs on Python 2 and 3.
    for band in range(dataset.bands):
        # hash_mask + 1 is 2 ** (number of hash bits), so the multiplication
        # moves the band index into the bits above the hash.
        band_hash = (band_mask & band) * (hash_mask + 1)
        offset = band * rows
        band_key = '-'.join(
            str(minhashes[offset + row]) for row in range(rows))
        # hashlib needs bytes on Python 3; on Python 2 a str already is
        # bytes and is passed through untouched, so digests are unchanged.
        if not isinstance(band_key, bytes):
            band_key = band_key.encode('utf-8')
        digest = int(hashlib.md5(band_key).hexdigest(), 16)
        buckets.append(DbInt.to_db(band_hash | (hash_mask & digest)))
    return buckets