class IdSet:
    """A set of ids loaded from a file, mirrored into two Redis sketches.

    The exact id set is kept in ``self.ids`` (ground truth for evaluating the
    sketches), a HyperLogLog is kept under ``self.hll_key_name``, and a
    k-min-hash signature is kept in ``self.minhash_set``.  Intersection sizes
    can then be estimated either by HLL inclusion-exclusion or by combining
    the KMinHash Jaccard estimate with the HLL union estimate.
    """

    def __init__(self, ids_file, redis_client, minhash_k):
        """Load ids from *ids_file* (one id per line) and prepare the sketches.

        :param ids_file: path to a text file with one id per line
        :param redis_client: redis client used for the HLL and the min-hashes
        :param minhash_k: number of hash slots for the KMinHash signature
        """
        self.redis_client = redis_client
        # Keys are derived from the file name so re-runs reuse the same keys.
        self.hll_key_name = os.path.basename(ids_file)
        # BUG FIX: the original populated the set via
        # map(lambda x: self.ids.add(x), ...); map() is lazy in Python 3, so
        # the lambda never ran and self.ids stayed empty.  Build eagerly, and
        # close the file deterministically.  Trailing newlines are stripped so
        # the stored ids match the file's text (the original kept the "\n";
        # all uses below are self-consistent either way).
        with open(ids_file, 'r') as f:
            self.ids = {line.rstrip('\n') for line in f}
        self.minhash_set = KMinHash(minhash_k, self.redis_client,
                                    "mh" + self.hll_key_name)

    def add_to_sketches(self, minhash_strategy="mem_optimized"):
        """Push all ids into the HLL and the KMinHash sketch.

        :param minhash_strategy: "pipelined", "lua", or anything else for the
            memory-optimized per-id default.
        """
        self.__add_to_hll()
        if minhash_strategy == "pipelined":
            self.__add_to_kminhash_pipelined()
        elif minhash_strategy == "lua":
            self.__add_to_kminhash_lua()
        else:
            self.__add_to_kminhash_mem_optimized()

    def __add_to_kminhash_mem_optimized(self):
        # One round-trip per id; smallest client memory footprint.
        for hll_id in self.ids:
            self.minhash_set.update_min_hash(hll_id)

    def __feed_ids_in_batches(self, flush):
        # Collect ids into REDIS_PIPELINE_BATCH_SIZE-sized chunks and hand
        # each chunk to *flush*.  Shared by the pipelined and Lua strategies,
        # which previously duplicated this loop.  Unlike the original, an
        # empty trailing batch is not flushed (a no-op call either way).
        batch = []
        for hll_id in self.ids:
            batch.append(hll_id)
            if len(batch) == REDIS_PIPELINE_BATCH_SIZE:
                flush(batch)
                batch = []
        if batch:
            flush(batch)

    def __add_to_kminhash_pipelined(self):
        # Batch updates through a client-side pipeline.
        self.minhash_set.initialize()
        self.__feed_ids_in_batches(self.minhash_set.update_min_hashes_batch)

    def __add_to_kminhash_lua(self):
        # Batch updates through a server-side Lua script.
        self.minhash_set.initialize()
        self.__feed_ids_in_batches(self.minhash_set.update_min_hashes_lua)

    def __add_to_hll(self):
        """Rebuild the HyperLogLog for this id set from scratch."""
        # Drop any stale sketch so counts reflect only the current ids.
        self.redis_client.delete(self.hll_key_name)
        pipeline = self.redis_client.pipeline()
        remaining = REDIS_PIPELINE_BATCH_SIZE
        for hll_id in self.ids:
            pipeline.pfadd(self.hll_key_name, hll_id)
            remaining -= 1
            if remaining == 0:
                # Flush the pipeline buffer every REDIS_PIPELINE_BATCH_SIZE
                # commands to bound client-side memory.
                pipeline.execute()
                remaining = REDIS_PIPELINE_BATCH_SIZE
        # Flush whatever is left (executing an empty pipeline is harmless).
        pipeline.execute()

    def actual_count(self):
        """Exact cardinality of the id set."""
        return len(self.ids)

    def hll_count(self):
        """HyperLogLog-estimated cardinality (PFCOUNT)."""
        return self.redis_client.pfcount(self.hll_key_name)

    def intersection_count(self, other_id_set):
        """Exact intersection size against another IdSet (ground truth)."""
        return len(self.ids.intersection(other_id_set.ids))

    def intersect_hlls_using_inclusion_exclusion(self, other_id_set):
        """Estimate |A ∩ B| as |A| + |B| - |A ∪ B| using HLL counts.

        :returns: (merged HLL key name, estimated intersection count)
        """
        merged_key_name = common_key_name(self.hll_key_name,
                                          other_id_set.hll_key_name)
        self.redis_client.delete(merged_key_name)
        self.redis_client.pfmerge(merged_key_name, self.hll_key_name,
                                  other_id_set.hll_key_name)
        intersection_count = (self.hll_count() + other_id_set.hll_count()
                              - self.redis_client.pfcount(merged_key_name))
        return merged_key_name, intersection_count

    def intersect_using_kminhash(self, other_id_set):
        """Estimate |A ∩ B| as J(A, B) * |A ∪ B|.

        The union size comes from a merged HLL; the Jaccard coefficient J
        comes from the KMinHash signatures.
        """
        merged_key_name = common_key_name(self.hll_key_name,
                                          other_id_set.hll_key_name)
        self.redis_client.delete(merged_key_name)
        self.redis_client.pfmerge(merged_key_name, self.hll_key_name,
                                  other_id_set.hll_key_name)
        union_count = self.redis_client.pfcount(merged_key_name)
        jaccard_coefficient = self.minhash_set.estimate_jaccard_coefficient(
            other_id_set.minhash_set)
        return int(jaccard_coefficient * union_count)