def __init__(
    self,
    column_name,
    minhash_size=256,
    minhash_seed=43,
    hyperloglog_p=8,
    sample_size=100,
    enable_word_vector_data=False,
    model=WordVectorModel,
):
    """Set up empty counters and sketches for one column.

    Args:
        column_name: the extracted column name.
        minhash_size: number of permutations handed to MinHash.
        minhash_seed: random seed handed to MinHash.
        hyperloglog_p: precision parameter handed to HyperLogLogPlusPlus.
        sample_size: stored as the sample size limit.
        enable_word_vector_data: stored flag for word-vector accumulation.
        model: object providing ``get_empty_word_vector()``.
    """
    # Identity and sampling state.
    self._column_name = column_name
    self._sample_size = sample_size
    self._sample = set()

    # Row counters, all starting from zero.
    self._count = 0
    self._empty_count = 0
    self._oov_count = 0
    self._numeric_count = 0

    # Both sketches hash through the instance's own hash helpers.
    self._minhash = MinHash(
        num_perm=minhash_size,
        seed=minhash_seed,
        hashfunc=self._hashfunc32,
    )
    self._hhl = HyperLogLogPlusPlus(
        p=hyperloglog_p,
        hashfunc=self._hashfunc64,
    )

    # Word-vector state; the running sum starts from the model's empty vector.
    self._enabled_word_vec_data = enable_word_vector_data
    self._model = model
    self._sum_vector = self._model.get_empty_word_vector()
def datasketch_hllpp(db, words, unique_words_len, p):
    """Benchmark a register-driven word filter built on a datasketch HLL++.

    Each word is hashed with the sketch's own hash function. When the
    register selected by the hash is still empty, the word is stored in
    ``db.datasketch_hllpp``; the register is then updated the same way the
    sketch would update it. The number of stored documents is reported as
    the count, together with timing and the memory size of the sketch.

    Args:
        db: database handle exposing a ``datasketch_hllpp`` collection.
        words: iterable of strings to process.
        unique_words_len: reference number of distinct words, used for the
            reported error percentage.
        p: precision parameter passed to HyperLogLogPlusPlus.
    """
    print('Datasketch Module: HLL++ __________ p = {}'.format(p))
    sketch = HyperLogLogPlusPlus(p=p)

    # Start from an empty collection so earlier runs do not affect the count.
    store = db.datasketch_hllpp
    store.delete_many({})

    started = time.time()
    for token in words:
        hash_value = sketch.hashfunc(token.encode('utf8'))
        # The low p bits of the hash select the register.
        slot = hash_value & (sketch.m - 1)
        # An untouched register means no earlier word landed in this slot.
        if not sketch.reg[slot]:
            store.insert_one({'word': token})
        # The remaining bits determine the rank written to the register.
        remainder = hash_value >> sketch.p
        sketch.reg[slot] = max(sketch.reg[slot], sketch._get_rank(remainder))
    finished = time.time()

    # The reported count is the number of stored words, not sketch.count().
    count = store.count_documents({})
    elapsed = finished - started
    error_pct = round((float(count) / unique_words_len) * 100 - 100, 2)
    print('[datasketch HLL++] Time HLL: {}'.format(elapsed))
    print('[datasketch HLL++] HLL Number of words: {}'.format(count))
    print('[datasketch HLL++] HLL counting error: {}%'.format(error_pct))

    size_hll = asizeof.asizeof(sketch)
    size_kb = size_hll / 1024
    size_mb = size_kb / 1024
    print('[datasketch HLL++] Size of HLL++: {} Mb, {} Kb'.format(size_mb, size_kb))
    print('________________________________________________________________\n')
def get_hyperloglog_pp(data: list) -> HyperLogLogPlusPlus:
    """Build a default HyperLogLogPlusPlus sketch from a list of strings.

    Every item is UTF-8 encoded and fed to the sketch in list order.

    Args:
        data: items exposing ``encode`` (strings).

    Returns:
        The populated sketch.
    """
    sketch = HyperLogLogPlusPlus()
    for item in data:
        encoded = item.encode('utf8')
        sketch.update(encoded)
    return sketch
def __init__(self, hash_type=None, bits=None, hash_func=None, params=None):
    """Pick a hash function and build the requested datasketch object.

    The selected callable is stored in ``self.hashfunc`` and the constructed
    sketch or index in ``self.hash``.

    Args:
        hash_type: name or alias of the object to build (MinHash,
            MinHashLSH, LeanMinHash, MinHashLSHForest, MinHashLSHEnsemble,
            HyperLogLog or HyperLogLogPlusPlus).
        bits: hash width, given as int or str: 32 (also used when None),
            64 or 128.
        hash_func: "mmh3", "farmhash" or "xxhash". For 32 and 64 bits any
            other value selects the SHA-1 based default.
        params: optional dict of keyword overrides for the constructor of
            the selected object; missing keys fall back to the defaults
            written in each branch below.

    Raises:
        ValueError: if ``bits`` is unsupported, if no 128-bit function
            exists for ``hash_func``, or if ``hash_type`` is not recognised.
    """
    self.hash_type = hash_type
    self.hash_func = hash_func
    self.hash_bits = bits

    if self.hash_bits in {32, "32", None}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash32
        elif self.hash_func == "xxhash":
            # xxhash.xxh32 builds a hasher object; the sketches need a
            # callable that returns an integer, which is xxh32_intdigest.
            self.hashfunc = xxhash.xxh32_intdigest
        else:  # "hash32", "default"
            self.hashfunc = sha1_hash32
    elif self.hash_bits in {64, "64"}:
        if self.hash_func == "mmh3":
            # NOTE(review): mmh3.hash64 is documented as returning a pair of
            # integers rather than a single one -- confirm it is usable here.
            self.hashfunc = mmh3.hash64
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash64
        elif self.hash_func == "xxhash":
            # Same reasoning as the 32-bit case: integer digest, not a hasher.
            self.hashfunc = xxhash.xxh64_intdigest
        else:
            self.hashfunc = sha1_hash64
    elif self.hash_bits in {128, "128"}:
        if self.hash_func == "mmh3":
            self.hashfunc = mmh3.hash128
        elif self.hash_func == "farmhash":
            self.hashfunc = farmhash.hash128
        else:
            raise ValueError("请检查对应的hash函数类型与位数")
    else:
        raise ValueError("请检查对应的hash函数的位数")

    if not params:
        params = {}

    # ------------------------------------------------------------------
    # Notes on storage configuration for the LSH indexes.
    #
    # To use Redis as the storage layer, configure:
    #     storage_config={
    #         'type': 'redis',
    #         'redis': {'host': 'localhost', 'port': 6379},
    #     }
    # When inserting a large number of MinHashes sequentially, an insertion
    # session is recommended; it reduces the number of network calls during
    # bulk insertion:
    #     data_list = [("m1", m1), ("m2", m2), ("m3", m3)]
    #     with lsh.insertion_session() as session:
    #         for key, minhash in data_list:
    #             session.insert(key, minhash)
    # Note that querying the LSH object while an insertion session is open
    # may lead to inconsistent results.
    #
    # MinHash LSH also supports a Cassandra cluster as the storage layer.
    # Long-lived storage covers every use case where the application has to
    # keep updating the LSH object (for example, incrementally clustering
    # documents with MinHashLSH). The Cassandra option is configured as:
    #     storage_config={
    #         'type': 'cassandra',
    #         'cassandra': {
    #             'seeds': ['127.0.0.1'],
    #             'keyspace': 'lsh_test',
    #             'replication': {
    #                 'class': 'SimpleStrategy',
    #                 'replication_factor': '1',
    #             },
    #             'drop_keyspace': False,
    #             'drop_tables': False,
    #         }}
    # 'seeds' lists the seed nodes that can be contacted to connect to the
    # Cassandra cluster. 'keyspace' and 'replication' are the parameters
    # used when creating the keyspace (if it does not exist yet). To force
    # creation of the tables or keyspace (and therefore drop existing ones),
    # set drop_tables and drop_keyspace to True. As with Redis, an insertion
    # session is recommended to reduce network calls during bulk insertion.
    #
    # Connecting to an existing MinHash LSH
    # If the LSH uses an external storage layer (e.g. Redis) it can be
    # shared across processes. There are two ways to do this. The
    # recommended one is pickling; MinHash LSH objects are serializable:
    #     import pickle
    #     # Create your LSH object
    #     lsh = ...
    #     # Serialize the LSH
    #     data = pickle.dumps(lsh)
    #     # Now you can pass it as an argument to a forked process or
    #     # simply save it in an external storage.
    #     # In a different process, deserialize the LSH
    #     lsh = pickle.loads(data)
    # With pickle, everything that has to be known about the LSH, such as
    # its parameter settings, is kept in one place.
    #
    # Alternatively, a base name can be given in the storage configuration
    # when the LSH is first created:
    #     # For Redis.
    #     lsh = MinHashLSH(
    #         threshold=0.5, num_perm=128, storage_config={
    #             'type': 'redis',
    #             'basename': b'unique_name_6ac4fg',
    #             'redis': {'host': 'localhost', 'port': 6379},
    #         })
    #     # For Cassandra.
    #     lsh = MinHashLSH(
    #         threshold=0.5, num_perm=128, storage_config={
    #             'type': 'cassandra',
    #             'basename': b'unique_name',
    #             'cassandra': {
    #                 'seeds': ['127.0.0.1'],
    #                 'keyspace': 'lsh_test',
    #                 'replication': {
    #                     'class': 'SimpleStrategy',
    #                     'replication_factor': '1',
    #                 },
    #                 'drop_keyspace': False,
    #                 'drop_tables': False,
    #             }
    #         })
    # The base name is used to generate the key prefix that uniquely
    # identifies the data associated with this LSH in the storage layer.
    # Creating a new LSH object with the same base name therefore reuses
    # the underlying data of the old LSH. If no basename is given, MinHash
    # LSH generates a random string as base name and a collision is
    # extremely unlikely.
    #
    # More detailed usage: http://ekzhu.com/datasketch/lsh.html
    # ------------------------------------------------------------------

    if self.hash_type in {"minhash", "MinHash"}:
        # Mainly used for Jaccard similarity: estimates the Jaccard
        # similarity between sets of arbitrary size in linear time using a
        # small, fixed amount of storage.
        self.hash = MinHash(
            # Optional int, ignored when hashvalues is not None. Number of
            # random permutation functions; controls the accuracy.
            num_perm=params.get("num_perm", 128),
            # Optional random seed.
            seed=params.get("seed", 1),
            # Optional hash function applied to the input of update();
            # must return an integer that fits in 32 bits.
            hashfunc=self.hashfunc,
            # Deprecated; superseded by hashfunc.
            hashobj=params.get("hashobj", None),
            # Optional array or list.
            hashvalues=params.get("hashvalues", None),
            # Optional permutation parameters; the existing state of another
            # MinHash can be passed for fast initialisation.
            permutations=params.get("permutations", None))
    elif self.hash_type in {
            "weightedminhashlsh", "mhlsh", "WeightedMinHashLSH", "wmhlsh",
            "MinHashLSH"
    }:
        # Weighted MinHash locality sensitive hashing.
        # WeightedMinHashLSH() is equivalent to MinHashLSH: weighted Jaccard
        # similarity queries. Top-k queries are not supported here, but
        # MinHashLSHForest supports them.
        self.hash = MinHashLSH(
            # Jaccard threshold.
            threshold=params.get("threshold", 0.9),
            # Number of permutation functions (sample size for weighted
            # MinHash).
            num_perm=params.get("num_perm", 128),
            # Optional tuple used when optimising for the Jaccard threshold.
            weights=params.get("weights", (0.5, 0.5)),
            # Optional tuple: number and size of the bands.
            params=params.get("params", None),
            # Storage configuration.
            storage_config=params.get("storage_config", None),
            # Keys are stored pickled by default.
            prepickle=params.get("prepickle", None))
    elif self.hash_type in {"leanminhash", "lmh", "LeanMinHash", "LMH"}:
        # A hash with a smaller memory footprint than MinHash.
        self.hash = LeanMinHash(
            minhash=params.get("minhash", None),
            seed=params.get("seed", None),
            hashvalues=params.get("hashvalues", None))
    elif self.hash_type in {
            "MinHashLSHForest", "minhashlshforest", "mhlshf", "MHLSHF"
    }:
        self.hash = MinHashLSHForest(
            num_perm=params.get("num_perm", 128),
            l=params.get("l", 8))
    elif self.hash_type in {
            "MinHashLSHEnsemble", "minhashlshensemble", "mhlshe", "MHLSHE"
    }:
        # MinHash LSH built on a different measure: containment.
        self.hash = MinHashLSHEnsemble(
            threshold=params.get("threshold", 0.9),
            num_perm=params.get("num_perm", 128),
            num_part=params.get("num_part", 16),
            m=params.get("m", 8),
            weights=params.get("weights", (0.5, 0.5)),
            storage_config=params.get("storage_config", None),
            prepickle=params.get("prepickle", None))
    elif self.hash_type in {"HyperLogLog", "hyperloglog", "hll", "HLL"}:
        # HyperLogLog estimates the cardinality (number of distinct values)
        # of a data set in a single pass using a small, fixed amount of
        # memory.
        self.hash = HyperLogLog(
            # Controls the accuracy, comparable to num_perm for MinHash.
            p=params.get("p", 8),
            reg=params.get("reg", None),
            # Hash function used internally.
            hashfunc=params.get("hashfunc", sha1_hash32),
            # Optional; superseded by hashfunc.
            hashobj=params.get("hashobj", None))
    elif self.hash_type in {
            "hyperloglogplusplus", "HyperLogLogPlusPlus", "HyperLogLog++",
            # "hyperlogkog++" is a historical misspelling kept so existing
            # callers keep working; "hyperloglog++" is the intended alias.
            "hyperloglog++", "hyperlogkog++", "HLLPP", "hllpp", "HLL++",
            "hll++"
    }:
        # Same interface as HyperLogLog.
        self.hash = HyperLogLogPlusPlus(
            p=params.get("p", 8),
            reg=params.get("reg", None),
            # Uses a 64-bit hash function.
            hashfunc=params.get("hashfunc", sha1_hash64),
            hashobj=params.get("hashobj", None))
    else:
        raise ValueError("请选择正确的函数函数对象")
# Create an LSH Ensemble index with a containment threshold of 0.8.
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128)

# Index takes an iterable of (key, minhash, size)
lshensemble.index([("m2", m2, len(set2)), ("m3", m3, len(set3))])

# Check for membership using the key
print("m2" in lshensemble)
print("m3" in lshensemble)

# Using m1 as the query, get a result iterator. The label reports the
# threshold the index above was actually built with (0.8).
print("Sets with containment > 0.8:")
for key in lshensemble.query(m1, len(set1)):
    print(key)

from datasketch import HyperLogLog, HyperLogLogPlusPlus

# Sample tokens; 'dataset' and 'a' occur twice, so the exact distinct count
# is smaller than the list length.
data1 = [
    'hyperloglog', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
    'estimating', 'the', 'cardinality', 'of', 'dataset', 'dataset', 'a'
]

# Feed every token, UTF-8 encoded, into an HLL++ sketch with p=12.
h = HyperLogLogPlusPlus(p=12)
for d in data1:
    h.update(d.encode('utf8'))
print("Estimated cardinality is", h.count())

# Exact distinct count for comparison with the estimate.
s1 = set(data1)
print("Actual cardinality is", len(s1))
def _get_estimator_instance(self) -> object:
    """
    Build a new HyperLogLogPlusPlus cardinality estimator.

    :return: estimator created with ``p`` set to ``self.hyperloglog_accuracy``
    """
    precision = self.hyperloglog_accuracy
    estimator = HyperLogLogPlusPlus(p=precision)
    return estimator
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""
Smoke test of external HyperLogLog libraries.

for ``import hyperloglog``:
    https://github.com/svpcom/hyperloglog
for ``from datasketch import HyperLogLogPlusPlus``:
    https://github.com/ekzhu/datasketch
"""
import os
from sys import path

# import hyperloglog
from datasketch import HyperLogLogPlusPlus

# Make the sibling ``common`` directory importable before importing from it.
path.insert(
    0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../common')))
from file_operations import read_array_file

if __name__ == '__main__':
    arr = read_array_file('./hyperloglog/data10.txt', True)

    # hll = hyperloglog.HyperLogLog(0.01)  # accept 1% counting error
    hll = HyperLogLogPlusPlus(p=16)

    cnt = len(arr)
    # The distinct figure printed here is a hard-coded reference value.
    print('count = {}; distinct = {}'.format(cnt, 2103130))

    for i in arr:
        hll.update(str(i).encode('utf8'))
        # hll.add(str(i))

    print(hll.count())
    # digest() takes no argument and returns the register state; the stray
    # list argument passed previously made this call fail.
    print(hll.digest())
    # print(len(hll))
_cln_value: 2 }) InputDataSet = pd.DataFrame(inp) # calc hll hll_dict = {} for index, row in InputDataSet.iterrows(): key = '{}_{}'.format(row[_cln_end_date].strftime("%d/%m/%Y"), row[_cln_host_id]) print(key) if key in hll_dict: print('in') hll = hll_dict[key][_cln_hll] else: print('not') hll = HyperLogLogPlusPlus(p=12) # max p=16 # if row[_cln_hll]: # init from exist hash # _arr= np.fromstring(digest, dtype=int, sep=" ") # hll = HyperLogLogPlusPlus(reg= _arr) hll_dict[key] = { _cln_hll: hll, _cln_end_date: row[_cln_end_date], _cln_host_id: row[_cln_host_id] } hll.update(str(row[_cln_value]).encode('utf8')) out = [] # prepare output for key, value in hll_dict.items(): hll = value[_cln_hll] buf = bytearray(hll.bytesize())
class ColumnSketch:
    """Streaming summary of a single table column.

    Values are fed one at a time through :meth:`update`; counters, a bounded
    sample, a MinHash, a HyperLogLog++ counter and (optionally) a running sum
    of word vectors are maintained along the way.

    Args:
        column_name: the extracted column name.
        minhash_size: the number of permutations to use for MinHash.
        minhash_seed: the random seed used by MinHash.
        hyperloglog_p: the precision parameter used by HyperLogLog.
        sample_size: the size of sample to be kept.
        enable_word_vector_data: whether to build word embedding vector for
            data values -- can be 10x more expensive.
        model: word-vector provider; it must offer
            ``get_empty_word_vector()`` and ``process(text)``.
    """

    def __init__(
        self,
        column_name,
        minhash_size=256,
        minhash_seed=43,
        hyperloglog_p=8,
        sample_size=100,
        enable_word_vector_data=False,
        model=WordVectorModel,
    ):
        # Identity and sampling state.
        self._column_name = column_name
        self._sample_size = sample_size
        self._sample = set()

        # Row counters.
        self._count = 0
        self._empty_count = 0
        self._oov_count = 0
        self._numeric_count = 0

        # Sketches, hashing through the instance's farmhash helpers.
        self._minhash = MinHash(
            num_perm=minhash_size,
            seed=minhash_seed,
            hashfunc=self._hashfunc32,
        )
        self._hhl = HyperLogLogPlusPlus(
            p=hyperloglog_p,
            hashfunc=self._hashfunc64,
        )

        # Word-vector state; the sum starts from the model's empty vector.
        self._enabled_word_vec_data = enable_word_vector_data
        self._model = model
        self._sum_vector = self._model.get_empty_word_vector()

    def _hashfunc32(self, str_value):
        # 32-bit farmhash of the given value, used by the MinHash.
        return farmhash.hash32(str_value)

    def _hashfunc64(self, str_value):
        # 64-bit farmhash of the given value, used by the HyperLogLog++.
        return farmhash.hash64(str_value)

    @property
    def column_name(self):
        """Name of the column this sketch summarises."""
        return self._column_name

    @property
    def sample(self):
        """The kept (non-random) sample of data values, as a list."""
        return list(self._sample)

    @property
    def count(self):
        """Total number of values seen (rows), empty ones included."""
        return self._count

    @property
    def empty_count(self):
        """Number of values that were empty."""
        return self._empty_count

    @property
    def non_empty_count(self):
        """Number of values that were not empty."""
        return self._count - self._empty_count

    @property
    def out_of_vocabulary_count(self):
        """Number of non-empty values outside the language model's
        vocabulary.
        """
        return self._oov_count

    @property
    def in_vocabulary_count(self):
        """Number of non-empty values inside the language model's
        vocabulary.
        """
        return self._count - self._empty_count - self._oov_count

    @property
    def numeric_count(self):
        """Number of non-empty values that are numerical."""
        return self._numeric_count

    @property
    def is_numeric(self):
        """True when at least 50% of the non-empty rows are numeric."""
        non_empty = self.non_empty_count
        if non_empty == 0:
            return False
        ratio = float(self._numeric_count) / float(non_empty)
        return ratio >= 0.5

    @property
    def distinct_count(self):
        """Approximate number of distinct values.

        While the sample is not yet full its size is returned directly;
        afterwards the larger of the sample size and the HyperLogLog
        estimate is returned.
        """
        sampled = len(self._sample)
        if sampled < self._sample_size:
            return sampled
        return max(sampled, self._hhl.count())

    @property
    def word_vector_column_name(self):
        """Summed word embedding vector of the column name as a list, or
        None when no token of the name has a vector.
        """
        tokens = self._model.process(self.column_name)
        vectors = [tok.vector for tok in tokens if tok.has_vector]
        if not vectors:
            return None
        summed = np.sum(vectors, axis=0)
        return [float(component) for component in summed]

    @property
    def word_vector_data(self):
        """Mean word embedding vector of the data values as a list.

        None is returned when word-vector collection is disabled or when no
        value counts as in-vocabulary.
        """
        if not self._enabled_word_vec_data:
            return None
        in_vocab = self.in_vocabulary_count
        if in_vocab == 0:
            return None
        mean_vector = self._sum_vector / np.float32(in_vocab)
        return [float(component) for component in mean_vector]

    @property
    def minhash(self):
        """Hash values of the MinHash as a list of ints."""
        return [int(hv) for hv in self._minhash.digest()]

    @property
    def seed(self):
        """Random seed used by the MinHash."""
        return self._minhash.seed

    @property
    def hyperloglog(self):
        """Register values of the HyperLogLog counter as a list of ints."""
        return [int(reg) for reg in self._hhl.digest()]

    def update(self, value):
        """Fold one data value into the sketch.

        Non-string values are serialised with ``json.dumps`` (sorted keys);
        the text is then stripped and lower-cased. Empty text only bumps the
        empty counter.
        """
        self._count += 1

        # Normalise the value to cleaned text.
        text = value if isinstance(value, str) else json.dumps(
            value, sort_keys=True)
        text = text.strip().lower()

        # Empty values are counted and otherwise ignored.
        if not text:
            self._empty_count += 1
            return

        if _is_number(text):
            self._numeric_count += 1

        # The sample only grows until it reaches its size limit.
        if len(self._sample) < self._sample_size:
            self._sample.add(text)

        # Feed both sketches.
        self._minhash.update(text)
        self._hhl.update(text)

        # Word vectors are only accumulated when enabled.
        if not self._enabled_word_vec_data:
            return

        vectors = [
            tok.vector for tok in self._model.process(text) if tok.has_vector
        ]
        if vectors:
            self._sum_vector += np.sum(vectors, axis=0)
        else:
            # No token had a vector: the value is out of vocabulary.
            self._oov_count += 1