Example #2
def datasketch_hllpp(db, words, unique_words_len, p):
    print('Datasketch Module: HLL++ __________ p = {}'.format(p))
    hpp = HyperLogLogPlusPlus(p=p)
    # unique_words = np.array([])
    # unique_words = []
    collection = db.datasketch_hllpp
    collection.delete_many({})

    start = time.time()
    for word in words:
        # Digest the hash object to get the hash value
        hv = hpp.hashfunc(word.encode('utf8'))
        # Use the lowest p bits of the hash as the register index
        reg_index = hv & (hpp.m - 1)
        # If the register is still 0, nothing has hit this slot yet, so treat the word as new
        if not hpp.reg[reg_index]:
            # unique_words = np.append(unique_words, word)
            collection.insert_one({'word': word})
        # Get the rest of the hash
        bits = hv >> hpp.p
        # Update the register
        hpp.reg[reg_index] = max(hpp.reg[reg_index], hpp._get_rank(bits))
    end = time.time()

    # count = hpp.count()
    count = collection.count_documents({})
    print('[datasketch HLL++] Time HLL: {}'.format(end - start))
    print('[datasketch HLL++] HLL Number of words: {}'.format(count))
    print('[datasketch HLL++] HLL counting error: {}%'.format(round((float(count)/unique_words_len)*100 - 100, 2)))
    size_hll = asizeof.asizeof(hpp)
    # size_hll_words = asizeof.asizeof(unique_words)
    print('[datasketch HLL++] Size of HLL++: {} MB, {} KB'.format(size_hll/1024/1024, size_hll/1024))
    # print('[datasketch HLL++] Size of array in HLL++: {} Mb, {} Kb'.format(size_hll_words/1024/1024, size_hll_words/1024))
    # print('[datasketch HLL++] Size of HLL++ total: {} Mb, {} Kb'.format((size_hll+size_hll_words)/1024/1024, (size_hll+size_hll_words)/1024))
    print('________________________________________________________________\n')
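The function above reaches into the sketch's private registers (hpp.reg, hpp._get_rank) to spot first-seen words while counting. For the counting part alone, the public API is enough; a minimal sketch using only documented calls, with the word list and MongoDB bookkeeping stubbed out:

import time

from datasketch import HyperLogLogPlusPlus

words = ['to', 'be', 'or', 'not', 'to', 'be']  # stand-in for the real word list

hpp = HyperLogLogPlusPlus(p=12)
start = time.time()
for word in words:
    hpp.update(word.encode('utf8'))  # hashing and register updates happen internally
end = time.time()

print('[datasketch HLL++] Time HLL: {}'.format(end - start))
print('[datasketch HLL++] Estimated distinct words: {}'.format(hpp.count()))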
Example #3
def get_hyperloglog_pp(data: list) -> HyperLogLogPlusPlus:
    h_loglog_pp = HyperLogLogPlusPlus()
    for d in data:
        h_loglog_pp.update(d.encode('utf8'))
    return h_loglog_pp
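A short usage sketch for the helper above; the merge call is part of the datasketch HyperLogLog API and combines sketches built over different shards of the data:

h1 = get_hyperloglog_pp(['a', 'b', 'c'])
h2 = get_hyperloglog_pp(['b', 'c', 'd'])

# Merging updates h1 in place to estimate the cardinality of the union.
h1.merge(h2)
print(h1.count())  # roughly 4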
Example #4
    def __init__(self, hash_type=None, bits=None, hash_func=None, params=None):
        self.hash_type = hash_type
        self.hash_func = hash_func
        self.hash_bits = bits
        self.hashfunc = sha1_hash32
        if self.hash_bits in {32, "32", None}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash32
            elif self.hash_func == "xxhash":
                self.hashfunc = xxhash.xxh32
            else:
                # "hash32" or default
                self.hashfunc = sha1_hash32

        elif self.hash_bits in {64, "64"}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash64
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash64
            elif self.hash_func == "xxhash":
                self.hashfunc = xxhash.xxh64
            else:
                self.hashfunc = sha1_hash64

        elif self.hash_bits in {128, "128"}:
            if self.hash_func == "mmh3":
                self.hashfunc = mmh3.hash128
            elif self.hash_func == "farmhash":
                self.hashfunc = farmhash.hash128
            else:
                raise ValueError("请检查对应的hash函数类型与位数")

        else:
            raise ValueError("请检查对应的hash函数的位数")

        if not params:
            params = {}
        """
        若只用redis 作为存储截止
        配置
        storage_config={  
        'type': 'redis',
        'redis': {'host': 'localhost', 'port': 6379},
        }
                
        要顺序插入大量MinHash,建议使用插入会话。这样可以减少批量插入过程中的网络呼叫数量。
        data_list = [("m1", m1), ("m2", m2), ("m3", m3)]
        with lsh.insertion_session() as session:
            for key, minhash in data_list:
                session.insert(key, minhash)
        请注意,在打开插入会话期间查询LSH对象可能会导致不一致。
        
        MinHash LSH还支持Cassandra群集作为存储层。为您的LSH使用长期存储可解决应用程序需要不断更新LSH对象的所有用例(例如,
        当您使用MinHashLSH逐步对文档进行群集时)。
        Cassandra存储选项可以配置如下:
        
         storage_config={
        'type': 'cassandra',
        'cassandra': {
            'seeds': ['127.0.0.1'],
            'keyspace': 'lsh_test',
            'replication': {
                'class': 'SimpleStrategy',
                'replication_factor': '1',
            },
            'drop_keyspace': False,
            'drop_tables': False,
        }}
        参数Seeds指定可以联系以连接到Cassandra集群的种子节点列表。选项键空间和复制指定创建键空间(如果尚不存在)时要使用的参数。
        如果要强制创建表或键空间(因此要删除现有表或键空间),请将drop_tables和drop_keyspace选项设置为 True。
        像Redis副本一样,建议使用插入会话来减少批量插入期间的网络调用数量。
        
        +-----------------------连接到现有的最小哈希LSH-------------------------------------+ 
        如果您的LSH使用外部存储层(例如Redis),则可以跨多个进程共享它。有两种方法可以做到这一点:
        
        推荐的方法是使用“酸洗”。MinHash LSH对象是可序列化的,因此您可以调用pickle:
        
        import pickle
        
        # Create your LSH object
        lsh = ...
        # Serialize the LSH
        data = pickle.dumps(lsh)
        # Now you can pass it as an argument to a forked process or simply save it
        # in an external storage.
        
        # In a different process, deserialize the LSH
        lsh = pickle.loads(data)
        使用pickle,您可以保存有关LSH所需的所有知识,例如在一个位置中进行各种参数设置。
        另外,您可以在首次创建LSH时在存储配置中指定基本名称。例如:

        # For Redis.
        lsh = MinHashLSH(
            threshold=0.5, num_perm=128, storage_config={
                'type': 'redis',
                'basename': b'unique_name_6ac4fg',
                'redis': {'host': 'localhost', 'port': 6379},
            }
        )
        
         # For Cassandra.
         lsh = MinHashLSH(
            threashold=0.5, num_perm=128, storage_config={
                'type': 'cassandra',
                'basename': b'unique_name',
                'cassandra': {
                    'seeds': ['127.0.0.1'],
                    'keyspace': 'lsh_test',
                    'replication': {
                        'class': 'SimpleStrategy',
                        'replication_factor': '1',
                    },
                    'drop_keyspace': False,
                    'drop_tables': False,
                }
            }
        )
        的基名将用于生成在所述存储层中唯一地标识与该LSH相关联的数据键前缀。因此,如果使用相同的基名创建新的LSH对象,则将在与旧LSH关联的存储层中使用相同的基础数据。
        
        如果不指定basename,则MinHash LSH将生成一个随机字符串作为基本名称,并且极不可能发生冲突。
        
        更详细的使用见 文档 :http://ekzhu.com/datasketch/lsh.html
        """

        if self.hash_type in {"minhash", "MinHash"}:
            # Mainly for Jaccard similarity: estimates the Jaccard similarity
            # between sets of arbitrary size in linear time, using a small,
            # fixed amount of memory.
            self.hash = MinHash(
                # int, optional; ignored if hashvalues is not None. The number
                # of random permutation functions, which controls the precision.
                num_perm=params.get("num_perm", 128),
                seed=params.get("seed", 1),  # optional random seed
                # Optional hash function applied to the input of update();
                # must return an integer that fits in 32 bits.
                hashfunc=self.hashfunc,
                hashobj=params.get("hashobj", None),  # deprecated, replaced by hashfunc
                hashvalues=params.get("hashvalues", None),  # optional, array or list
                # Optional permutation function parameters; can reuse the
                # existing state of another MinHash for fast initialization.
                permutations=params.get("permutations", None))
        elif self.hash_type in {
                "weightedminhashlsh", "mhlsh", "WeightedMinHashLSH", "wmhlsh",
                "MinHashLSH"
        }:  # weighted MinHash locality-sensitive hashing
            # WeightedMinHashLSH() is equivalent to MinHashLSH: weighted Jaccard similarity queries.
            # Does not support top-k queries; MinHashLSHForest does.
            self.hash = MinHashLSH(
                threshold=params.get("threshold", 0.9),  # Jaccard similarity threshold
                # Number of permutation functions (the sample size for weighted MinHash).
                num_perm=params.get("num_perm", 128),
                # Tuple, optional; weights used when optimizing for the Jaccard threshold.
                weights=params.get("weights", (0.5, 0.5)),
                params=params.get("params", None),  # tuple, optional: the number and size of bands
                storage_config=params.get("storage_config", None),  # storage configuration
                prepickle=params.get("prepickle", None))  # whether keys are pickled before storage
        elif self.hash_type in {"leanminhash", "lmh", "LeanMinHash", "LMH"}:
            # A lower-memory version of MinHash.
            self.hash = LeanMinHash(minhash=params.get("minhash", None),
                                    seed=params.get("seed", None),
                                    hashvalues=params.get("hashvalues", None))

        elif self.hash_type in {
                "MinHashLSHForest", "minhashlshforest", "mhlshf", "MHLSHF"
        }:
            self.hash = MinHashLSHForest(num_perm=params.get("num_perm", 128),
                                         l=params.get("l", 8))

        elif self.hash_type in {
                "MinHashLSHEnsemble", "minhashlshensemble", "mhlshe", "MHLSHE"
        }:
            # MinHash LSH with a different similarity measure: containment.
            self.hash = MinHashLSHEnsemble(
                threshold=params.get("threshold", 0.9),
                num_perm=params.get("num_perm", 128),
                num_part=params.get("num_part", 16),
                m=params.get("m", 8),
                weights=params.get("weights", (0.5, 0.5)),
                storage_config=params.get("storage_config", None),
                prepickle=params.get("prepickle", None))

        elif self.hash_type in {"HyperLogLog", "hyperloglog", "hll", "HLL"}:
            # HyperLogLog estimates the cardinality (number of distinct values)
            # of a dataset in a single pass, using a small, fixed amount of memory.
            self.hash = HyperLogLog(
                p=params.get("p", 8),  # precision parameter; plays the role num_perm plays for MinHash
                reg=params.get("reg", None),
                hashfunc=params.get("hashfunc", sha1_hash32),  # internal hash function
                hashobj=params.get("hashobj", None))  # deprecated, replaced by hashfunc

        elif self.hash_type in {
                "hyperloglogplusplus", "HyperLogLogPlusPlus", "HyperLogLog++",
                "hyperloglog++", "HLLPP", "hllpp", "HLL++", "hll++"
        }:
            # Same interface as HyperLogLog.
            self.hash = HyperLogLogPlusPlus(
                p=params.get("p", 8),
                reg=params.get("reg", None),
                hashfunc=params.get("hashfunc", sha1_hash64),  # a 64-bit hash function
                hashobj=params.get("hashobj", None))

        else:
            raise ValueError("Choose a valid hash type")
Example #5
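This snippet assumes set1, set2, set3 and their MinHashes m1, m2, m3 were built earlier, as in the datasketch README; a minimal setup compatible with the queries below might look like:

from datasketch import MinHash, MinHashLSHEnsemble

set1 = set(['cat', 'dog', 'fish', 'cow'])
set2 = set(['cat', 'dog', 'fish', 'cow', 'pig', 'elephant'])
set3 = set(['cat', 'dog', 'pig', 'elephant', 'lion', 'tiger'])

m1, m2, m3 = MinHash(num_perm=128), MinHash(num_perm=128), MinHash(num_perm=128)
for s, m in ((set1, m1), (set2, m2), (set3, m3)):
    for d in s:
        m.update(d.encode('utf8'))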
# Create an LSH Ensemble index with a threshold
lshensemble = MinHashLSHEnsemble(threshold=0.8, num_perm=128)

# Index takes an iterable of (key, minhash, size)
lshensemble.index([("m2", m2, len(set2)), ("m3", m3, len(set3))])

# Check for membership using the key
print("m2" in lshensemble)
print("m3" in lshensemble)

# Using m1 as the query, get a result iterator
print("Sets with containment > 0.2:")
for key in lshensemble.query(m1, len(set1)):
    print(key)

from datasketch import HyperLogLog, HyperLogLogPlusPlus

data1 = [
    'hyperloglog', 'is', 'a', 'probabilistic', 'data', 'structure', 'for',
    'estimating', 'the', 'cardinality', 'of', 'dataset', 'dataset', 'a'
]

h = HyperLogLogPlusPlus(p=12)
for d in data1:
    h.update(d.encode('utf8'))
print("Estimated cardinality is", h.count())

s1 = set(data1)
print("Actual cardinality is", len(s1))
Example #6
 def _get_estimator_instance(self) -> object:
     """
     create HyperLogLogPlusPlus cardinality estimator class instance
     :return:
     """
     return HyperLogLogPlusPlus(p=self.hyperloglog_accuracy)
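The p passed above (self.hyperloglog_accuracy) sets the precision/memory trade-off: the sketch keeps m = 2**p registers, and the standard relative error of a HyperLogLog estimate is about 1.04/sqrt(m). A quick back-of-the-envelope sketch:

# Expected accuracy and register count for a few precision values.
for p in (8, 12, 16):
    m = 2 ** p
    print('p={:2d}  registers={:6d}  expected error ~ {:.2%}'.format(
        p, m, 1.04 / m ** 0.5))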
Example #7
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""
A test of two external HyperLogLog libraries:
for `import hyperloglog`:
    https://github.com/svpcom/hyperloglog
for `from datasketch import HyperLogLogPlusPlus`:
    https://github.com/ekzhu/datasketch
"""
import os
from sys import path

# import hyperloglog
from datasketch import HyperLogLogPlusPlus

path.insert(
    0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../common')))
from file_operations import read_array_file

if __name__ == '__main__':
    arr = read_array_file('./hyperloglog/data10.txt', True)
    # hll = hyperloglog.HyperLogLog(0.01)  # accept 1% counting error
    hll = HyperLogLogPlusPlus(p=16)
    cnt = len(arr)
    # 2103130 is the known number of distinct values in data10.txt
    print('count = {}; distinct = {}'.format(cnt, 2103130))
    for i in arr:
        hll.update(str(i).encode('utf8'))
        # hll.add(str(i))
    print(hll.count())
    print(hll.digest())
    # print(len(hll))
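The commented-out lines above refer to the svpcom hyperloglog package, whose API takes a target error rate instead of p and is read with len(); a minimal sketch of that variant, assuming the package is installed:

import hyperloglog  # https://github.com/svpcom/hyperloglog

hll = hyperloglog.HyperLogLog(0.01)  # accept ~1% counting error
for i in (1, 2, 2, 3):
    hll.add(str(i))
print(len(hll))  # approximately 3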
Example #8
    _cln_value: 2
})
InputDataSet = pd.DataFrame(inp)

# calc hll
hll_dict = {}
for index, row in InputDataSet.iterrows():
    key = '{}_{}'.format(row[_cln_end_date].strftime("%d/%m/%Y"),
                         row[_cln_host_id])
    print(key)
    if key in hll_dict:
        print('in')
        hll = hll_dict[key][_cln_hll]
    else:
        print('not')
        hll = HyperLogLogPlusPlus(p=12)  # max p=16
        # if row[_cln_hll]: # init from exist hash
        #     _arr= np.fromstring(digest, dtype=int, sep=" ")
        #     hll = HyperLogLogPlusPlus(reg= _arr)
        hll_dict[key] = {
            _cln_hll: hll,
            _cln_end_date: row[_cln_end_date],
            _cln_host_id: row[_cln_host_id]
        }
    hll.update(str(row[_cln_value]).encode('utf8'))

out = []
# prepare output
for key, value in hll_dict.items():
    hll = value[_cln_hll]
    buf = bytearray(hll.bytesize())
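The snippet is cut off after allocating the buffer, presumably just before the register state is written into it. A version-independent way to persist each sketch is to pickle it; HyperLogLogPlusPlus objects are plain Python objects and round-trip cleanly:

import pickle

from datasketch import HyperLogLogPlusPlus

hll = HyperLogLogPlusPlus(p=12)
for v in ('10', '20', '10'):
    hll.update(v.encode('utf8'))

# Round-trip through bytes, e.g. for storing in a database column.
blob = pickle.dumps(hll)
restored = pickle.loads(blob)
assert restored.count() == hll.count()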
Example #9
class ColumnSketch:
    """A Column Sketch contains a summary of a table column. 

    Args:
        column_name: the extracted column name.
        minhash_size: the number of permutations to use for MinHash.
        minhash_seed: the random seed used by MinHash.
        hyperloglog_p: the precision parameter used by HyperLogLog.
        sample_size: the size of sample to be kept.
        enable_word_vector_data: whether to build word embedding vector for 
            data values -- can be 10x more expensive.
    """
    def __init__(
        self,
        column_name,
        minhash_size=256,
        minhash_seed=43,
        hyperloglog_p=8,
        sample_size=100,
        enable_word_vector_data=False,
        model=WordVectorModel,
    ):
        self._column_name = column_name
        self._sample = set()
        self._sample_size = sample_size
        self._count = 0
        self._empty_count = 0
        self._oov_count = 0
        self._numeric_count = 0
        self._minhash = MinHash(num_perm=minhash_size,
                                seed=minhash_seed,
                                hashfunc=self._hashfunc32)
        self._hhl = HyperLogLogPlusPlus(p=hyperloglog_p,
                                        hashfunc=self._hashfunc64)
        self._enabled_word_vec_data = enable_word_vector_data
        self._model = model
        self._sum_vector = self._model.get_empty_word_vector()

    def _hashfunc32(self, str_value):
        return farmhash.hash32(str_value)

    def _hashfunc64(self, str_value):
        return farmhash.hash64(str_value)

    @property
    def column_name(self):
        """The extracted column name.
        """
        return self._column_name

    @property
    def sample(self):
        """A sample (non-random) of the data values in the column as a list.
        """
        return list(self._sample)

    @property
    def count(self):
        """The total number of data values (i.e. rows) including
        the empty ones.
        """
        return self._count

    @property
    def empty_count(self):
        """The number of empty data values.
        """
        return self._empty_count

    @property
    def non_empty_count(self):
        """The number of non-empty data values.
        """
        return self._count - self._empty_count

    @property
    def out_of_vocabulary_count(self):
        """The number of data values that are non-empty and outside of
        the language model's vocabulary.
        """
        return self._oov_count

    @property
    def in_vocabulary_count(self):
        """The number of data values that are non-empty and in
        the language model's vocabulary.
        """
        return self._count - self._empty_count - self._oov_count

    @property
    def numeric_count(self):
        """The number of data values that are non-empty and numerical.
        """
        return self._numeric_count

    @property
    def is_numeric(self):
        """Whether the column is numeric, based on if at least 50% of rows
        are numeric.
        """
        if self.non_empty_count == 0:
            return False
        return (float(self._numeric_count) /
                float(self.non_empty_count)) >= 0.5

    @property
    def distinct_count(self):
        """The approximate distinct count made by the HyperLogLog.
        """
        if len(self._sample) < self._sample_size:
            return len(self._sample)
        return max(len(self._sample), self._hhl.count())

    @property
    def word_vector_column_name(self):
        """The word embedding vector of the column name as a list.
        """
        doc = self._model.process(self.column_name)
        vectors = [token.vector for token in doc if token.has_vector]
        if len(vectors) == 0:
            return None
        return list(float(v) for v in np.sum(vectors, axis=0))

    @property
    def word_vector_data(self):
        """The mean word embedding vector of all data values as a list.
        """
        if not self._enabled_word_vec_data:
            return None
        if self.in_vocabulary_count == 0:
            return None
        vector = self._sum_vector / np.float32(self.in_vocabulary_count)
        return list(float(v) for v in vector)

    @property
    def minhash(self):
        """The hash values in the MinHash.
        """
        return list(int(v) for v in self._minhash.digest())

    @property
    def seed(self):
        """The random seed used for MinHash.
        """
        return self._minhash.seed

    @property
    def hyperloglog(self):
        """The register values of the HyperLogLog counter.
        """
        return list(int(v) for v in self._hhl.digest())

    def update(self, value):
        """Add a data value into the sketch.
        """
        # Update counter.
        self._count += 1
        if not isinstance(value, str):
            value = json.dumps(value, sort_keys=True)
        # Clean the value
        value = value.strip().lower()
        # Skip if the value is empty string.
        if len(value) == 0:
            self._empty_count += 1
            return
        if _is_number(value):
            self._numeric_count += 1
        # Add to sample.
        if len(self._sample) < self._sample_size:
            self._sample.add(value)
        # Update the MinHash sketch.
        self._minhash.update(value)
        # Update the HyperLogLog sketch.
        self._hhl.update(value)
        # Skip word vector extraction if not enabled.
        if not self._enabled_word_vec_data:
            return
        # Update the sum of word embeddings.
        vectors = [
            token.vector for token in self._model.process(value)
            if token.has_vector
        ]
        if len(vectors) > 0:
            self._sum_vector += np.sum(vectors, axis=0)
        else:
            self._oov_count += 1
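A usage sketch for ColumnSketch, assuming the surrounding module provides _is_number and WordVectorModel (with a usable get_empty_word_vector()) and that farmhash is installed; word vectors stay disabled so only the counting paths run:

sketch = ColumnSketch('city', sample_size=10)
for value in ['Berlin', 'Paris', 'Paris', '', '42']:
    sketch.update(value)

print(sketch.count)           # 5 rows seen
print(sketch.empty_count)     # 1 empty value
print(sketch.numeric_count)   # 1 numeric value ('42')
print(sketch.distinct_count)  # small sample, so exact: 3 distinct values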