def test_hll_counting(self):
    """Add a single value to two HLL counters and expect a count of 1.

    One counter is a ``HyperLogLogDB`` bound to a temporary file and
    addressed by key; the other is an ``hll.HyperLogLog`` built on the
    object returned by ``self.init_hll_file`` for a second temporary
    file.  Both are created with ``self.error_rate``.
    """
    # Context managers close (and thereby delete) both temporary files
    # even when an assertion below fails.  All use of the counters stays
    # inside the block so the backing files are still open.
    with tempfile.NamedTemporaryFile(mode='r+b') as db_file, \
            tempfile.NamedTemporaryFile(mode='r+b') as hll_file:
        test1 = HyperLogLogDB(fileobj=db_file, error_rate=self.error_rate)
        test1.add('test_key', 'test_val')

        mfile = self.init_hll_file(hll_file)
        test2 = hll.HyperLogLog(self.error_rate, mfile)
        test2.add('test_val')

        self.assertEqual(test1.count('test_key'), 1)
        self.assertEqual(len(test2), 1)
est = (numpy.log(va0) + numpy.log(vb0) - \ numpy.log(vs1 + va0 + vb0 -1)) / \ numpy.log(1 - (1 / sz)) return est if __name__ == '__main__': minc_size = 1024 hllc_exp_error = 0.02 lpc_size = 4 mc1 = mincount.MinCount(minc_size) mc2 = mincount.MinCount(minc_size) hll1 = hyperloglog.HyperLogLog(hllc_exp_error) hll2 = hyperloglog.HyperLogLog(hllc_exp_error) # just for naive implementation hll_total = hyperloglog.HyperLogLog(hllc_exp_error) lpc1 = lp_counters.LPCounter(lpc_size) lpc2 = lp_counters.LPCounter(lpc_size) for i in range(0, 10000): mc1.add(i) hll1.add(i) hll_total.add(i) lpc1.increment(i) for i in range(8000, 20000): mc2.add(i) hll2.add(i)
if __name__ == '__main__': # input_size = 1000000 # 1 million distinct vehicles input_size = 5000 # initial number of distinct vehicles total_vehicles = 254639386 # total number of possible vehicles lpc_size = 128 * 8192 # linear prob counter in KB size hllc_exp_error = 0.02 # preset error rate for hyperloglog counter minc_size = 1024 # k for min count lpc_error_list = [] hllc_error_list = [] while (input_size <= 10000000): print "---------------------------------" print "Input size: ", input_size lpc = lp_counters.LPCounter(lpc_size) hllc = hyperloglog.HyperLogLog(hllc_exp_error) mc = mincount.MinCount(minc_size) print 'lpc size: ', lpc.get_size() items = set() while len(items) < input_size: i = random.randrange(0, total_vehicles) while i in items: i = random.randrange(0, total_vehicles) items.add(i) lpc.increment(i) hllc.add(i) mc.add(i) lpc_count = lpc.current_count()
def _accuracy_percent(estimate, exact):
    """Return how close *estimate* is to *exact*, as a percentage.

    Computes ``100 - (abs(estimate - exact) / exact) * 100``.  When
    *exact* is 0 the relative error is undefined, so NaN is returned
    instead of raising ZeroDivisionError.
    """
    if exact == 0:
        return float("nan")
    return 100 - (abs(estimate - exact) / exact) * 100


def main():
    """Compare brute-force, original HLL and modified HLL cardinalities.

    Runs seven passes with the entry count starting at 10 and multiplied
    by 10 after each pass.  Each pass reads ``<size>data1.txt`` from the
    current directory, counts its distinct lines with ``countDistinct``,
    feeds the same lines to ``hll.HyperLogLog()`` and to the modified
    ``HyperLogLog``, and prints cardinality, elapsed time in
    milliseconds and accuracy for each approach.
    """
    size = 10  # entries per data file; also the file-name prefix
    fixed_hashing = "sha256"
    fixed_bias = 0
    for _ in range(0, 7):
        for j in range(1, 2):
            # NOTE(review): these per-size settings are computed but never
            # used -- the modified HyperLogLog below always receives
            # fixed_hashing / fixed_bias.  Also, size == 100 matches none
            # of the three ranges, so the previous pass's values carry
            # over.  Behaviour is left unchanged here; confirm the
            # intended wiring (and which hash names the class accepts)
            # before passing these through.
            if size < 100:
                change_bias = -1.5
                hashing = "blake2b"
            if 100 < size <= 10000:
                change_bias = 0.5
                hashing = "sha256"
            if size > 10000:
                hashing = "sha512"
                change_bias = 0.1

            file_name = str(size) + "data" + str(j) + ".txt"

            # The brute-force timing deliberately keeps covering the file
            # read as well as the distinct count, as before.
            start1 = time.time()
            with open(file_name, "r") as f:
                content = f.read()
            num = content.split('\n')
            # Drop the empty string produced by a trailing newline.  A
            # file without one has nothing to drop, and an unguarded
            # remove() would raise ValueError.
            if '' in num:
                num.remove('')
            x1 = countDistinct(num)
            end1 = time.time()

            # For both HLL variants only the card() call is timed; the
            # add() loop is outside the measured interval.
            hyLog = hll.HyperLogLog()
            for n in num:
                hyLog.add(n)
            start = time.time()
            x = hyLog.card()
            end = time.time()

            hyLog_mod = HyperLogLog(0.01, fixed_hashing, fixed_bias)
            for n in num:
                hyLog_mod.add(n)
            start_mod = time.time()
            x_mod = hyLog_mod.card()
            end_mod = time.time()

            print(hyLog.p, hyLog.m)
            print("\n")
            print("*************************************************")
            print("File Name - ", file_name)
            print("Number of Entries - ", size)
            print("\n")
            print("Brute Force - ")
            print("Cardinality: ", x1,
                  "\tTimeTaken: ", (end1 - start1) * 1000)
            print("\n")
            print("Original HLL - ")
            print("Cardinality: ", x,
                  "\tTimeTaken: ", (end - start) * 1000,
                  "\nAccuracy: ", _accuracy_percent(x, x1))
            print("\n")
            print("Modified HLL - ")
            print("Cardinality: ", x_mod,
                  "\tTimeTaken: ", (end_mod - start_mod) * 1000,
                  "\nAccuracy: ", _accuracy_percent(x_mod, x1))
            print("*************************************************")
            print("\n")
        size *= 10
log(2 * size_a * exp_error))) * lpc_load_factor hllc_exp_error = exp_error minc_size = int(ceil(96 / (exp_error ** 2))) * \ minc_load_factor # repeat the experiment several times with same setting lpc_a_error = 0 hllc_a_error = 0 mc_a_error = 0 lpc_error = 0 hllc_error = 0 mc_error = 0 for exp_i in range(repeat_cnt): # initiate counters lpc_a = lp_counters.LPCounter(lpc_size) lpc_b = lp_counters.LPCounter(lpc_size) hllc_a = hyperloglog.HyperLogLog(hllc_exp_error) hllc_b = hyperloglog.HyperLogLog(hllc_exp_error) # simplify solution for hll hllc_u = hyperloglog.HyperLogLog(hllc_exp_error) mc_a = mincount.MinCount(minc_size) mc_b = mincount.MinCount(minc_size) # create items common = size_a * js items_a = set() items_b = set() while len(items_a) < size_a: while True: i = random.randrange(0, total_vehicles) if i not in items_a: items_a.add(i) lpc_a.increment(i)