def test_xxh64(self):
    """Compare xxh64 integer digests of 'a' with fixed reference values.

    The first check relies on the default seed; the remaining ones pass the
    seed positionally (0, 1 and the largest unsigned 64-bit value).
    """
    self.assertEqual(xxhash.xxh64('a').intdigest(), 15154266338359012955)

    reference_digests = (
        (0, 15154266338359012955),
        (1, 16051599287423682246),
        (2**64 - 1, 6972758980737027682),
    )
    for seed_value, expected_digest in reference_digests:
        self.assertEqual(
            xxhash.xxh64('a', seed_value).intdigest(), expected_digest)
def test_xxh64_reset(self):
    """reset() must bring a fed hasher back to the digest of a fresh one."""
    hasher = xxhash.xxh64()
    # Digest of the untouched hasher is the reference value.
    pristine_digest = hasher.intdigest()

    # Feed random chunks of 10..49 bytes so the internal state really changes.
    for chunk_size in range(10, 50):
        hasher.update(os.urandom(chunk_size))

    hasher.reset()

    self.assertEqual(pristine_digest, hasher.intdigest())
def test_xxh64_copy(self):
    """copy() must yield an independent hasher carrying the same state.

    The copy agrees with the source right after copying, diverges once only
    the copy is updated, and agrees again after the source receives the same
    update.
    """
    digest_accessors = ('digest', 'intdigest', 'hexdigest')

    source = xxhash.xxh64()
    source.update('xxhash')
    clone = source.copy()
    for accessor in digest_accessors:
        self.assertEqual(
            getattr(source, accessor)(), getattr(clone, accessor)())

    clone.update('xxhash')
    for accessor in digest_accessors:
        self.assertNotEqual(
            getattr(source, accessor)(), getattr(clone, accessor)())

    source.update('xxhash')
    for accessor in digest_accessors:
        self.assertEqual(
            getattr(source, accessor)(), getattr(clone, accessor)())
def test_xxh64_update(self):
    """Incremental update() must match one-shot hashing of the same data.

    The string 'abc' is fed one character at a time; after each character the
    streaming digest is compared with both the one-shot object digest and the
    module-level xxh64_digest() helper. The sequence runs once with the
    default seed and once with a randomly drawn seed.
    """
    growing_inputs = ('a', 'ab', 'abc')

    streaming = xxhash.xxh64()
    for text in growing_inputs:
        # Only the newest character is fed; the hasher keeps the prefix state.
        streaming.update(text[-1])
        self.assertEqual(xxhash.xxh64(text).digest(), streaming.digest())
        self.assertEqual(xxhash.xxh64_digest(text), streaming.digest())

    seed = random.randint(0, 2**64)
    streaming = xxhash.xxh64(seed=seed)
    for text in growing_inputs:
        streaming.update(text[-1])
        self.assertEqual(
            xxhash.xxh64(text, seed).digest(), streaming.digest())
        self.assertEqual(xxhash.xxh64_digest(text, seed), streaming.digest())
def test_xxh64_overflow(self):
    """Seeds that differ only above bit 63 must produce identical results.

    For each pair of seeds the hasher objects must report the same seed and
    the same digests, and the one-shot helpers called with either seed must
    agree with the first hasher.
    """
    s = 'I want an unsigned 64-bit seed!'
    equivalent_seeds = (
        (0, 2**64),
        (1, 2**64 + 1),
        (2**65 - 1, 2**66 - 1),
    )

    for first_seed, second_seed in equivalent_seeds:
        a = xxhash.xxh64(s, seed=first_seed)
        b = xxhash.xxh64(s, seed=second_seed)

        # Object-level agreement between the two hashers.
        self.assertEqual(a.seed, b.seed)
        self.assertEqual(a.intdigest(), b.intdigest())
        self.assertEqual(a.hexdigest(), b.hexdigest())
        self.assertEqual(a.digest(), b.digest())

        # One-shot helpers must agree with the first hasher for both seeds.
        self.assertEqual(a.intdigest(),
                         xxhash.xxh64_intdigest(s, seed=first_seed))
        self.assertEqual(a.intdigest(),
                         xxhash.xxh64_intdigest(s, seed=second_seed))
        self.assertEqual(a.digest(), xxhash.xxh64_digest(s, seed=first_seed))
        self.assertEqual(a.digest(), xxhash.xxh64_digest(s, seed=second_seed))
        self.assertEqual(a.hexdigest(),
                         xxhash.xxh64_hexdigest(s, seed=first_seed))
        self.assertEqual(a.hexdigest(),
                         xxhash.xxh64_hexdigest(s, seed=second_seed))
def _grouped_update_probability(hll_reg2, hll_vec2, grouped_hll, num_bits):
    """Return the un-normalised "probability to modify the sketch" sum.

    For every register ``i`` the sum receives ``4.0 / 2**hll_reg2[i]`` and,
    for each bit ``k`` that is still 0 in each of the register's
    ``grouped_hll`` bit vectors, ``1 / 2**(hll_reg2[i] - num_bits + 1 + k)``.
    The summation order is fixed so floating-point results are reproducible.
    """
    # NOTE(review): the constant 4.0 presumably assumes grouped_hll == 4 --
    # confirm before running with a different group size.
    total = 0
    for i in range(len(hll_reg2)):
        total += 4.0 / (1 << hll_reg2[i])
        for j in range(grouped_hll):
            for k in range(num_bits):
                total += (((~hll_vec2[i][j] >> k) & 1) + 0.0) / (
                    1 << (hll_reg2[i] - num_bits + 1 + k))
    return total


def sim_run(threadName, seed, cardinalities, r_thr, hll_bits, grouped_hll,
            num_bits):
    """Run the streaming-HLL vs. grouped streaming-HLL simulation on Zipf data.

    For every target cardinality a stream of Zipf-distributed items is drawn
    until it holds that many distinct values; the stream is then shuffled and
    replayed ``r_thr`` times, each time with a different hash seed, through
    both sketches.

    Args:
        threadName: Label used in console output and as the prefix of the two
            output file names.
        seed: Seeds ``np.random`` and is the base of the per-run hash seed.
        cardinalities: Iterable of target numbers of distinct items.
        r_thr: Number of runs per cardinality.
        hll_bits: The sketch has ``1 << hll_bits`` cells; the top ``hll_bits``
            bits of the 64-bit hash select the cell.
        grouped_hll: Number of cells that share one register in the grouped
            sketch.
        num_bits: Width of the bit vector kept per cell in the grouped sketch.

    Side effects:
        Overwrites ``<threadName>_<hll_bits>-<grouped_hll>_<num_bits>.txt``
        with one tab-separated line per run (run index, streaming estimate,
        last ``q_st / hll_size``, grouped streaming estimate, final
        ``q_st2 / hll_size``, exact distinct count, item count, and the two
        accumulated update times in seconds) and appends a summary (means and
        standard deviations) to ``<threadName>_timing.txt``.
        Calls ``sys.exit(2)`` if either file cannot be opened.
    """
    np.random.seed(seed)
    start = datetime.now()

    f2 = (threadName + "_" + str(hll_bits) + "-" + str(grouped_hll) + "_" +
          str(num_bits) + ".txt")
    f3 = threadName + "_timing.txt"
    try:
        fr2 = open(f2, 'w')
        fr3 = open(f3, 'a')
    except IOError:
        # open() signals failure with IOError/OSError; the former
        # ``except IndexError`` could never be triggered.
        print("Error: Filename ")
        sys.exit(2)

    timer_list = []
    timer1_list = []
    hll_st_list = []
    hll_st1_list = []
    hll_st2_list = []
    gelement_list = []
    abs_err_list = []
    abs_err1_list = []
    abs_err2_list = []

    hll_size = 1 << hll_bits
    mask_bits = (1 << num_bits) - 1
    # Floor division keeps num_reg an int on Python 3 as well; it is used as
    # a list repeat count and as a range() bound.
    num_reg = hll_size // grouped_hll
    zero_time = start - start  # timedelta(0) without an extra import

    # The context manager closes both files even if a run raises.
    with fr2, fr3:
        for car_thr in cardinalities:
            # Draw items until car_thr distinct values were seen; duplicates
            # stay in the stream on purpose.
            gelement = {}
            hll_items = []
            while len(gelement) < car_thr:
                hll_item = str(np.random.zipf(1.0001, 1))
                gelement[hll_item] = 1
                hll_items.append(hll_item)

            for runs in range(0, r_thr):
                xseed = seed + (12 + runs) * runs
                random.shuffle(hll_items)
                print(threadName, car_thr, runs, hll_bits, grouped_hll,
                      num_bits)

                # ---- HLL streaming state
                hll_reg0 = [0] * hll_size
                q_st = float(hll_size)
                hll_st = 0
                timer0 = zero_time

                # ---- exact counting state
                gelement = {}
                tot_pkts = 0

                # ---- grouped HLL streaming state
                hll_reg2 = [num_bits] * num_reg
                hll_vec2 = {}
                for i in range(0, num_reg):
                    hll_vec2[i] = [0] * grouped_hll
                hll_st2 = 0
                hll_st1 = 0
                timer1 = zero_time

                for hll_item in hll_items:
                    tot_pkts = tot_pkts + 1
                    gelement[hll_item] = 1

                    hash0 = xxhash.xxh64(hll_item.encode('ascii'),
                                         seed=xseed).intdigest()

                    # ----- plain streaming HLL
                    index0 = hash0 >> (64 - hll_bits)
                    # hll_rank is defined elsewhere in the file; presumably
                    # it returns the HLL rank of the non-index hash bits.
                    rank0 = hll_rank(hash0, hll_bits)
                    start_0 = datetime.now()
                    if rank0 > hll_reg0[index0]:
                        # Recompute the update probability from scratch,
                        # then credit the estimator before the register
                        # changes.
                        q_st = 0
                        for i in range(0, hll_size):
                            q_st += 1.0 / (1 << hll_reg0[i])
                        hll_st += float(hll_size) / q_st
                        hll_reg0[index0] = rank0
                    end_0 = datetime.now()
                    timer0 += end_0 - start_0

                    # ----- grouped streaming HLL (same hash)
                    hash2 = hash0
                    index2 = hash2 >> (64 - hll_bits)
                    # Floor division: reg_index is used as a list/dict index.
                    reg_index = index2 // grouped_hll
                    vec_index = index2 % grouped_hll
                    rank2 = hll_rank(hash2, hll_bits)
                    start_1 = datetime.now()
                    if rank2 > hll_reg2[reg_index]:
                        # Probability is taken BEFORE the sketch is modified.
                        q_st2 = _grouped_update_probability(
                            hll_reg2, hll_vec2, grouped_hll, num_bits)
                        # Raise the shared register and shift every vector
                        # of the group so its bits stay relative to the new
                        # register value.
                        shift = rank2 - hll_reg2[reg_index]
                        hll_reg2[reg_index] = rank2
                        for j in range(0, grouped_hll):
                            hll_vec2[reg_index][j] = (
                                hll_vec2[reg_index][j] >> shift) & mask_bits
                        hll_vec2[reg_index][vec_index] |= (
                            1 << (num_bits - 1)) & mask_bits
                        hll_st2 += float(hll_size) / q_st2
                    elif hll_reg2[reg_index] - rank2 < num_bits:
                        bit_pos = num_bits - 1 - hll_reg2[reg_index] + rank2
                        if ((hll_vec2[reg_index][vec_index] >> bit_pos)
                                & 1) == 0:
                            q_st2 = _grouped_update_probability(
                                hll_reg2, hll_vec2, grouped_hll, num_bits)
                            hll_st2 += float(hll_size) / q_st2
                            hll_vec2[reg_index][vec_index] |= (
                                1 << bit_pos) & mask_bits
                    end_1 = datetime.now()
                    timer1 += end_1 - start_1

                # Update probability of the final sketch, at end of stream.
                q_st2 = _grouped_update_probability(
                    hll_reg2, hll_vec2, grouped_hll, num_bits)
                hll_st1 = q_st2 / float(hll_size)

                fr2.write("\t".join([
                    str(runs),
                    str(round(hll_st, 3)),
                    str(round(q_st / float(hll_size), 6)),
                    str(round(hll_st2, 3)),
                    str(round(hll_st1, 6)),
                    str(len(gelement)),
                    str(tot_pkts),
                    str(timer0.total_seconds()),
                    str(timer1.total_seconds()),
                ]) + "\n")
                print(threadName, runs, hll_st, q_st / float(hll_size),
                      hll_st2, hll_st1, len(gelement), tot_pkts,
                      (timer0.total_seconds()), (timer1.total_seconds()))

                hll_st_list.append(hll_st)
                hll_st1_list.append(hll_st1)
                hll_st2_list.append(hll_st2)
                gelement_list.append(len(gelement))
                abs_err_list.append(abs(hll_st - len(gelement)))
                abs_err1_list.append(abs(hll_st1 - len(gelement)))
                abs_err2_list.append(abs(hll_st2 - len(gelement)))
                timer_list.append(timer0)
                timer1_list.append(timer1)

        fr3.write("------------------------------------------\n")
        now = datetime.now()
        fr3.write(threadName + '-command line: ' + ' '.join(sys.argv) + '\n')
        fr3.write('simulation time: ' + str(now - start) + '\n')
        fr3.write("==========================================\n")
        fr3.write(
            str(np.mean(hll_st_list)) + " " + str(np.mean(hll_st2_list)) +
            " " + str(np.mean(gelement_list)) + " - " +
            str(np.mean(abs_err_list)) + " " + str(np.mean(abs_err2_list)) +
            "\n")
        fr3.write(
            str(np.std(hll_st_list)) + " " + str(np.std(hll_st2_list)) +
            " " + str(np.std(gelement_list)) + " - " +
            str(np.std(abs_err_list)) + " " + str(np.std(abs_err2_list)) +
            "\n")
def _grouped_update_probability(hll_reg2, hll_vec2, grouped_hll, num_bits):
    """Return the un-normalised "probability to modify the sketch" sum.

    For every register ``i`` the sum receives ``4.0 / 2**hll_reg2[i]`` and,
    for each bit ``k`` that is still 0 in each of the register's
    ``grouped_hll`` bit vectors, ``1 / 2**(hll_reg2[i] - num_bits + 1 + k)``.
    The summation order is fixed so floating-point results are reproducible.
    """
    # NOTE(review): the constant 4.0 presumably assumes grouped_hll == 4 --
    # confirm before running with a different group size.
    total = 0
    for i in range(len(hll_reg2)):
        total += 4.0 / (1 << hll_reg2[i])
        for j in range(grouped_hll):
            for k in range(num_bits):
                total += (((~hll_vec2[i][j] >> k) & 1) + 0.0) / (
                    1 << (hll_reg2[i] - num_bits + 1 + k))
    return total


def _decode_link_layer(buf, is_sll):
    """Decode one captured frame; return ``(ether_type, payload)``."""
    if is_sll:
        frame = dpkt.sll.SLL(buf)
        # In the Linux cooked header ``type`` is the packet direction; the
        # EtherType is stored in ``ethtype``.
        return frame.ethtype, frame.data
    frame = dpkt.ethernet.Ethernet(buf)
    return frame.type, frame.data


def _flow_key(ip, parsing_mode):
    """Build the string fed to the sketches from a decoded IPv4 packet.

    ``parsing_mode`` 2 yields the destination address, 3 yields
    ``"src dst"``, 4 yields ``"src dst sport dport"``; any other value yields
    the source address. Ports are 0 unless the packet is TCP/UDP, longer than
    24 bytes, and its payload exposes ``sport``/``dport``.
    """
    src = socket.inet_ntop(socket.AF_INET, ip.src)
    dst = socket.inet_ntop(socket.AF_INET, ip.dst)
    sport = 0
    dport = 0
    try:
        if (ip.p in (dpkt.ip.IP_PROTO_TCP, dpkt.ip.IP_PROTO_UDP)
                and len(ip) > 24):
            transport = ip.data
            sport = transport.sport
            dport = transport.dport
    except AttributeError:
        # Deliberate best effort: a payload without port attributes keeps
        # whatever port values were set so far (normally 0).
        pass

    if parsing_mode == 2:
        return dst
    if parsing_mode == 3:
        return src + " " + dst
    if parsing_mode == 4:
        return src + " " + dst + " " + str(sport) + " " + str(dport)
    return src


def sim_run(threadName, seed, hll_bits, grouped_hll, num_bits, filenames,
            parsing_mode=4, num_runs=10):
    """Run the streaming-HLL vs. grouped streaming-HLL simulation on pcaps.

    Every capture file is replayed ``num_runs`` times, each time with a
    different hash seed; non-IPv4 frames are skipped and every remaining
    packet contributes one key (see ``parsing_mode``) to both sketches.

    Args:
        threadName: Label used in console output and as the prefix of the two
            output file names.
        seed: Seeds ``np.random`` and is the base of the per-run hash seed.
        hll_bits: The sketch has ``1 << hll_bits`` cells; the top ``hll_bits``
            bits of the 64-bit hash select the cell.
        grouped_hll: Number of cells that share one register in the grouped
            sketch.
        num_bits: Width of the bit vector kept per cell in the grouped sketch.
        filenames: Iterable of pcap file paths.
        parsing_mode: Key layout: 2 = destination address, 3 = "src dst",
            4 = "src dst sport dport" (default, the previously hard-coded
            value), anything else = source address.
        num_runs: Replays per capture file (default 10, the previously
            hard-coded value).

    Side effects:
        Overwrites ``<threadName>_<hll_bits>-<grouped_hll>_<num_bits>.txt``
        with one tab-separated line per run (run index, streaming estimate,
        last ``q_st / hll_size``, grouped streaming estimate, final
        ``q_st2 / hll_size``, exact distinct count, packet count, file name,
        and the two accumulated update times in seconds) and appends a
        summary to ``<threadName>_timing.txt``.
        Calls ``sys.exit(2)`` if an output or capture file cannot be opened.
    """
    start = datetime.now()
    np.random.seed(seed)

    f2 = (threadName + "_" + str(hll_bits) + "-" + str(grouped_hll) + "_" +
          str(num_bits) + ".txt")
    f3 = threadName + "_timing.txt"
    try:
        fr2 = open(f2, 'w')
        fr3 = open(f3, 'a')
    except IOError:
        # open() signals failure with IOError/OSError; the former
        # ``except IndexError`` could never be triggered.
        print("Error: Filename ")
        sys.exit(2)

    hll_st_list = []
    hll_st1_list = []
    hll_st2_list = []
    gelement_list = []
    abs_err_list = []
    abs_err1_list = []
    abs_err2_list = []

    hll_size = 1 << hll_bits
    mask_bits = (1 << num_bits) - 1
    # Floor division keeps num_reg an int on Python 3 as well; it is used as
    # a list repeat count and as a range() bound.
    num_reg = hll_size // grouped_hll
    zero_time = start - start  # timedelta(0) without an extra import

    # The context manager closes both files even if a run raises.
    with fr2, fr3:
        runis = 0
        for filename in filenames:
            runis = runis + 1
            for runs in range(0, num_runs):
                xseed = seed + (12 + runs) * (runs + runis)
                try:
                    # pcap is a binary format: open in 'rb' so the reader
                    # receives bytes on every platform / Python version.
                    fr1 = open(filename, 'rb')
                except IOError:
                    print("Error: Filename ")
                    sys.exit(2)
                print(threadName, filename, runs, hll_bits, grouped_hll,
                      num_bits)

                # ---- HLL streaming state
                hll_reg0 = [0] * hll_size
                q_st = float(hll_size)
                hll_st = 0
                timer0 = zero_time

                # ---- exact counting state
                gelement = {}
                tot_pkts = 0

                # ---- grouped HLL streaming state
                hll_reg2 = [num_bits] * num_reg
                hll_vec2 = {}
                for i in range(0, num_reg):
                    hll_vec2[i] = [0] * grouped_hll
                hll_st2 = 0
                hll_st1 = 0
                timer1 = zero_time

                # Close the capture after each replay instead of leaking one
                # handle per run.
                with fr1:
                    pcap = dpkt.pcap.Reader(fr1)
                    is_sll = pcap.datalink() == dpkt.pcap.DLT_LINUX_SLL
                    for ts, buf in pcap:
                        try:
                            ether_type, ip = _decode_link_layer(buf, is_sll)
                        except NotImplementedError:
                            print("Not Implemented for pkt: " +
                                  str(tot_pkts))
                            # Nothing was decoded for this packet; skip it
                            # rather than reuse the previous frame.
                            continue
                        if ether_type != dpkt.ethernet.ETH_TYPE_IP:
                            continue

                        hll_item = _flow_key(ip, parsing_mode)
                        tot_pkts = tot_pkts + 1
                        gelement[hll_item] = 1

                        hash0 = xxhash.xxh64(hll_item.encode('ascii'),
                                             seed=xseed).intdigest()

                        # ----- plain streaming HLL
                        index0 = hash0 >> (64 - hll_bits)
                        # hll_rank is defined elsewhere in the file;
                        # presumably it returns the HLL rank of the
                        # non-index hash bits.
                        rank0 = hll_rank(hash0, hll_bits)
                        start_0 = datetime.now()
                        if rank0 > hll_reg0[index0]:
                            # Recompute the update probability from scratch,
                            # then credit the estimator before the register
                            # changes.
                            q_st = 0
                            for i in range(0, hll_size):
                                q_st += 1.0 / (1 << hll_reg0[i])
                            hll_st += float(hll_size) / q_st
                            hll_reg0[index0] = rank0
                        end_0 = datetime.now()
                        timer0 += end_0 - start_0

                        # ----- grouped streaming HLL (same hash)
                        hash2 = hash0
                        index2 = hash2 >> (64 - hll_bits)
                        # Floor division: reg_index is used as an index.
                        reg_index = index2 // grouped_hll
                        vec_index = index2 % grouped_hll
                        rank2 = hll_rank(hash2, hll_bits)
                        start_1 = datetime.now()
                        if rank2 > hll_reg2[reg_index]:
                            # Probability is taken BEFORE the sketch is
                            # modified.
                            q_st2 = _grouped_update_probability(
                                hll_reg2, hll_vec2, grouped_hll, num_bits)
                            # Raise the shared register and shift every
                            # vector of the group so its bits stay relative
                            # to the new register value.
                            shift = rank2 - hll_reg2[reg_index]
                            hll_reg2[reg_index] = rank2
                            for j in range(0, grouped_hll):
                                hll_vec2[reg_index][j] = (
                                    hll_vec2[reg_index][j] >> shift
                                ) & mask_bits
                            hll_vec2[reg_index][vec_index] |= (
                                1 << (num_bits - 1)) & mask_bits
                            hll_st2 += float(hll_size) / q_st2
                        elif hll_reg2[reg_index] - rank2 < num_bits:
                            bit_pos = (num_bits - 1 - hll_reg2[reg_index] +
                                       rank2)
                            if ((hll_vec2[reg_index][vec_index] >> bit_pos)
                                    & 1) == 0:
                                q_st2 = _grouped_update_probability(
                                    hll_reg2, hll_vec2, grouped_hll,
                                    num_bits)
                                hll_st2 += float(hll_size) / q_st2
                                hll_vec2[reg_index][vec_index] |= (
                                    1 << bit_pos) & mask_bits
                        end_1 = datetime.now()
                        timer1 += end_1 - start_1

                # Update probability of the final sketch, at end of stream.
                q_st2 = _grouped_update_probability(
                    hll_reg2, hll_vec2, grouped_hll, num_bits)
                hll_st1 = q_st2 / float(hll_size)

                fr2.write("\t".join([
                    str(runs),
                    str(round(hll_st, 3)),
                    str(round(q_st / float(hll_size), 6)),
                    str(round(hll_st2, 3)),
                    str(round(hll_st1, 6)),
                    str(len(gelement)),
                    str(tot_pkts),
                    filename,
                    str(timer0.total_seconds()),
                    str(timer1.total_seconds()),
                ]) + "\n")
                print(threadName, runs, hll_st, q_st / float(hll_size),
                      hll_st2, hll_st1, len(gelement), tot_pkts, filename,
                      (timer0.total_seconds()), (timer1.total_seconds()))

                hll_st_list.append(hll_st)
                hll_st1_list.append(hll_st1)
                hll_st2_list.append(hll_st2)
                gelement_list.append(len(gelement))
                abs_err_list.append(abs(hll_st - len(gelement)))
                abs_err1_list.append(abs(hll_st1 - len(gelement)))
                abs_err2_list.append(abs(hll_st2 - len(gelement)))

        fr3.write("------------------------------------------\n")
        now = datetime.now()
        fr3.write(threadName + '-command line: ' + ' '.join(sys.argv) + '\n')
        fr3.write('simulation time: ' + str(now - start) + '\n')
        fr3.write("==========================================\n")
        fr3.write(
            str(np.mean(hll_st_list)) + " " + str(np.mean(hll_st2_list)) +
            " " + str(np.mean(gelement_list)) + " - " +
            str(np.mean(abs_err_list)) + " " + str(np.mean(abs_err2_list)) +
            "\n")
        fr3.write(
            str(np.std(hll_st_list)) + " " + str(np.std(hll_st2_list)) +
            " " + str(np.std(gelement_list)) + " - " +
            str(np.std(abs_err_list)) + " " + str(np.std(abs_err2_list)) +
            "\n")