def main():
    """Run fixed-size reservoir sampling over the BlackBox user stream.

    Command-line arguments (read from ``sys.argv``):
        1: input file path handed to ``BlackBox.ask``
        2: stream size (users requested per ask)
        3: number of asks
        4: output CSV path

    After every ask one row is written to the CSV and echoed to stdout:
    the sequence number of the latest user, followed by the reservoir
    entries at indices 0, 20, 40, 60 and 80.
    """
    input_file, output_file = sys.argv[1], sys.argv[4]
    stream_size, num_of_asks = int(sys.argv[2]), int(sys.argv[3])
    bx = BlackBox()
    window_size = 100
    seq_num = 0
    user_list = []
    random.seed(553)  # fixed seed so repeated runs draw the same sample

    with open(output_file, "w") as f:
        f.write("seqnum,0_id,20_id,40_id,60_id,80_id")
        for _ in range(num_of_asks):
            for user in bx.ask(input_file, stream_size):
                seq_num += 1
                if len(user_list) < window_size:
                    # Reservoir not full yet: keep the user unconditionally.
                    # Filling per user (instead of appending the whole first
                    # batch) keeps the reservoir at exactly window_size
                    # entries even when stream_size is not 100, and draws no
                    # random numbers, so the result for stream_size == 100
                    # is unchanged.
                    user_list.append(user)
                    continue
                # Keep the seq_num-th user with chance roughly
                # window_size / seq_num, then evict a random slot.
                if random.randint(0, 100000) % seq_num < window_size:
                    pos = random.randint(0, 100000) % window_size
                    user_list[pos] = user
            row = "{},{},{},{},{},{}".format(
                seq_num, user_list[0], user_list[20], user_list[40],
                user_list[60], user_list[80])
            f.write("\n" + row)
            print(row)
def main(input_file_path, stream_size, num_of_asks, output_file_path):
    """Stream users through the module-level Bloom filter and log the FPR.

    Args:
        input_file_path: path handed to ``BlackBox.ask``.
        stream_size: number of users requested per ask.
        num_of_asks: how many batches to request.
        output_file_path: CSV file to (over)write.

    Writes a ``Time,FPR`` header and then one row per ask: the ask index
    and the running false-positive count divided by the total number of
    users requested so far, ``(it + 1) * stream_size``.

    Side effects: sets entries of the module-level ``bloom_filter`` to
    ``True``.
    """
    bx = BlackBox()
    gt_set = set()  # every user actually seen so far
    fp = 0

    # The context manager closes the file even if an ask or hash call raises.
    with open(output_file_path, "wt") as output_file:
        output_file.write("Time,FPR\n")
        for it in range(num_of_asks):
            stream_users = bx.ask(input_file_path, stream_size)
            for s in stream_users:
                indices = myhashs(s)
                reported_seen = all(bloom_filter[i] for i in indices)
                if not reported_seen:
                    for i in indices:
                        bloom_filter[i] = True
                elif s not in gt_set:
                    # Filter claims "seen" but the user is new.
                    fp += 1
                gt_set.add(s)
            output_file.write(
                "{},{}\n".format(it, fp / ((it + 1) * stream_size)))
def driver():
    """Reservoir-sample the BlackBox stream and collect report rows.

    Reads the input path, stream size and number of asks from ``argv[1]``,
    ``argv[2]`` and ``argv[3]``.

    Returns:
        A list of comma-separated strings, one for every 100th user seen:
        the sequence number followed by the reservoir entries at indices
        0, 20, 40, 60 and 80.
    """
    random.seed(553)
    capacity = 100
    sample = [0] * capacity
    seen = 0
    ask_total = int(argv[3])
    batch_size = int(argv[2])
    box = BlackBox()
    rows = []

    for _ in range(ask_total):
        for user in box.ask(str(argv[1]), batch_size):
            seen += 1
            if seen <= capacity:
                # Still filling the reservoir in arrival order.
                sample[seen - 1] = user
            elif random.randint(0, 100000) % seen < capacity:
                # User is kept: overwrite a randomly chosen slot.
                sample[random.randint(0, 100000) % capacity] = user

            if seen % capacity == 0:
                rows.append(",".join(
                    [str(seen), sample[0], sample[20], sample[40],
                     sample[60], sample[80]]))
    return rows
def main():
    """Feed every BlackBox batch to ``bloom_filtering`` and save the report.

    Uses the module-level ``numOfAsks``, ``inputFile`` and ``streamSize``.
    The text returned by ``bloom_filtering`` for each ask is appended to
    the ``Time,FPR`` header and the result is passed to ``writeToFile``.
    """
    source = BlackBox()
    report = "Time,FPR"
    for ask_no in range(numOfAsks):
        batch = source.ask(inputFile, streamSize)
        report = report + bloom_filtering(ask_no, batch)
    writeToFile(report)
def main():
    """Feed every BlackBox batch to ``flajolet_martin`` and save the report.

    Uses the module-level ``numOfAsks``, ``inputFile`` and ``streamSize``.
    The text returned by ``flajolet_martin`` for each ask is appended to
    the ``Time,Ground Truth,Estimation`` header and the result is passed
    to ``writeToFile``.
    """
    source = BlackBox()
    report = "Time,Ground Truth,Estimation"
    for ask_no in range(numOfAsks):
        batch = source.ask(inputFile, streamSize)
        report = report + flajolet_martin(ask_no, batch)
    writeToFile(report)
def flajolet_martin(input_file_path, stream_size, num_of_asks, output_file_path):
    """Estimate the distinct-user count of each batch and log it to a CSV.

    Args:
        input_file_path: path handed to ``BlackBox.ask``.
        stream_size: number of users requested per ask.
        num_of_asks: how many batches to request.
        output_file_path: CSV file to (over)write.

    Writes a ``Time,Ground Truth,Estimation`` header and then one row per
    ask. For every hash function (``number_of_hashes`` of them, a module
    global) the longest run of trailing zero bits over the batch is
    tracked; ``calculate_count`` (defined elsewhere) turns those maxima
    into the estimate. Finally prints the ratio of summed estimates to
    summed ground truths, if anything was processed.
    """
    bx = BlackBox()
    predicted_sum = 0
    actual_sum = 0

    # The context manager closes the file even if an ask or hash call raises.
    with open(output_file_path, "wt") as output_file:
        output_file.write("Time,Ground Truth,Estimation\n")
        for it in range(num_of_asks):
            stream_users = bx.ask(input_file_path, stream_size)
            max_trailing_zeros = [-sys.maxsize] * number_of_hashes
            for s in stream_users:
                for i, h in enumerate(myhashs(s)):
                    bits = format(h, '016b')
                    # NOTE(review): a hash of 0 formats as sixteen '0'
                    # characters and therefore counts as 16 trailing
                    # zeros - confirm that is intended.
                    trailing = len(bits) - len(bits.rstrip('0'))
                    if trailing > max_trailing_zeros[i]:
                        max_trailing_zeros[i] = trailing

            count = calculate_count(max_trailing_zeros)
            # Ground truth is the number of distinct users in the batch,
            # not the number requested: a batch may contain repeats.
            distinct = len(set(stream_users))
            output_file.write(
                "{},{},{}\n".format(it, distinct, int(count)))
            predicted_sum += count
            actual_sum += distinct

    if actual_sum:  # avoid ZeroDivisionError when nothing was processed
        print(predicted_sum / actual_sum)
def main():
    """Run a Bloom filter over the BlackBox stream and log per-ask FPR.

    Command-line arguments (read from ``sys.argv``):
        1: input file path handed to ``BlackBox.ask``
        2: stream size (users requested per ask)
        3: number of asks
        4: output CSV path

    Writes a ``Time,FPR`` header and then one row per ask, where FPR is
    FP / (FP + TN) over the users of that ask that had never been seen
    before (0.0 when the ask contained no new users).
    """
    input_file, output_file = sys.argv[1], sys.argv[4]
    stream_size, num_of_asks = int(sys.argv[2]), int(sys.argv[3])
    m = 69997
    bx = BlackBox()
    bit_array = [0] * m
    # Must live across asks, just like the bit array: a user who appeared
    # in an earlier ask and is reported as "seen" is a true positive, not
    # a false one.
    visited_users = set()

    with open(output_file, "w") as f:
        f.write("Time,FPR")
        for ask_index in range(num_of_asks):
            stream_users = bx.ask(input_file, stream_size)
            FP, TN = 0, 0
            for user in stream_users:
                if user in visited_users:
                    continue
                hash_values = myhashs(user)
                # "Seen" means every hashed position is already set.
                if all(bit_array[h] == 1 for h in hash_values):
                    FP += 1
                else:
                    TN += 1
                visited_users.add(user)
                for h in hash_values:
                    bit_array[h] = 1

            negatives = FP + TN
            FPR = FP / negatives if negatives else 0.0
            f.write("\n{},{}".format(ask_index, FPR))
def driver():
    """Run Flajolet-Martin on every BlackBox batch and write the results.

    Reads the input path, stream size, number of asks and output path from
    ``argv[1]`` .. ``argv[4]``. For each ask it records the ask index, the
    number of distinct users in the batch and the integer-truncated value
    returned by ``flajolet_martin``.

    Prints the ratio of summed estimates to summed ground truths, then
    writes a ``Time,Ground Truth,Estimation`` CSV with one row per ask.
    """
    bx = BlackBox()
    num_of_asks = int(argv[3])
    stream_size = int(argv[2])

    results = []
    for i in range(num_of_asks):
        stream_users = bx.ask(str(argv[1]), stream_size)
        ground_truth = len(set(stream_users))
        estimation = flajolet_martin(stream_users)
        results.append((i, ground_truth, int(estimation)))

    sum_ground_truth = sum(r[1] for r in results)
    sum_estimations = sum(r[2] for r in results)
    # Report the aggregate ratio; the previous code printed the ratio of
    # whichever row the loop variable happened to hold last.
    if sum_ground_truth:
        print("Final Result = ", sum_estimations / sum_ground_truth)

    with open(str(argv[4]), "w") as file:
        file.write("Time,Ground Truth,Estimation")
        for r in results:
            file.write("\n" + str(r[0]) + "," + str(r[1]) + "," + str(r[2]))
def main():
    """Estimate distinct users per ask with grouped Flajolet-Martin hashes.

    Command-line arguments (read from ``sys.argv``):
        1: input file path handed to ``BlackBox.ask``
        2: stream size (users requested per ask)
        3: number of asks
        4: output CSV path

    For each ask, the first 1000 values returned by ``myhashs`` for every
    user are scanned for their longest run of trailing zero bits. Each
    hash function yields the estimate ``2 ** longest_run``; estimates are
    averaged in 10 consecutive groups of 100 and the element at index 5 of
    the sorted averages is rounded and written next to the true distinct
    count under a ``Time,Ground Truth,Estimation`` header.
    """
    input_file, output_file = sys.argv[1], sys.argv[4]
    stream_size, num_of_asks = int(sys.argv[2]), int(sys.argv[3])
    hash_function_num = 1000
    groups_len = 10
    hash_functions_per_group = hash_function_num // groups_len
    bx = BlackBox()

    with open(output_file, "w") as f:
        f.write("Time,Ground Truth,Estimation")
        for ask_index in range(num_of_asks):
            stream_users = bx.ask(input_file, stream_size)
            gt = set(stream_users)

            # Track the per-function maximum in a single pass instead of
            # storing every user's hash list and rescanning it.
            longest = [0] * hash_function_num
            for user in stream_users:
                hash_values = myhashs(user)
                for i in range(hash_function_num):
                    h = hash_values[i]
                    # h & -h isolates the lowest set bit; non-positive
                    # values count as zero trailing zeros.
                    zeros = (h & -h).bit_length() - 1 if h > 0 else 0
                    if zeros > longest[i]:
                        longest[i] = zeros

            estimates = [2 ** zeros for zeros in longest]
            estimates_avg = sorted(
                sum(estimates[start:start + hash_functions_per_group])
                / hash_functions_per_group
                for start in range(0, hash_function_num,
                                   hash_functions_per_group))
            estimate = round(estimates_avg[groups_len // 2])
            f.write("\n{},{},{}".format(ask_index, len(gt), estimate))
def driver():
    """Run ``bloom_filter`` on every BlackBox batch and report the rates.

    Reads the input path, stream size, number of asks and output path from
    ``argv[1]`` .. ``argv[4]``. A 69997-entry bit array and a set of
    previously seen users are shared across all asks and passed to
    ``bloom_filter`` together with each batch.

    Prints every ``(ask index, rate)`` pair, then writes them to a
    ``Time,FPR`` CSV.
    """
    box = BlackBox()
    ask_total = int(argv[3])
    batch_size = int(argv[2])

    rates = []
    bits = [0] * 69997
    seen_users = set()
    for ask_index in range(ask_total):
        batch = box.ask(str(argv[1]), batch_size)
        rates.append((ask_index, bloom_filter(bits, batch, seen_users)))

    for entry in rates:
        print(entry)

    with open(str(argv[4]), "w") as out:
        out.write("Time,FPR")
        out.writelines(
            "\n" + str(index) + "," + str(rate) for index, rate in rates)
output_filename = 'task3_out.csv' elif len(sys.argv) != 5: print('Usage : python task1.py <input_filename> stream_size num_of_asks <output_filename>') exit(1) else: input_filename = sys.argv[1] stream_size = int(sys.argv[2]) num_of_asks = int(sys.argv[3]) output_filename = sys.argv[4] random.seed(553) saved_users = [] bx = BlackBox() n = 0 print('seqnum, 0_id, 20_id, 40_id, 60_id, 80_id\n') with open(output_filename, 'w+') as file: file.write('seqnum, 0_id, 20_id, 40_id, 60_id, 80_id\n') for i in range(num_of_asks): data = bx.ask(input_filename, stream_size) for s in data: n += 1 if n <= 100: saved_users.append(s) elif random.randint(0, 100000) % n < 100: index = random.randint(0, 100000) % 100 saved_users[index] = s print((i+1)*stream_size, saved_users[0], saved_users[20],saved_users[40],saved_users[60],saved_users[80]) file.write(str((i+1)*stream_size) + ',' + str(saved_users[0]) + ',' + str(saved_users[20]) + ',' + str(saved_users[40]) + ',' + str(saved_users[60]) + ',' +str(saved_users[80]) + '\n')
result = [] for num in all_primes: result.append((num * s + 7) % num_rows) return result #We're trying 8 functions per group res_range = [i for i in range(len(all_primes)+1) if i%8==0] avg_vals = [] op_list = [] sum_estimates = 0 sum_ground_truth = 0 for _ in range(30): users = bx.ask(users_fn, 300) max_zeros = [0]*len(all_primes) mean_groups = [] estimates_ask = 0 ground_truth_ask = 0 for user in users: #Let's get the hash value! # Converting it into a numerical value num_val = int(binascii.hexlify(user.encode('utf8')), 16) #Obtaining the hash values result = myhash(num_val) #Obntaining the binary for each hash value result_bin = [bin(i) for i in result] # To find the least significant 1
f.write("\n") f.write( str(num_of_users) + "," + str(memory[10]) + "," + str(memory[30]) + "," + str(memory[50]) + "," + str(memory[70] + "," + str(memory[90]))) print( str(num_of_users) + "," + str(memory[10]) + "," + str(memory[30]) + "," + str(memory[50]) + "," + str(memory[70] + "," + str(memory[90]))) if __name__ == '__main__': file = "users.txt" num = 100 time = 30 output = "Sampling_output.csv" size = 100 memory = [] num_of_users = 0 with open(output, 'w+') as f: f.write("seqnum,10_id,30_id,50_id,70_id,90_id") random.seed(553) bx = BlackBox() for i in range(time): line = bx.ask(file, num) sample(line)
## Compute the keeping probability. probVal = (random.randint(0, 100000) % numElems) ## If we decide to keep the current ID. if (probVal < 100): ## Compute the index to be replaced. replaceIdx = random.randint(0,100000) % 100 ## Add the current element to the list. userIDList[replaceIdx] = userName ## Print Stats. if (numElems != 0 and numElems % 100 == 0): print(numElems, userIDList[0], userIDList[20], userIDList[40], userIDList[60], userIDList[80]) with open(outfilePath, "a") as f: f.write(str(numElems) + ',' + str(userIDList[0]) + ',' + str(userIDList[20]) + ',' + str(userIDList[40]) + ',' + str(userIDList[60]) + ',' + str(userIDList[80]) + '\n') with open(outfilePath, "w") as f: f.write('seqnum,0_id,20_id,40_id,60_id,80_id' + '\n') ## Loop. for i in range(0, numAsks): ## Obtain the userStream. userStream = bxInstance.ask(dataSetPath, streamSize) ## Perform bloom-filtering. reservoirSampling(userStream)
for pair in res: csv_writer.writerow(pair) print('writing finished') if __name__ == '__main__': import sys import time from blackbox import BlackBox start = time.time() file_name = sys.argv[1] stream_size = int(sys.argv[2]) num_ask = int(sys.argv[3]) output_path = sys.argv[4] k = 30 chunks = 10 bx = BlackBox() res = [] hash_func_list = generate_hash_func(k) for _ in range(num_ask): stm = bx.ask(file_name, stream_size) real = len(set(stm)) pred = fm(stm) res.append((_, real, int(pred))) export_file(res, output_path) end = time.time() print(f'finished within {end - start} seconds! ')
for i in range(0, n): j = random.randint(0, 100000) % 100 if j < streamSize: # position within the streamSize found - so keep the element to maintain the probability s/n # already existing jth element in reservoirList discarded and the new element added reservoirList[j] = inputStream[i] sequenceNumber = (sequenceList[len(sequenceList) - 1][0]) + 100 sequenceList.append( (sequenceNumber, reservoirList[0], reservoirList[20], reservoirList[40], reservoirList[60], reservoirList[80])) # simulating the streaming process bx = BlackBox() for _ in range(numOfAsks): streamUsers = bx.ask(dataPath, streamSize) reservoirSample(streamUsers) # write file output with header - "Time,Ground Truth,Estimation" f = open('task3.csv', 'w+') f.write("seqnum,0_id,20_id,40_id,60_id,80_id") for val in sequenceList: f.write("\n") f.write( str(val[0]) + "," + str(val[1]) + "," + str(val[2]) + "," + str(val[3]) + "," + str(val[4]) + "," + str(val[5])) f.close() # duration within 100s for 30 asks print("Duration:", time.time() - start)
def reservoir(seqNum, data):
    """Fold one batch into the global reservoir and return its CSV row.

    Args:
        seqNum: number of users processed before this batch.
        data: the batch of users returned by ``BlackBox.ask``.

    Returns:
        A newline-terminated, comma-separated string: the sequence number
        after this batch followed by the reservoir entries at indices
        0, 20, 40, 60 and 80.

    Side effects: rebinds or mutates the module-level ``reservoirList``.
    """
    global reservoirList
    count = seqNum
    if not reservoirList:
        # Copy the first batch so later replacements do not mutate the
        # caller's list.
        reservoirList = list(data)
        count += len(data)
    else:
        for d in data:
            count += 1
            prob = random.randint(0, 100000) % count
            if prob < streamSize:
                pos = random.randint(0, 100000) % streamSize
                reservoirList[pos] = d

    # Convert every field on its own; the old code concatenated the last
    # entry with "\n" before calling str(), which fails for non-string ids.
    fields = [count] + [reservoirList[i] for i in (0, 20, 40, 60, 80)]
    return ",".join(str(field) for field in fields) + "\n"


if __name__ == "__main__":
    blackBox = BlackBox()
    random.seed(553)
    op = "seqnum,0_id,20_id,40_id,60_id,80_id\n"
    for i in range(numOfAsks):
        data = blackBox.ask(inputFile, streamSize)
        op += reservoir(i * streamSize, data)
    writeToFile(op)
    end = time.time()
    print("Duration:" + str(round(end - start, 2)))
global groups global rows global estimation global ground_truth global nextptr hashed_result, user_id = get_hash_fm(input_values) hash_values = compute_zeroes(hashed_result) pred = compute_estimates(hash_values) final_result = pred[int(groups / 2)] with open(output_file, "a+") as f: f.write(str(nextptr) + ',' + str(len(user_id)) + ',' + str(int(final_result)) + '\n') nextptr += 1 estimation += final_result ground_truth += len(user_id) with open(output_file, "w") as f: f.write('Time,Ground Truth,Estimation' + '\n') for i in range(0, num_of_asks): input_vals = bx.ask(input_data, stream_size) Flajolet_Martin_Algorithm(input_vals)
if __name__ == '__main__': start = time.time() input_path = sys.argv[1] stream_size = int(sys.argv[2]) num_of_asks = int(sys.argv[3]) output_path = sys.argv[4] bx = BlackBox() ground_truth = [] estimate_length = [] for _ in range(num_of_asks): stream_users = bx.ask(input_path, stream_size) ground_truth.append(len(set(stream_users))) longest_trail = [0 for _ in range(hash_num)] for user in stream_users: user_hash = myhashs(user) for i, h in enumerate(user_hash): longest_trail[i] = max(longest_trail[i], len(h) - len(h.rstrip('0'))) for i in range(hash_num): longest_trail[i] = 2**longest_trail[i] # print(longest_trail) avg_trail = [] for i in range(hash_group): avg_trail.append(
filter_bit_array[id] = 1 # Using S to populate filter_bit_array actual_stream = [] unique_users_sanity = [] unique_users_algo = [] op_list = [] not_unique = [] for _ in range(num_asks): # Obtain the users # print("This is ask:",_+1) FP_count = 0 TN_count = 0 users = bx.ask(users_fn, stream_size) for user in users: # Just keeping tabs on the actual stream actual_stream.append(user) # Converting it into a numerical value num_val = int(binascii.hexlify(user.encode('utf8')), 16) # Obtaining the hash values result = myhash(num_val) # print(result) # The indices returned by hash functions flag = 0 skip_other_check = 0 for i in result: if filter_bit_array[i] == 0:
user_seq_num += 1 r = random.randint(0, 100000) % user_seq_num if r < stream_size: # selected index_to_replace = random.randint(0, 100000) % stream_size sample[index_to_replace] = user if __name__ == "__main__": start_time = time.time() bx = BlackBox() input_file_name = sys.argv[1] stream_size = int(sys.argv[2]) num_of_asks = int(sys.argv[3]) output_file_name = sys.argv[4] random.seed(553) output_list = [] for _ in range(num_of_asks): stream_users = bx.ask(input_file_name, stream_size) sample_stream(stream_users) output_list.append([ str(user_seq_num), sample[0], sample[20], sample[40], sample[60], sample[80] ]) with open(output_file_name, "w") as f: f.write("seqnum,0_id,20_id,40_id,60_id,80_id\n") for tup in output_list: f.write(",".join(tup) + "\n") print("Duration: ", time.time() - start_time)
true_negatives += 1.0 for i in range(no_of_hashf): filter_bit_Array[now[i]] = 1 traversed.add(u) FPR = calculate_fpr(false_positives, true_negatives) with open(output_file, "a+") as out: out.write(str(nextptr) + ',' + str(FPR) + '\n') nextptr += 1 def run_algorithm(num_of_asks, input_data, stream_size, bx): for i in range(0, num_of_asks): input_par = bx.ask(input_data, stream_size) BloomFiltering(input_par) return with open(output_file, "w") as f: f.write('Time,FPR' + '\n') for i in range(0, num_of_asks): item = bx.ask(input_data, stream_size) BloomFiltering(item)
from blackbox import BlackBox import sys import time start = time.time() file_name = sys.argv[1] stream_size = int(sys.argv[2]) num_ask = int(sys.argv[3]) output_path = sys.argv[4] random.seed(553) bx = BlackBox() data_holder = [] n = 0 res = [] for _ in range(num_ask): stream_users = bx.ask(file_name, stream_size) for user in stream_users: n += 1 if len(data_holder) < 100: data_holder.append(user) else: prob = random.random() if prob < 100 / n: idx = random.randint(0, 99) data_holder[idx] = user res.append((n, data_holder[0], data_holder[20], data_holder[40], data_holder[60], data_holder[80])) export_file(res, output_path) end = time.time() print(f'finished within {end - start} seconds! ')
estimate = (int)(statistics.median((avg_group))) with open(output, "a") as f: f.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) f.write("," + str(ground_truth)) f.write("," + str(estimate)) f.write("\n") print( str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + " " + str(ground_truth) + " " + str(estimate)) if __name__ == '__main__': file = "users.txt" num = 300 times = 30 output = "FlajoletMartin_output.csv" # num of hash_functions k = 15 # groups g = 5 # num in group l = 3 with open(output, 'w+') as f: f.write("Time,Ground Truth,Estimation") f.write("\n") bx = BlackBox() for i in range(times): flajolet(bx.ask(file, num))
# calculate FPR = FP / (FP + TN) false_positive_rate = 0.0 if (false_pos + true_neg == 0) else false_pos / (false_pos + true_neg) with open(output, 'a') as f: f.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))) f.write("," + str(float(false_positive_rate))) f.write("\n") print( str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + " " + str(float(false_positive_rate))) if __name__ == '__main__': file = "users.txt" num = 100 times = 30 output = "Bloomfilter_output.csv" filter_array = [0 for i in range(69997)] seen_user = set() with open(output, 'w+') as f: f.write("Time,FPR") f.write("\n") from blackbox import BlackBox bx = BlackBox() for i in range(times): bloom(bx.ask(file, num))
# For each line, the first column is the sequence number (starting from 1) # of the latest user in the entire streaming, then the 1th user (with index 0 in your list), # 21th user, 41th user, 61th user and 81th user in your reservoir. reservoir = [] out_file = sys.argv[4] with open(out_file, 'w') as writeFile: print("In write") writeFile.write("seqnum,0_id,20_id,40_id,60_id,80_id\n") total_count = 0 n = 100 for _ in range(num_asks): # print("This is ask:",_+1) # Obtain the users # print("This is ask:",_+1) users = bx.ask(users_fn, 100) if _ == 0: reservoir.extend(users) #print("This is reservoir:", reservoir) writeFile.write( str(n) + "," + reservoir[0] + "," + reservoir[20] + "," + reservoir[40] + "," + reservoir[60] + "," + reservoir[80] + "\n") continue count = 0 for user in users: n = n + 1 # prob_keep = 100/len(actual_stream) prob_keep = len(reservoir)