def main():
    """Reservoir-sample a user stream and report five reservoir slots per ask.

    Command-line arguments (read from ``sys.argv``):
        1: input file handed to ``BlackBox.ask``
        2: stream size (users per ask)
        3: number of asks
        4: output CSV path

    The first batch is copied into the reservoir wholesale; every later user
    replaces a random slot with probability ``100 / seq_num``.  After each
    ask one row (sequence number plus slots 0, 20, 40, 60 and 80) is written
    to the output file and echoed to stdout.
    """
    input_file, output_file = sys.argv[1], sys.argv[4]
    stream_size = int(sys.argv[2])  # stream size = 100
    num_of_asks = int(sys.argv[3])

    reservoir_size = 100
    reported_slots = (0, 20, 40, 60, 80)

    box = BlackBox()
    seen = 0
    random.seed(553)
    reservoir = []
    with open(output_file, "w") as out:
        out.write("seqnum,0_id,20_id,40_id,60_id,80_id")
        for _ in range(num_of_asks):
            batch = box.ask(input_file, stream_size)
            if seen == 0:
                # Nothing consumed yet: the whole batch seeds the reservoir.
                reservoir += batch
                seen += stream_size
            else:
                for user in batch:
                    seen += 1
                    if random.randint(0, 100000) % seen >= reservoir_size:
                        continue  # user is not kept
                    slot = random.randint(0, 100000) % reservoir_size
                    reservoir[slot] = user
            # Build the row once and send it to both sinks.
            fields = [seen] + [reservoir[k] for k in reported_slots]
            row = ",".join("{}".format(value) for value in fields)
            out.write("\n" + row)
            print(row)
Ejemplo n.º 2
0
def main(input_file_path, stream_size, num_of_asks, output_file_path):
    """Feed a user stream through the global Bloom filter and log the FPR.

    Args:
        input_file_path: path handed to ``BlackBox.ask``.
        stream_size: number of users requested per ask.
        num_of_asks: number of batches to request.
        output_file_path: CSV file that receives a ``Time,FPR`` header and
            one row per ask.

    A user counts as a false positive when every bit returned by
    ``myhashs`` is already set although the user has never been seen.
    The reported rate is cumulative: false positives so far divided by the
    total number of users requested so far.
    NOTE(review): that denominator includes repeat users, so it is not the
    textbook FP / (FP + TN) -- kept as-is to preserve existing output.
    """
    global bloom_filter
    bx = BlackBox()
    gt_set = set()
    fp = 0
    # The context manager closes the file even if an ask or hash call raises.
    with open(output_file_path, "wt") as output_file:
        output_file.write("Time,FPR\n")
        for it in range(num_of_asks):
            stream_users = bx.ask(input_file_path, stream_size)
            for s in stream_users:
                indices = myhashs(s)
                filter_says_seen = all(bloom_filter[i] for i in indices)

                if not filter_says_seen:
                    for i in indices:
                        bloom_filter[i] = True
                elif s not in gt_set:
                    # Filter claims "seen" for a user we have never met.
                    fp += 1

                gt_set.add(s)
            total_requested = (it + 1) * stream_size
            # Guard against stream_size == 0 instead of dividing by zero.
            rate = fp / total_requested if total_requested else 0.0
            output_file.write("{},{}\n".format(it, rate))
Ejemplo n.º 3
0
def driver():
    """Reservoir-sample the stream and return one CSV row per 100 users.

    Reads the input path, stream size and number of asks from ``argv[1]``,
    ``argv[2]`` and ``argv[3]``.  The first 100 users fill the reservoir in
    arrival order; each later user is kept with probability
    ``100 / sequence_no`` and then overwrites a uniformly chosen slot.

    Returns:
        A list of strings ``"seq,slot0,slot20,slot40,slot60,slot80"``, one
        appended whenever the running sequence number is a multiple of 100.
    """
    random.seed(553)
    slots = [0] * 100
    seen = 0
    ask_count = int(argv[3])
    batch_size = int(argv[2])
    box = BlackBox()
    rows = []
    reported = (0, 20, 40, 60, 80)

    for _ in range(ask_count):
        for user in box.ask(str(argv[1]), batch_size):
            seen += 1
            if seen <= 100:
                # Still filling the reservoir.
                slots[seen - 1] = user
            elif random.randint(0, 100000) % seen < 100:
                # Keep the user: evict a uniformly chosen slot.
                victim = random.randint(0, 100000) % 100
                slots[victim] = user

            if seen % 100 == 0:
                rows.append(",".join([str(seen)] + [slots[k] for k in reported]))

    return rows
Ejemplo n.º 4
0
def main():
    """Run ``bloom_filtering`` on every ask and write the joined report.

    Each ask pulls ``streamSize`` users from ``inputFile``; the text returned
    by ``bloom_filtering`` is appended after the ``Time,FPR`` header and the
    whole report is passed to ``writeToFile`` once at the end.
    """
    box = BlackBox()
    pieces = ["Time,FPR"]
    for ask_index in range(numOfAsks):
        batch = box.ask(inputFile, streamSize)
        pieces.append(bloom_filtering(ask_index, batch))
    writeToFile("".join(pieces))
Ejemplo n.º 5
0
def main():
    """Run ``flajolet_martin`` on every ask and write the joined report.

    Each ask pulls ``streamSize`` users from ``inputFile``; the text returned
    by ``flajolet_martin`` is appended after the
    ``Time,Ground Truth,Estimation`` header and the whole report is passed
    to ``writeToFile`` once at the end.
    """
    box = BlackBox()
    pieces = ["Time,Ground Truth,Estimation"]
    for ask_index in range(numOfAsks):
        batch = box.ask(inputFile, streamSize)
        pieces.append(flajolet_martin(ask_index, batch))
    writeToFile("".join(pieces))
Ejemplo n.º 6
0
def flajolet_martin(input_file_path, stream_size, num_of_asks,
                    output_file_path):
    """Estimate distinct users per batch with Flajolet-Martin and log results.

    Args:
        input_file_path: path handed to ``BlackBox.ask``.
        stream_size: number of users requested per ask.
        num_of_asks: number of batches to request.
        output_file_path: CSV file that receives a
            ``Time,Ground Truth,Estimation`` header and one row per ask.

    For every batch the longest run of trailing zero bits is tracked per
    hash function (``myhashs`` must yield ``number_of_hashes`` integers) and
    combined by ``calculate_count``.  The ground-truth column is the number
    of *distinct* users in the batch, not the batch size, because the stream
    may contain repeats.  After the last ask the ratio of summed estimates
    to summed ground truth is printed (skipped when nothing was processed).
    """
    global number_of_hashes
    bx = BlackBox()
    predicted_sum = 0
    actual_sum = 0
    # The context manager closes the file even if an ask or hash call raises.
    with open(output_file_path, "wt") as output_file:
        output_file.write("Time,Ground Truth,Estimation\n")
        for it in range(num_of_asks):
            stream_users = bx.ask(input_file_path, stream_size)
            distinct_users = len(set(stream_users))
            max_trailing_zeros = [-sys.maxsize] * number_of_hashes
            for s in stream_users:
                for i, h in enumerate(myhashs(s)):
                    # Zero-padded to 16 bits, so h == 0 counts as 16 zeros.
                    bits = format(h, '016b')
                    trailing_zeros = len(bits) - len(bits.rstrip('0'))
                    if trailing_zeros > max_trailing_zeros[i]:
                        max_trailing_zeros[i] = trailing_zeros
            count = calculate_count(max_trailing_zeros)
            output_file.write("{},{},{}\n".format(it, distinct_users,
                                                  int(count)))
            predicted_sum += count
            actual_sum += distinct_users
    if actual_sum:
        print(predicted_sum / actual_sum)
    return
Ejemplo n.º 7
0
def main():
    """Run a 16-hash Bloom filter over the stream and log the per-ask FPR.

    Command-line arguments (read from ``sys.argv``):
        1: input file handed to ``BlackBox.ask``
        2: stream size (users per ask)
        3: number of asks
        4: output CSV path

    For each ask, every user not seen before is classified as a false
    positive (all hash positions already set) or a true negative, and
    ``FP / (FP + TN)`` is written as one ``Time,FPR`` row.  The set of seen
    users lives across asks, matching the lifetime of the bit array;
    otherwise a user repeated from an earlier batch would be miscounted as
    a false positive.  A batch with no new users reports 0.0.
    """
    input_file, output_file = sys.argv[1], sys.argv[4]
    stream_size, num_of_asks = int(sys.argv[2]), int(sys.argv[3])

    hash_function_num = 16
    m = 69997

    bx = BlackBox()
    boom_filter = [0] * m
    visited_users = set()  # must outlive a single ask, like boom_filter
    with open(output_file, "w") as f:
        f.write("Time,FPR")
        for ask_index in range(num_of_asks):
            stream_users = bx.ask(input_file, stream_size)
            FP, TN = 0, 0
            for user in stream_users:
                hash_values = myhashs(user)

                if user not in visited_users:
                    hits = sum(1 for hash_value in hash_values
                               if boom_filter[hash_value] == 1)
                    if hits == hash_function_num:
                        FP += 1
                    else:
                        TN += 1
                    visited_users.add(user)

                for hash_value in hash_values:
                    boom_filter[hash_value] = 1
            # No unseen users in this batch -> rate is defined as 0.0.
            FPR = FP / (FP + TN) if FP + TN else 0.0
            f.write("\n{},{}".format(ask_index, FPR))
Ejemplo n.º 8
0
def driver():
    """Run Flajolet-Martin on every ask, print the overall ratio, write a CSV.

    Reads the input path, stream size, number of asks and output path from
    ``argv[1]`` .. ``argv[4]``.  For each ask the number of distinct users
    and ``int(flajolet_martin(batch))`` are recorded.  After all asks the
    ratio of summed estimates to summed ground truth is printed (skipped
    when the ground-truth total is zero), and the per-ask rows are written
    under a ``Time,Ground Truth,Estimation`` header.
    """
    bx = BlackBox()
    num_of_asks = int(argv[3])
    stream_size = int(argv[2])
    results = []

    for i in range(num_of_asks):
        stream_users = bx.ask(str(argv[1]), stream_size)
        ground_truth = set(stream_users)
        estimation = flajolet_martin(stream_users)
        results.append((i, len(ground_truth), int(estimation)))

    sum_ground_truth = sum(row[1] for row in results)
    sum_estimations = sum(row[2] for row in results)

    # Report the aggregate over all asks, not just the last row.
    if sum_ground_truth:
        print("Final Result = ", sum_estimations / sum_ground_truth)

    with open(str(argv[4]), "w") as out_file:
        out_file.write("Time,Ground Truth,Estimation")
        for row in results:
            out_file.write("\n" + str(row[0]) + "," + str(row[1]) + "," +
                           str(row[2]))
def main():
    """Estimate distinct users per ask with 1000 hashes in 10 groups.

    Command-line arguments (read from ``sys.argv``):
        1: input file handed to ``BlackBox.ask``
        2: stream size (users per ask)
        3: number of asks
        4: output CSV path

    For each hash function the longest run of trailing zero bits over the
    batch gives an estimate ``2 ** run``.  Estimates are averaged within
    each group of 100 consecutive hash functions, the group means are
    sorted, and the element at index ``groups // 2`` is rounded and written
    next to the true distinct count.
    """
    input_file, output_file = sys.argv[1], sys.argv[4]
    stream_size, num_of_asks = int(sys.argv[2]), int(sys.argv[3])

    total_hashes = 1000
    group_count = 10
    group_width = int(total_hashes / group_count)

    box = BlackBox()

    with open(output_file, "w") as out:
        out.write("Time,Ground Truth,Estimation")
        estimate_total = 0
        distinct_total = 0
        for ask_index in range(num_of_asks):
            distinct = set()
            batch = box.ask(input_file, stream_size)
            hashes_per_user = []
            for user in batch:
                distinct.add(user)
                hashes_per_user.append(myhashs(user))

            per_hash_estimates = []
            for k in range(total_hashes):
                best_run = 0
                for row in hashes_per_user:
                    value = row[k]
                    run = 0
                    # Shift out zero bits; a value of 0 yields a run of 0.
                    while value & 1 == 0 and value > 0:
                        value >>= 1
                        run += 1
                    if run > best_run:
                        best_run = run
                per_hash_estimates.append(2**best_run)

            group_means = []
            for g in range(group_count):
                first = g * group_width
                group_sum = sum(per_hash_estimates[first + j]
                                for j in range(group_width))
                group_means.append(float(group_sum / group_width))
            group_means.sort()
            estimate = round(group_means[int(group_count / 2)])

            estimate_total += estimate
            distinct_total += len(distinct)
            out.write("\n{},{},{}".format(ask_index, len(distinct), estimate))
Ejemplo n.º 10
0
def driver():
    """Run ``bloom_filter`` on every ask, print the rates, and write a CSV.

    Reads the input path, stream size, number of asks and output path from
    ``argv[1]`` .. ``argv[4]``.  A 69997-entry bit array and a set of
    previously seen users are shared across all asks and passed to
    ``bloom_filter`` together with each batch.  Every ``(ask, rate)`` pair
    is printed, then written under a ``Time,FPR`` header.
    """
    box = BlackBox()
    ask_count = int(argv[3])
    batch_size = int(argv[2])
    rates = []
    bits = [0] * 69997
    seen_users = set()

    for ask_index in range(ask_count):
        batch = box.ask(str(argv[1]), batch_size)
        rates.append((ask_index, bloom_filter(bits, batch, seen_users)))

    for entry in rates:
        print(entry)

    with open(str(argv[4]), "w") as out:
        out.write("Time,FPR")
        for ask_index, rate in rates:
            out.write("\n" + str(ask_index) + "," + str(rate))
Ejemplo n.º 11
0
        output_filename = 'task3_out.csv'
    elif len(sys.argv) != 5:
        print('Usage : python task1.py <input_filename> stream_size num_of_asks <output_filename>')
        exit(1)
    else:
        input_filename = sys.argv[1]
        stream_size = int(sys.argv[2])
        num_of_asks = int(sys.argv[3])
        output_filename = sys.argv[4]
    
    random.seed(553)
    
    saved_users = []
    bx = BlackBox()
    n = 0
    print('seqnum, 0_id, 20_id, 40_id, 60_id, 80_id\n')
    
    with open(output_filename, 'w+') as file:
        file.write('seqnum, 0_id, 20_id, 40_id, 60_id, 80_id\n')
        for i in range(num_of_asks):
            data = bx.ask(input_filename, stream_size)
            for s in data:
                n += 1
                if n <= 100:
                    saved_users.append(s)
                elif random.randint(0, 100000) % n < 100:
                    index = random.randint(0, 100000) % 100
                    saved_users[index] = s
            print((i+1)*stream_size, saved_users[0], saved_users[20],saved_users[40],saved_users[60],saved_users[80])
            file.write(str((i+1)*stream_size) + ',' + str(saved_users[0]) + ',' + str(saved_users[20]) + ',' + str(saved_users[40]) + ',' + str(saved_users[60]) + ',' +str(saved_users[80]) + '\n')            
Ejemplo n.º 12
0
    result = []
    for num in all_primes:
        result.append((num * s + 7) % num_rows)

    return result

#We're trying 8 functions per group
res_range = [i  for i in range(len(all_primes)+1) if i%8==0]
avg_vals = []
op_list = []

sum_estimates = 0
sum_ground_truth = 0

for _ in range(30):
    users = bx.ask(users_fn, 300)
    max_zeros = [0]*len(all_primes)
    mean_groups = []
    estimates_ask = 0
    ground_truth_ask = 0
    for user in users:
        #Let's get the hash value!
        # Converting it into a numerical value
        num_val = int(binascii.hexlify(user.encode('utf8')), 16)

        #Obtaining the hash values
        result = myhash(num_val)
        #Obntaining the binary for each hash value
        result_bin = [bin(i) for i in result]

        # To find the least significant 1
Ejemplo n.º 13
0
                f.write("\n")
                f.write(
                    str(num_of_users) + "," + str(memory[10]) + "," +
                    str(memory[30]) + "," + str(memory[50]) + "," +
                    str(memory[70] + "," + str(memory[90])))
            print(
                str(num_of_users) + "," + str(memory[10]) + "," +
                str(memory[30]) + "," + str(memory[50]) + "," +
                str(memory[70] + "," + str(memory[90])))


# Script entry point: sample a fixed 30 asks of 100 users from users.txt.
if __name__ == '__main__':

    # Hard-coded run configuration (no command-line arguments are read).
    file = "users.txt"
    num = 100
    # NOTE(review): `time` here is the number of asks; it would shadow a
    # module-level `import time` if the file has one -- confirm.
    time = 30
    output = "Sampling_output.csv"

    # Presumably shared state read/updated by sample() -- verify against it.
    size = 100
    memory = []
    num_of_users = 0

    # Truncate the output file and write the CSV header (no trailing newline).
    with open(output, 'w+') as f:
        f.write("seqnum,10_id,30_id,50_id,70_id,90_id")
    # Fixed seed so the random draws are reproducible.
    random.seed(553)

    bx = BlackBox()
    for i in range(time):
        # Fetch one batch of `num` users and hand it to sample().
        line = bx.ask(file, num)
        sample(line)
Ejemplo n.º 14
0
			## Compute the keeping probability.
			probVal = (random.randint(0, 100000) % numElems)
			
			## If we decide to keep the current ID.
			if (probVal < 100):

				## Compute the index to be replaced.
				replaceIdx = random.randint(0,100000) % 100
			
				## Add the current element to the list.
				userIDList[replaceIdx] = userName

	## Print Stats.
	if (numElems != 0 and numElems % 100 == 0):
		print(numElems, userIDList[0], userIDList[20], userIDList[40], userIDList[60], userIDList[80])

		with open(outfilePath, "a") as f:
			f.write(str(numElems) + ',' + str(userIDList[0]) + ',' + str(userIDList[20]) + ',' + str(userIDList[40]) + ',' + str(userIDList[60]) + ',' + str(userIDList[80]) + '\n')


## Truncate the output file and write the CSV header.
with open(outfilePath, "w") as f:
	f.write('seqnum,0_id,20_id,40_id,60_id,80_id' + '\n')

## Loop: one ask per iteration.
for i in range(0, numAsks):

	## Obtain the userStream.
	userStream = bxInstance.ask(dataSetPath, streamSize)

	## Perform reservoir sampling on this batch.
	reservoirSampling(userStream)
Ejemplo n.º 15
0
        for pair in res:
            csv_writer.writerow(pair)
        print('writing finished')


# Script entry point:
#   argv[1] input file, argv[2] stream size, argv[3] number of asks,
#   argv[4] output path.
if __name__ == '__main__':
    import sys
    import time
    from blackbox import BlackBox
    start = time.time()
    file_name = sys.argv[1]
    stream_size = int(sys.argv[2])
    num_ask = int(sys.argv[3])
    output_path = sys.argv[4]

    # k is passed to generate_hash_func below; `chunks` is not used in this
    # block -- presumably read as a global by fm(); verify.
    k = 30
    chunks = 10
    bx = BlackBox()
    res = []

    # NOTE(review): hash_func_list is not passed to fm(); presumably fm()
    # reads it as a module-level global -- confirm.
    hash_func_list = generate_hash_func(k)
    for _ in range(num_ask):
        stm = bx.ask(file_name, stream_size)
        # Ground truth: number of distinct users in this batch.
        real = len(set(stm))
        pred = fm(stm)
        # `_` doubles as the ask index written in the first column.
        res.append((_, real, int(pred)))

    export_file(res, output_path)
    end = time.time()
    print(f'finished within {end - start} seconds! ')
Ejemplo n.º 16
0
        for i in range(0, n):
            j = random.randint(0, 100000) % 100
            if j < streamSize:
                # position within the streamSize found - so keep the element to maintain the probability s/n
                # already existing jth element in reservoirList discarded and the new element added
                reservoirList[j] = inputStream[i]

        sequenceNumber = (sequenceList[len(sequenceList) - 1][0]) + 100
        sequenceList.append(
            (sequenceNumber, reservoirList[0], reservoirList[20],
             reservoirList[40], reservoirList[60], reservoirList[80]))


# Simulate the streaming process: one reservoirSample() call per ask.
bx = BlackBox()
for _ in range(numOfAsks):
    streamUsers = bx.ask(dataPath, streamSize)
    reservoirSample(streamUsers)

# Write the collected rows under the "seqnum,0_id,20_id,40_id,60_id,80_id"
# header.  The context manager closes the file even if a write fails.
with open('task3.csv', 'w+') as f:
    f.write("seqnum,0_id,20_id,40_id,60_id,80_id")
    for val in sequenceList:
        f.write("\n")
        # Each row is (sequence number, five sampled users).
        f.write(",".join(str(val[k]) for k in range(6)))

# duration within 100s for 30 asks
print("Duration:", time.time() - start)
Ejemplo n.º 17
0

def reservoir(seqNum, data):
    """Update the global reservoir with one batch and return its CSV row.

    Args:
        seqNum: number of stream items consumed before this batch.
        data: the batch of items to process.

    Returns:
        ``"count,r0,r20,r40,r60,r80\\n"`` where ``count`` is the number of
        items consumed after this batch and ``rN`` is reservoir slot N.

    While the global ``reservoirList`` is empty the batch is copied into it
    wholesale.  Afterwards each item is kept when
    ``randint(0, 100000) % count < streamSize`` and then overwrites the slot
    ``randint(0, 100000) % streamSize``.
    """
    global reservoirList
    count = seqNum
    if len(reservoirList) == 0:
        # Copy so later slot replacements never mutate the caller's list.
        reservoirList = list(data)
        count += len(data)
    else:
        for d in data:
            count += 1
            prob = random.randint(0, 100000) % count
            if prob < streamSize:
                pos = random.randint(0, 100000) % streamSize
                reservoirList[pos] = d
    # Convert every field with str() *before* joining; the original wrapped
    # `reservoirList[80] + "\n"` in str(), which raises for non-str items.
    fields = [count] + [reservoirList[k] for k in (0, 20, 40, 60, 80)]
    return ",".join(str(field) for field in fields) + "\n"


# Script entry point: run reservoir() once per ask and write the report.
if __name__ == "__main__":
    blackBox = BlackBox()
    # Fixed seed so the random draws are reproducible.
    random.seed(553)
    # Report starts with the CSV header; reservoir() returns one row per ask.
    op = "seqnum,0_id,20_id,40_id,60_id,80_id\n"
    for i in range(numOfAsks):
        data = blackBox.ask(inputFile, streamSize)
        # i * streamSize = number of users consumed before this batch.
        op += reservoir(i * streamSize, data)
    writeToFile(op)
    # `start` is defined outside this block (not visible here).
    end = time.time()
    print("Duration:" + str(round(end - start, 2)))
Ejemplo n.º 18
0
    global groups
    global rows
    global estimation
    global ground_truth
    global nextptr
    hashed_result, user_id = get_hash_fm(input_values)
    hash_values = compute_zeroes(hashed_result)
    pred = compute_estimates(hash_values)


    final_result = pred[int(groups / 2)]

    with open(output_file, "a+") as f:
        f.write(str(nextptr) + ',' + str(len(user_id)) + ',' + str(int(final_result)) + '\n')

    nextptr += 1
    estimation += final_result
    ground_truth += len(user_id)

# Truncate the output file and write the CSV header.
with open(output_file, "w") as f:
	f.write('Time,Ground Truth,Estimation' + '\n')

# One ask per iteration; each batch is handed to Flajolet_Martin_Algorithm
# (presumably the function that appends a row to output_file -- verify).
for i in range(0, num_of_asks):
	input_vals = bx.ask(input_data, stream_size)
	Flajolet_Martin_Algorithm(input_vals)




Ejemplo n.º 19
0
if __name__ == '__main__':

    start = time.time()

    input_path = sys.argv[1]
    stream_size = int(sys.argv[2])
    num_of_asks = int(sys.argv[3])
    output_path = sys.argv[4]

    bx = BlackBox()
    ground_truth = []
    estimate_length = []

    for _ in range(num_of_asks):
        stream_users = bx.ask(input_path, stream_size)
        ground_truth.append(len(set(stream_users)))
        longest_trail = [0 for _ in range(hash_num)]
        for user in stream_users:
            user_hash = myhashs(user)
            for i, h in enumerate(user_hash):
                longest_trail[i] = max(longest_trail[i],
                                       len(h) - len(h.rstrip('0')))

        for i in range(hash_num):
            longest_trail[i] = 2**longest_trail[i]
        # print(longest_trail)

        avg_trail = []
        for i in range(hash_group):
            avg_trail.append(
Ejemplo n.º 20
0
        filter_bit_array[id] = 1


# Using S to populate filter_bit_array
actual_stream = []
unique_users_sanity = []
unique_users_algo = []
op_list = []

not_unique = []
for _ in range(num_asks):
    # Obtain the users
    # print("This is ask:",_+1)
    FP_count = 0
    TN_count = 0
    users = bx.ask(users_fn, stream_size)
    for user in users:
        # Just keeping tabs on the actual stream
        actual_stream.append(user)

        # Converting it into a numerical value
        num_val = int(binascii.hexlify(user.encode('utf8')), 16)

        # Obtaining the hash values
        result = myhash(num_val)
        # print(result)
        # The indices returned by hash functions
        flag = 0
        skip_other_check = 0
        for i in result:
            if filter_bit_array[i] == 0:
Ejemplo n.º 21
0
            user_seq_num += 1
            r = random.randint(0, 100000) % user_seq_num
            if r < stream_size:  # selected
                index_to_replace = random.randint(0, 100000) % stream_size
                sample[index_to_replace] = user


# Script entry point:
#   argv[1] input file, argv[2] stream size, argv[3] number of asks,
#   argv[4] output path.
if __name__ == "__main__":
    start_time = time.time()
    bx = BlackBox()
    input_file_name = sys.argv[1]
    stream_size = int(sys.argv[2])
    num_of_asks = int(sys.argv[3])
    output_file_name = sys.argv[4]
    # Fixed seed so the random draws are reproducible.
    random.seed(553)

    output_list = []
    for _ in range(num_of_asks):
        stream_users = bx.ask(input_file_name, stream_size)
        sample_stream(stream_users)
        # `user_seq_num` and `sample` are not assigned in this block;
        # presumably module-level state updated by sample_stream() -- verify.
        output_list.append([
            str(user_seq_num), sample[0], sample[20], sample[40], sample[60],
            sample[80]
        ])

    # Write the header, then one comma-joined row per ask.
    with open(output_file_name, "w") as f:
        f.write("seqnum,0_id,20_id,40_id,60_id,80_id\n")
        for tup in output_list:
            f.write(",".join(tup) + "\n")
    print("Duration: ", time.time() - start_time)
Ejemplo n.º 22
0
                true_negatives += 1.0

        for i in range(no_of_hashf):
            filter_bit_Array[now[i]] = 1

        traversed.add(u)

    FPR = calculate_fpr(false_positives, true_negatives)

    with open(output_file, "a+") as out:
        out.write(str(nextptr) + ',' + str(FPR) + '\n')

    nextptr += 1


def run_algorithm(num_of_asks, input_data, stream_size, bx):
    """Request ``num_of_asks`` batches from ``bx`` and Bloom-filter each one.

    Every batch is obtained with ``bx.ask(input_data, stream_size)`` and
    passed straight to ``BloomFiltering``.  Returns ``None``.
    """
    for _ in range(num_of_asks):
        BloomFiltering(bx.ask(input_data, stream_size))


# Truncate the output file and write the CSV header.
with open(output_file, "w") as f:
    f.write('Time,FPR' + '\n')

# One ask per iteration, each batch handed to BloomFiltering.
# NOTE(review): this repeats the loop inside run_algorithm(), which is not
# called anywhere in the visible code.
for i in range(0, num_of_asks):
    item = bx.ask(input_data, stream_size)
    BloomFiltering(item)
Ejemplo n.º 23
0
    from blackbox import BlackBox
    import sys
    import time


    start = time.time()
    file_name = sys.argv[1]
    stream_size = int(sys.argv[2])
    num_ask = int(sys.argv[3])
    output_path = sys.argv[4]

    random.seed(553)
    bx = BlackBox()
    data_holder = []
    n = 0
    res = []
    for _ in range(num_ask):
        stream_users = bx.ask(file_name, stream_size)
        for user in stream_users:
            n += 1
            if len(data_holder) < 100:
                data_holder.append(user)
            else:
                prob = random.random()
                if prob < 100 / n:
                    idx = random.randint(0, 99)
                    data_holder[idx] = user
        res.append((n, data_holder[0], data_holder[20], data_holder[40], data_holder[60], data_holder[80]))
    export_file(res, output_path)
    end = time.time()
    print(f'finished within {end - start} seconds! ')
Ejemplo n.º 24
0
    estimate = (int)(statistics.median((avg_group)))

    with open(output, "a") as f:
        f.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        f.write("," + str(ground_truth))
        f.write("," + str(estimate))
        f.write("\n")
        print(
            str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + " " +
            str(ground_truth) + " " + str(estimate))


# Script entry point: run flajolet() on a fixed 30 asks of 300 users.
if __name__ == '__main__':
    # Hard-coded run configuration (no command-line arguments are read).
    file = "users.txt"
    num = 300
    times = 30
    output = "FlajoletMartin_output.csv"

    # k, g and l are not used in this block; presumably read as globals by
    # flajolet() -- verify.  Note that k == g * l.
    # num of hash_functions
    k = 15
    # groups
    g = 5
    # num in group
    l = 3
    # Truncate the output file and write the CSV header line.
    with open(output, 'w+') as f:
        f.write("Time,Ground Truth,Estimation")
        f.write("\n")
    bx = BlackBox()
    for i in range(times):
        # Each batch of `num` users is handed directly to flajolet().
        flajolet(bx.ask(file, num))
Ejemplo n.º 25
0
    # calculate FPR = FP / (FP + TN)
    false_positive_rate = 0.0 if (false_pos + true_neg
                                  == 0) else false_pos / (false_pos + true_neg)

    with open(output, 'a') as f:
        f.write(str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
        f.write("," + str(float(false_positive_rate)))
        f.write("\n")
        print(
            str(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) + " " +
            str(float(false_positive_rate)))


# Script entry point: run bloom() on a fixed 30 asks of 100 users.
if __name__ == '__main__':
    # Hard-coded run configuration (no command-line arguments are read).
    file = "users.txt"
    num = 100
    times = 30
    output = "Bloomfilter_output.csv"

    # Neither name is used in this block; presumably shared state read and
    # updated by bloom() -- verify.
    filter_array = [0 for i in range(69997)]
    seen_user = set()

    # Truncate the output file and write the CSV header line.
    with open(output, 'w+') as f:
        f.write("Time,FPR")
        f.write("\n")

    from blackbox import BlackBox
    bx = BlackBox()
    for i in range(times):
        # Each batch of `num` users is handed directly to bloom().
        bloom(bx.ask(file, num))
Ejemplo n.º 26
0
# For each line, the first column is the sequence number (starting from 1)
# of the latest user in the entire streaming, then the 1th user (with index 0 in your list),
# 21th user, 41th user, 61th user and 81th user in your reservoir.
reservoir = []
out_file = sys.argv[4]
with open(out_file, 'w') as writeFile:
    print("In write")
    writeFile.write("seqnum,0_id,20_id,40_id,60_id,80_id\n")

    total_count = 0
    n = 100
    for _ in range(num_asks):
        # print("This is ask:",_+1)
        # Obtain the users
        # print("This is ask:",_+1)
        users = bx.ask(users_fn, 100)

        if _ == 0:
            reservoir.extend(users)
            #print("This is reservoir:", reservoir)
            writeFile.write(
                str(n) + "," + reservoir[0] + "," + reservoir[20] + "," +
                reservoir[40] + "," + reservoir[60] + "," + reservoir[80] +
                "\n")
            continue

        count = 0
        for user in users:
            n = n + 1
            # prob_keep = 100/len(actual_stream)
            prob_keep = len(reservoir)