def worst_case_error():
    """Print per-trial estimation errors for two kinds of threshold lists.

    Runs 500 trials.  Each trial draws two values uniformly from [0, 50] and
    estimates their distance twice via ``bv.bv_encode`` / ``bv.bv_decode``:
    once against 250 thresholds sampled uniformly at random from [-25, 75],
    and once against 250 evenly spaced thresholds strictly inside the same
    interval.  After all trials, one line per trial is printed holding the
    absolute error of the random-threshold estimate followed by the absolute
    error of the evenly-spaced-threshold estimate.
    """
    margin = 25
    low_bound = 0 - margin
    high_bound = 50 + margin
    n_thresholds = 250
    span = high_bound - low_bound

    sampled_thresholds = np.random.uniform(
        low=low_bound, high=high_bound, size=n_thresholds)
    even_thresholds = np.asarray([
        low_bound + (k + 1) * (high_bound - low_bound) / (n_thresholds + 1)
        for k in range(n_thresholds)
    ])

    def estimate(x, y, thresholds):
        # Encode both values against the same threshold list, then decode
        # the pair of codes into a distance estimate.
        code_x = bv.bv_encode(item=x, random_list=thresholds, t=margin)
        code_y = bv.bv_encode(item=y, random_list=thresholds, t=margin)
        return bv.bv_decode(code_x, code_y, span)

    n_trials = 500
    sampled_errors = []
    even_errors = []
    for _ in range(n_trials):
        first = np.random.uniform(low=0, high=50, size=1)[0]
        second = np.random.uniform(low=0, high=50, size=1)[0]
        exact = np.fabs(first - second)

        approx_sampled = estimate(first, second, sampled_thresholds)
        approx_even = estimate(first, second, even_thresholds)

        sampled_errors.append(np.fabs(exact - approx_sampled))
        even_errors.append(np.fabs(exact - approx_even))

    for sampled_err, even_err in zip(sampled_errors, even_errors):
        print(sampled_err, even_err)
def worst_error_with_s():
    """Print the largest observed estimation error for each threshold count.

    For every ``s`` in 50, 100, ..., 2000, samples ``s`` thresholds uniformly
    from [-25, 75] and runs 4000 trials.  Each trial draws two values
    uniformly from [0, 50], encodes them with ``bv.bv_encode`` and compares
    the distance returned by ``bv.bv_decode`` against the exact absolute
    difference.  One line ``s max_error`` is printed per value of ``s``.
    """
    margin = 25
    low_bound = 0 - margin
    high_bound = 50 + margin
    span = high_bound - low_bound

    for s in range(50, 2001, 50):
        thresholds = np.random.uniform(low=low_bound, high=high_bound, size=s)
        n_trials = 4000
        worst = 0
        for _ in range(n_trials):
            first = np.random.uniform(low=0, high=50, size=1)[0]
            second = np.random.uniform(low=0, high=50, size=1)[0]
            exact = np.fabs(first - second)

            code_first = bv.bv_encode(
                item=first, random_list=thresholds, t=margin)
            code_second = bv.bv_encode(
                item=second, random_list=thresholds, t=margin)
            approx = bv.bv_decode(code_first, code_second, span)

            # Track the running maximum of the absolute error.
            deviation = np.fabs(exact - approx)
            if deviation > worst:
                worst = deviation
        print(s, worst)
def record_linkage_with_age_dataset_threshold_group():
    """Run grouped record linkage on the age data for several thresholds.

    Loads ages via ``reader.read_age("../data/test.txt")``, prints their
    max, min and count, and takes two consecutive samples of ``list_number``
    records each.  For every matching threshold in 3..10 the two samples are
    encoded with ``bv.bv_encode`` twice -- against freshly sampled random
    thresholds and against evenly spaced thresholds -- and linked with
    ``compare_with_group``.  Each result is scored with ``analyse_result``
    against ``get_groundtruth`` and one line is printed per threshold::

        threshold precision recall fscore precision recall fscore time1 time2

    where the first triple belongs to the random thresholds and the second
    to the evenly spaced ones.
    """
    age_list = reader.read_age("../data/test.txt")
    print(max(age_list))
    print(min(age_list))
    print(len(age_list))

    t = 50
    lower = 0 - t
    upper = 100 + t
    s = 200
    u = upper - lower
    list_number = 1000
    value_list_a = age_list[:list_number]
    # Start exactly at list_number so the second sample is adjacent to the
    # first, skips no record and also holds list_number entries.
    value_list_b = age_list[list_number:2 * list_number]
    threshold_list = [3, 4, 5, 6, 7, 8, 9, 10]
    threshold_times = 1.1

    # The evenly spaced thresholds depend only on lower/upper/s, so build
    # them once instead of once per matching threshold.
    random_uniform_list = np.asarray(
        [lower + (i + 1) * (upper - lower) / (s + 1) for i in range(s)])

    for threshold in threshold_list:
        # Variant 1: thresholds sampled at random, re-drawn per iteration.
        random_list = np.random.uniform(low=lower, high=upper, size=s)
        value_bv_list_a = np.asarray(
            [bv.bv_encode(item=value, random_list=random_list, t=t)
             for value in value_list_a])
        value_bv_list_b = np.asarray(
            [bv.bv_encode(item=value, random_list=random_list, t=t)
             for value in value_list_b])
        groundtruth_true, groundtruth_false = get_groundtruth(
            value_list_a, value_list_b, threshold)
        # NOTE(review): three values are unpacked here, while
        # record_linkage_with_age_dataset_threshold unpacks four from the
        # same function -- confirm against compare_with_group's definition.
        ret_true, ret_false, time_with_group1 = compare_with_group(
            value_bv_list_a, value_bv_list_b, threshold, u, threshold_times)
        precision_bv, recall_bv, fscore_bv = analyse_result(
            groundtruth_true, groundtruth_false, ret_true, ret_false)

        # Variant 2: evenly spaced thresholds.
        value_bv_list_a = np.asarray(
            [bv.bv_encode(item=value, random_list=random_uniform_list, t=t)
             for value in value_list_a])
        value_bv_list_b = np.asarray(
            [bv.bv_encode(item=value, random_list=random_uniform_list, t=t)
             for value in value_list_b])
        groundtruth_true, groundtruth_false = get_groundtruth(
            value_list_a, value_list_b, threshold)
        ret_true, ret_false, time_with_group2 = compare_with_group(
            value_bv_list_a, value_bv_list_b, threshold, u, threshold_times)
        precision_ibv, recall_ibv, fscore_ibv = analyse_result(
            groundtruth_true, groundtruth_false, ret_true, ret_false)

        print(threshold, precision_bv, recall_bv, fscore_bv,
              precision_ibv, recall_ibv, fscore_ibv,
              time_with_group1, time_with_group2)
def record_linkage_with_age_dataset_threshold():
    """Compare ungrouped and grouped record linkage on the age data.

    Loads ages via ``reader.read_age("../data/test.txt")`` and takes two
    consecutive samples of ``list_number`` records each.  For each matching
    threshold (currently only 5) both samples are encoded with
    ``bv.bv_encode`` against randomly sampled thresholds, then:

    * ``compare_without_group`` is run once and its precision, recall,
      F-score and time are printed;
    * ``compare_with_group`` is run for every combination of
      ``group_number`` in 2, 4, ..., 10 and ``times`` in 0.2, 0.4, ..., 3.0,
      printing ``group_number times`` followed by a line with precision,
      recall, F-score, time and group time.
    """
    age_list = reader.read_age("../data/test.txt")

    t = 50
    lower = 0 - t
    upper = 100 + t
    s = 200
    u = upper - lower
    list_number = 1000
    value_list_a = age_list[:list_number]
    # Start exactly at list_number so the second sample is adjacent to the
    # first, skips no record and also holds list_number entries.
    value_list_b = age_list[list_number:2 * list_number]
    threshold_list = [5]
    group_number_list = list(range(2, 11, 2))
    times_list = [i / 10 for i in range(2, 32, 2)]
    threshold_times = 1.1

    for threshold in threshold_list:
        random_list = np.random.uniform(low=lower, high=upper, size=s)
        value_bv_list_a = np.asarray(
            [bv.bv_encode(item=value, random_list=random_list, t=t)
             for value in value_list_a])
        value_bv_list_b = np.asarray(
            [bv.bv_encode(item=value, random_list=random_list, t=t)
             for value in value_list_b])

        # Baseline: linkage without grouping.
        groundtruth_true, groundtruth_false = get_groundtruth(
            value_list_a, value_list_b, threshold)
        ret_true, ret_false, time_without_group = compare_without_group(
            value_bv_list_a, value_bv_list_b, threshold, u, threshold_times)
        precision_bv, recall_bv, fscore_bv = analyse_result(
            groundtruth_true, groundtruth_false, ret_true, ret_false)
        print(precision_bv, recall_bv, fscore_bv, time_without_group)

        # Grid over the grouping parameters.
        for group_number in group_number_list:
            for times in times_list:
                print(group_number, times)
                groundtruth_true, groundtruth_false = get_groundtruth(
                    value_list_a, value_list_b, threshold)
                # NOTE(review): four values are unpacked here, while
                # record_linkage_with_age_dataset_threshold_group unpacks
                # three from the same function -- confirm against
                # compare_with_group's definition.
                ret_true, ret_false, time_with_group, group_time = \
                    compare_with_group(value_bv_list_a, value_bv_list_b,
                                       threshold, u, threshold_times,
                                       times, group_number)
                precision_bv, recall_bv, fscore_bv = analyse_result(
                    groundtruth_true, groundtruth_false, ret_true, ret_false)
                print(precision_bv, recall_bv, fscore_bv,
                      time_with_group, group_time)
def my_test():
    """Smoke-test record linkage on small random integer lists.

    Draws 50 and 100 random integers from [0, 25), encodes them with
    ``bv.bv_encode`` against 2000 thresholds sampled uniformly from
    [-25, 75], links them with ``compare_without_group`` (threshold 5,
    threshold_times 1.15) and prints the elapsed time together with the
    precision, recall and F-score computed by ``analyse_result`` against
    ``get_groundtruth``.  Finally runs ``compare_with_group`` on the same
    encodings; its result is not used.
    """
    t = 25
    lower = 0 - t
    upper = 50 + t
    s = 2000
    u = upper - lower
    len_list_a = 50
    len_list_b = 100

    value_list_a = np.random.randint(0, 25, len_list_a)
    value_list_b = np.random.randint(0, 25, len_list_b)
    random_list = np.random.uniform(low=lower, high=upper, size=s)
    value_bv_list_a = np.asarray(
        [bv.bv_encode(item=value, random_list=random_list, t=t)
         for value in value_list_a])
    value_bv_list_b = np.asarray(
        [bv.bv_encode(item=value, random_list=random_list, t=t)
         for value in value_list_b])

    threshold = 5
    threshold_times = 1.15
    groundtruth_true, groundtruth_false = get_groundtruth(
        value_list_a, value_list_b, threshold)
    ret_true, ret_false, time_without_group = compare_without_group(
        value_bv_list_a, value_bv_list_b, threshold, u, threshold_times)
    print("time caused in compare_without_group : "
          + str(time_without_group) + " seconds")

    precision, recall, fscore = analyse_result(
        groundtruth_true, groundtruth_false, ret_true, ret_false)
    print("precision = " + str(precision))
    print("recall = " + str(recall))
    print("fscore = " + str(fscore))

    # Pass u before threshold_times, matching every other call to
    # compare_with_group / compare_without_group in this file; without it
    # threshold_times would be taken as the range width.
    compare_with_group(value_bv_list_a, value_bv_list_b,
                       threshold, u, threshold_times)
def mytest():
    """Print exact and estimated distances for 100 random value pairs.

    Samples 10000 thresholds uniformly from [-25, 75] once.  Each of the 100
    rounds then draws two values uniformly from [0, 50], encodes both with
    ``bv.bv_encode``, decodes a distance estimate with ``bv.bv_decode`` and
    prints the exact absolute difference followed by the estimate.
    """
    margin = 25
    low_bound = 0 - margin
    high_bound = 50 + margin
    span = high_bound - low_bound
    n_thresholds = 10000
    thresholds = np.random.uniform(
        low=low_bound, high=high_bound, size=n_thresholds)

    rounds = 100
    for _ in range(rounds):
        first = np.random.uniform(low=0, high=50, size=1)[0]
        second = np.random.uniform(low=0, high=50, size=1)[0]
        exact = np.fabs(first - second)

        code_first = bv.bv_encode(item=first, random_list=thresholds, t=margin)
        code_second = bv.bv_encode(item=second, random_list=thresholds, t=margin)
        approx = bv.bv_decode(code_first, code_second, span)

        print("true distance = " + str(exact))
        print("estimate distance = " + str(approx))
def ibv_hist_test():
    """Rebuild values from pairwise decoded distances and compare histograms.

    Encodes the integers 0..99 with ``bv.bv_encode`` against 1000 thresholds
    sampled uniformly from [-50, 150], fills a symmetric matrix with the
    pairwise distances returned by ``bv.bv_decode``, and remembers the pair
    with the largest distance (the last such pair on ties).  Each record is
    then mapped to ``int(distance_to_first_of_pair / largest_distance * 100)``.
    The histograms produced by ``get_original_hist`` for the original and the
    rebuilt values are printed together with ``error_of_hist`` of the two.
    """
    margin = 50
    low_bound = 0 - margin
    high_bound = 100 + margin
    n_thresholds = 1000
    span = high_bound - low_bound

    datalist = list(range(100))
    print(str(len(datalist)) + " records have been loaded! With max = "
          + str(max(datalist)) + ", min = " + str(min(datalist)) + "!")
    print(datalist)

    thresholds = np.random.uniform(
        low=low_bound, high=high_bound, size=n_thresholds)
    codes = [bv.bv_encode(value, thresholds, margin) for value in datalist]

    count = len(datalist)
    dist_matrix = np.zeros(shape=[count, count])
    far_row = 0
    far_col = 0
    far_dist = 0
    for row in range(count):
        for col in range(row, count):
            decoded = bv.bv_decode(codes[row], codes[col], span)
            dist_matrix[row][col] = decoded
            dist_matrix[col][row] = dist_matrix[row][col]
            # ">=" keeps the last pair reaching the maximum.
            if dist_matrix[row][col] >= far_dist:
                far_row = row
                far_col = col
                far_dist = dist_matrix[row][col]
    print(dist_matrix)
    print(far_row, far_col, far_dist)

    # Treat far_row as the minimum of the data and far_col as the maximum,
    # then scale each record's distance to far_row onto 0..100.
    estimate_data_list = [
        int(dist_matrix[idx][far_row] / far_dist * 100)
        for idx in range(len(codes))
    ]

    original_hist = get_original_hist(datalist)
    estimate_hist = get_original_hist(estimate_data_list)
    print("original_hist:" + str(original_hist))
    print("estimate_hist:" + str(estimate_hist))
    print("error_of_hist:" + str(error_of_hist(original_hist, estimate_hist)))
break for i in range(l): origin[i] = bv.bv_decode(bv_list[zero_index], bv_list[i], u) return origin if __name__ == '__main__': datalist = np.asarray(reader.read_age("../data/test.txt")) datalist = datalist[np.where(datalist < 100)] # print(datalist) print(len(datalist)) # plt.hist(datalist) # plt.show() t = 50 lower = 0 - t upper = 100 + t s = 2000 u = upper - lower random_list = np.random.uniform(low=lower, high=upper, size=s) bv_list = [ bv.bv_encode(item=datalist[i], random_list=random_list, t=t) for i in range(len(datalist)) ] l = len(bv_list) print(l) origin = restore_data(bv_list, u) print(origin) plt.hist(origin) plt.show() pass