Ejemplo n.º 1
0
def worst_case_error():
    """Print per-trial absolute errors for two choices of the encoding list.

    Runs 500 trials. Each trial draws ``a`` and ``b`` with
    ``np.random.uniform(low=0, high=50)``, encodes both with
    ``bv.bv_encode`` and hands the two encodings to ``bv.bv_decode``; the
    decoded value is compared against ``|a - b|``. This is done once with a
    list of ``s`` uniformly random samples from ``[lower, upper)`` and once
    with ``s`` evenly spaced points strictly inside ``(lower, upper)``.

    Returns nothing. After all trials have run, one line per trial is
    printed: the error obtained with the random list followed by the error
    obtained with the evenly spaced list.
    """
    t = 25
    lower, upper = 0 - t, 50 + t
    s = 250
    u = upper - lower

    # The random list is drawn first so the RNG stream is consumed in the
    # same order as the per-trial draws below expect.
    random_points = np.random.uniform(low=lower, high=upper, size=s)
    even_points = np.asarray(
        [lower + k * (upper - lower) / (s + 1) for k in range(1, s + 1)])

    run_time = 500
    error_pairs = []
    for _ in range(run_time):
        a = np.random.uniform(low=0, high=50, size=1)[0]
        b = np.random.uniform(low=0, high=50, size=1)[0]
        true_distance = np.fabs(a - b)

        # Same encode/encode/decode sequence for each list, random list first.
        estimates = []
        for points in (random_points, even_points):
            encoded_a = bv.bv_encode(item=a, random_list=points, t=t)
            encoded_b = bv.bv_encode(item=b, random_list=points, t=t)
            estimates.append(bv.bv_decode(encoded_a, encoded_b, u))

        estimate_random, estimate_even = estimates
        error_pairs.append((np.fabs(true_distance - estimate_random),
                            np.fabs(true_distance - estimate_even)))

    for error_random, error_even in error_pairs:
        print(error_random, error_even)
Ejemplo n.º 2
0
def worst_error_with_s():
    """Print the largest observed absolute error for each list size ``s``.

    For every ``s`` in ``range(50, 2001, 50)`` a fresh list of ``s`` samples
    is drawn with ``np.random.uniform(low=lower, high=upper)``. Then 4000
    trials are run: each draws ``a`` and ``b`` with
    ``np.random.uniform(low=0, high=50)``, encodes both with
    ``bv.bv_encode``, decodes the pair with ``bv.bv_decode`` and compares
    the result against ``|a - b|``.

    Returns nothing. One line ``<s> <max error>`` is printed per list size.
    """
    t = 25
    lower = 0 - t
    upper = 50 + t
    u = upper - lower
    run_time = 4000  # trials per list size; does not depend on s

    for s in range(50, 2001, 50):
        random_list = np.random.uniform(low=lower, high=upper, size=s)

        max_err = 0
        for _ in range(run_time):
            a = np.random.uniform(low=0, high=50, size=1)[0]
            b = np.random.uniform(low=0, high=50, size=1)[0]
            true_distance = np.fabs(a - b)

            bv_a = bv.bv_encode(item=a, random_list=random_list, t=t)
            bv_b = bv.bv_encode(item=b, random_list=random_list, t=t)
            estimate_distance = bv.bv_decode(bv_a, bv_b, u)

            # Compute the error once. max() keeps its first argument on
            # ties, which matches a strict ">" update of the running value.
            max_err = max(max_err, np.fabs(true_distance - estimate_distance))

        print(s, max_err)
Ejemplo n.º 3
0
def record_linkage_with_age_dataset_threshold_group():
    """Score grouped comparison over several thresholds, for two encoding lists.

    Reads values with ``reader.read_age("../data/test.txt")``, prints their
    max, min and count, and takes two slices of them as the lists to compare.
    For each threshold from 3 to 10 the two lists are encoded with
    ``bv.bv_encode`` and passed to ``compare_with_group``; the outcome is
    scored by ``analyse_result`` against ``get_groundtruth``. This is done
    once with ``s`` uniformly random samples from ``[lower, upper)`` and once
    with ``s`` evenly spaced points strictly inside ``(lower, upper)``.

    Returns nothing. Prints one line per threshold: the threshold, the three
    scores for the random list, the three scores for the evenly spaced list,
    then the third value returned by ``compare_with_group`` for each run.
    """
    age_list = reader.read_age("../data/test.txt")
    print(max(age_list))
    print(min(age_list))
    print(len(age_list))

    t = 50
    lower, upper = 0 - t, 100 + t
    s = 200
    u = upper - lower

    list_number = 1000
    value_list_a = age_list[:list_number]
    # NOTE(review): this slice starts at list_number + 1, so the element at
    # index list_number lands in neither list and this list holds at most
    # list_number - 1 items. Confirm whether that skip is intended.
    value_list_b = age_list[list_number + 1:2 * list_number]

    threshold_times = 1.1

    def evaluate(points, threshold):
        """Encode both lists against ``points`` and score the grouped comparison."""
        encoded_a = np.asarray(
            [bv.bv_encode(item=value, random_list=points, t=t)
             for value in value_list_a])
        encoded_b = np.asarray(
            [bv.bv_encode(item=value, random_list=points, t=t)
             for value in value_list_b])
        truth_true, truth_false = get_groundtruth(
            value_list_a, value_list_b, threshold)
        found_true, found_false, elapsed = compare_with_group(
            encoded_a, encoded_b, threshold, u, threshold_times)
        precision, recall, fscore = analyse_result(
            truth_true, truth_false, found_true, found_false)
        return precision, recall, fscore, elapsed

    for threshold in range(3, 11):
        # Random list first: its draw must precede the first evaluation.
        random_points = np.random.uniform(low=lower, high=upper, size=s)
        precision_bv, recall_bv, fscore_bv, time_with_group1 = evaluate(
            random_points, threshold)

        even_points = np.asarray(
            [lower + k * (upper - lower) / (s + 1) for k in range(1, s + 1)])
        precision_ibv, recall_ibv, fscore_ibv, time_with_group2 = evaluate(
            even_points, threshold)

        print(threshold, precision_bv, recall_bv, fscore_bv,
              precision_ibv, recall_ibv, fscore_ibv,
              time_with_group1, time_with_group2)
Ejemplo n.º 4
0
def record_linkage_with_age_dataset_threshold():
    """Compare ungrouped and grouped comparison runs on the age data.

    Reads values with ``reader.read_age("../data/test.txt")`` and takes two
    slices of them as the lists to compare. For the single threshold 5 the
    lists are encoded with ``bv.bv_encode`` against ``s`` uniformly random
    samples, then scored with ``analyse_result`` against ``get_groundtruth``:
    first via ``compare_without_group``, then via ``compare_with_group`` for
    every combination of group number (2, 4, ..., 10) and ``times``
    (0.2, 0.4, ..., 3.0).

    Returns nothing. Prints the three scores and the extra value(s) returned
    by each comparison call; every grouped run is preceded by a line holding
    its group number and ``times`` value.
    """
    age_list = reader.read_age("../data/test.txt")

    t = 50
    lower, upper = 0 - t, 100 + t
    s = 200
    u = upper - lower

    list_number = 1000
    value_list_a = age_list[:list_number]
    # NOTE(review): this slice starts at list_number + 1, so the element at
    # index list_number lands in neither list. Confirm the skip is intended.
    value_list_b = age_list[list_number + 1:2 * list_number]

    threshold_list = [5]
    group_number_list = list(range(2, 11, 2))
    times_list = [tenth / 10 for tenth in range(2, 32, 2)]
    threshold_times = 1.1

    for threshold in threshold_list:
        random_list = np.random.uniform(low=lower, high=upper, size=s)
        encoded_a = np.asarray(
            [bv.bv_encode(item=value, random_list=random_list, t=t)
             for value in value_list_a])
        encoded_b = np.asarray(
            [bv.bv_encode(item=value, random_list=random_list, t=t)
             for value in value_list_b])

        # Baseline: comparison without grouping.
        truth_true, truth_false = get_groundtruth(
            value_list_a, value_list_b, threshold)
        found_true, found_false, time_without_group = compare_without_group(
            encoded_a, encoded_b, threshold, u, threshold_times)
        precision, recall, fscore = analyse_result(
            truth_true, truth_false, found_true, found_false)
        print(precision, recall, fscore, time_without_group)

        # Grouped comparison over the full parameter grid, reusing the
        # encodings computed above.
        for group_number in group_number_list:
            for times in times_list:
                print(group_number, times)
                truth_true, truth_false = get_groundtruth(
                    value_list_a, value_list_b, threshold)
                found_true, found_false, time_with_group, group_time = \
                    compare_with_group(encoded_a, encoded_b, threshold, u,
                                       threshold_times, times, group_number)
                precision, recall, fscore = analyse_result(
                    truth_true, truth_false, found_true, found_false)
                print(precision, recall, fscore, time_with_group, group_time)
Ejemplo n.º 5
0
def my_test():
    """Smoke-test the comparison helpers on small random integer lists.

    Draws 50 and 100 integers with ``np.random.randint(0, 25, ...)``, encodes
    them with ``bv.bv_encode`` against 2000 uniformly random samples, runs
    ``compare_without_group`` and prints its third return value together with
    the precision, recall and f-score reported by ``analyse_result``. Finally
    calls ``compare_with_group`` and discards whatever it returns.

    Returns nothing.
    """
    t = 25
    lower, upper = 0 - t, 50 + t
    s = 2000
    u = upper - lower

    len_list_a, len_list_b = 50, 100
    # Draw order (list a, list b, then the encoding list) is kept as-is so
    # the RNG stream is consumed identically.
    value_list_a = np.random.randint(0, 25, len_list_a)
    value_list_b = np.random.randint(0, 25, len_list_b)
    random_list = np.random.uniform(low=lower, high=upper, size=s)

    encoded_a = np.asarray(
        [bv.bv_encode(item=value, random_list=random_list, t=t)
         for value in value_list_a])
    encoded_b = np.asarray(
        [bv.bv_encode(item=value, random_list=random_list, t=t)
         for value in value_list_b])

    threshold = 5
    threshold_times = 1.15
    truth_true, truth_false = get_groundtruth(
        value_list_a, value_list_b, threshold)
    found_true, found_false, time_without_group = compare_without_group(
        encoded_a, encoded_b, threshold, u, threshold_times)
    elapsed_text = str(time_without_group)
    print("time caused in compare_without_group : " + elapsed_text + " seconds")

    precision, recall, fscore = analyse_result(
        truth_true, truth_false, found_true, found_false)
    for label, score in (("precision = ", precision),
                         ("recall = ", recall),
                         ("fscore = ", fscore)):
        print(label + str(score))

    # NOTE(review): unlike the compare_without_group call above, `u` is not
    # passed here, so threshold_times is the fourth positional argument.
    # Verify this against compare_with_group's signature.
    compare_with_group(encoded_a, encoded_b, threshold, threshold_times)
Ejemplo n.º 6
0
def mytest():
    """Print true and decoded distances for 100 random pairs.

    Each trial draws ``a`` and ``b`` with ``np.random.uniform(low=0,
    high=50)``, encodes both with ``bv.bv_encode`` against a shared list of
    10000 uniformly random samples from ``[lower, upper)``, and decodes the
    pair with ``bv.bv_decode``.

    Returns nothing. Prints two lines per trial: ``|a - b|`` and the decoded
    value.
    """
    t = 25
    lower, upper = 0 - t, 50 + t
    u = upper - lower
    s = 10000
    random_list = np.random.uniform(low=lower, high=upper, size=s)

    times = 100
    for _ in range(times):
        a = np.random.uniform(low=0, high=50, size=1)[0]
        b = np.random.uniform(low=0, high=50, size=1)[0]
        true_distance = np.fabs(a - b)

        # Encode a first, then b, against the same random list.
        encoded_a, encoded_b = [
            bv.bv_encode(item=value, random_list=random_list, t=t)
            for value in (a, b)
        ]
        estimate_distance = bv.bv_decode(encoded_a, encoded_b, u)

        for label, distance in (("true distance = ", true_distance),
                                ("estimate distance = ", estimate_distance)):
            print(label + str(distance))
Ejemplo n.º 7
0
def ibv_hist_test():
    """Rebuild values from pairwise decoded distances and compare histograms.

    Encodes the integers 0..99 with ``bv.bv_encode`` against 1000 uniformly
    random samples from ``[lower, upper)``, fills a symmetric matrix with
    ``bv.bv_decode`` for every pair, and finds the pair with the largest
    decoded distance. Each record is then assigned the estimate
    ``int(distance_to_first_of_that_pair / largest_distance * 100)``.

    Returns nothing. Prints the data, the distance matrix, the chosen pair
    with its distance, and the histograms of the original and estimated
    values (via ``get_original_hist``) with their ``error_of_hist``.
    """
    t = 50
    lower, upper = 0 - t, 100 + t
    s = 1000
    u = upper - lower

    datalist = list(range(100))
    record_count = len(datalist)
    print(str(record_count) + " records have been loaded! With max = " + str(max(datalist)) + ", min = " + str(min(datalist)) + "!")
    print(datalist)

    random_list = np.random.uniform(low=lower, high=upper, size=s)

    ibv_list = [bv.bv_encode(value, random_list, t) for value in datalist]
    dist_matrix = np.zeros((record_count, record_count))

    # Track the pair with the largest decoded distance. ">=" means that on
    # ties the last pair visited wins.
    far_i, far_j, far_dist = 0, 0, 0
    for row in range(record_count):
        for col in range(row, record_count):
            dist_matrix[row, col] = bv.bv_decode(ibv_list[row], ibv_list[col], u)
            dist_matrix[col, row] = dist_matrix[row, col]
            if dist_matrix[row, col] >= far_dist:
                far_i, far_j, far_dist = row, col, dist_matrix[row, col]
    print(dist_matrix)
    print(far_i, far_j, far_dist)

    # Treat record far_i as the minimum of the data and far_j as the
    # maximum, then scale each record's distance to far_i onto 0..100.
    estimate_data_list = [
        int(0 + distance / far_dist * 100)
        for distance in dist_matrix[:, far_i]
    ]

    original_hist = get_original_hist(datalist)
    estimate_hist = get_original_hist(estimate_data_list)

    print("original_hist:" + str(original_hist))
    print("estimate_hist:" + str(estimate_hist))
    print("error_of_hist:" + str(error_of_hist(original_hist, estimate_hist)))
Ejemplo n.º 8
0
            break
    for i in range(l):
        origin[i] = bv.bv_decode(bv_list[zero_index], bv_list[i], u)
    return origin


if __name__ == '__main__':
    # Script entry point: load the ages, keep values strictly below 100,
    # encode them, pass the encodings to restore_data and plot a histogram
    # of what it returns.
    datalist = np.asarray(reader.read_age("../data/test.txt"))
    datalist = datalist[datalist < 100]
    print(len(datalist))

    t = 50
    lower, upper = 0 - t, 100 + t
    s = 2000
    u = upper - lower
    random_list = np.random.uniform(low=lower, high=upper, size=s)
    bv_list = [
        bv.bv_encode(item=value, random_list=random_list, t=t)
        for value in datalist
    ]

    # NOTE(review): the module-level name `l` is kept unchanged because the
    # function body above this block also reads a name `l`; whether that is
    # a local or this global is not visible from here.
    l = len(bv_list)
    print(l)

    origin = restore_data(bv_list, u)
    print(origin)
    plt.hist(origin)
    plt.show()