def get_result_dataset(data, k=10, num_test=10):
    """
    Measure Mondrian on random samples of growing size, with k and QI fixed.

    Sample sizes are every multiple of 5000 below len(data), followed by
    len(data) itself. For each size, num_test random samples are anonymized
    and the average NCP and running time are printed.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    joint = 5000
    # Floor division keeps the count an int on Python 2 and Python 3 alike.
    check_time = length // joint
    if length % joint == 0:
        # The last multiple equals length, which is appended below anyway.
        check_time -= 1
    datasets = [joint * (i + 1) for i in range(check_time)]
    datasets.append(length)
    for pos in datasets:
        # Accumulators must start at zero for every size; without this the
        # first "+=" raised UnboundLocalError.
        ncp = rtime = 0
        print('#' * 30)
        print("size of dataset %d" % pos)
        for _ in range(num_test):
            temp = random.sample(data, pos)
            result, eval_result = mondrian(temp, k, RELAX)
            if DATA_SELECT == 'a':
                result = covert_to_raw(result)
            ncp += eval_result[0]
            rtime += eval_result[1]
            # Restore the working data from the untouched copy before the
            # next run (presumably because mondrian may modify records).
            data = copy.deepcopy(data_back)
        ncp /= num_test
        rtime /= num_test
        print("Average NCP %0.2f" % ncp + "%")
        print("Running time %0.2f" % rtime + " seconds")
        print('#' * 30)
Exemple #2
0
def get_result_dataset(data, k=10, num_test=10):
    """
    fix k and QI, while changing size of dataset
    num_test is the test nubmber.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    joint = 5000
    dataset_num = length / joint
    if length % joint == 0:
        dataset_num += 1
    for i in range(1, dataset_num + 1):
        pos = i * joint
        ncp = rtime = 0
        if pos > length:
            continue
        print '#' * 30
        print "size of dataset %d" % pos
        for j in range(num_test):
            temp = random.sample(data, pos)
            result, eval_result = mondrian(temp, k, RELAX)
            if DATA_SELECT == 'a':
                result = covert_to_raw(result)
            ncp += eval_result[0]
            rtime += eval_result[1]
            data = copy.deepcopy(data_back)
        ncp /= num_test
        rtime /= num_test
        print "Average NCP %0.2f" % ncp + "%"
        print "Running time %0.2f" % rtime + " seconds"
        print '#' * 30
Exemple #3
0
def get_result_dataset(data, k=10, num_test=10):
    """
    Measure Mondrian on random samples of growing size, with k and QI fixed.

    Sample sizes are every multiple of 5000 below len(data), followed by
    len(data) itself. For each size, num_test random samples are anonymized
    and the average NCP and running time are printed.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    joint = 5000
    # Integer division avoids the float-then-int() round trip.
    check_time = length // joint
    if length % joint == 0:
        # The last multiple equals length, which is appended below anyway.
        check_time -= 1
    datasets = [joint * (i + 1) for i in range(check_time)]
    datasets.append(length)
    for pos in datasets:
        # Reset per size: previously these were initialised once before the
        # loop, so each average leaked into the next size's total.
        ncp = 0
        rtime = 0
        print('#' * 30)
        print("size of dataset %d" % pos)
        for _ in range(num_test):
            temp = random.sample(data, pos)
            result, eval_result = mondrian(temp, k, RELAX)
            result = covert_to_raw(result)
            ncp += eval_result[0]
            rtime += eval_result[1]
            # Restore the working data from the untouched copy before the
            # next run (presumably because mondrian may modify records).
            data = copy.deepcopy(data_back)
        ncp /= num_test
        rtime /= num_test
        print("Average NCP %0.2f" % ncp + "%")
        print("Running time %0.2f" % rtime + " seconds")
        print('#' * 30)
Exemple #4
0
 def test1_mondrian_strict(self):
     """mondrian(records, 2, False) must report an NCP within 0.05 of 100/12."""
     records = [
         [6, 1, 'haha'],
         [6, 1, 'test'],
         [8, 2, 'haha'],
         [8, 2, 'test'],
         [4, 1, 'hha'],
         [4, 2, 'hha'],
         [4, 3, 'hha'],
         [4, 4, 'hha'],
     ]
     _, evaluation = mondrian(records, 2, False)
     expected_ncp = 100.0 / 12
     self.assertTrue(abs(evaluation[0] - expected_ncp) < 0.05)
Exemple #5
0
def get_result_dataset(data, k=10, num_test=10):
    """
    Measure Mondrian on random samples of growing size, with k and QI fixed.

    Sample sizes are every multiple of 5000 below len(data), followed by
    len(data) itself. For each size, num_test random samples are anonymized
    and the average NCP and running time are printed.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    joint = 5000
    # Floor division keeps the count an int on Python 2 and Python 3 alike.
    check_time = length // joint
    if length % joint == 0:
        # The last multiple equals length, which is appended below anyway.
        check_time -= 1
    datasets = [joint * (i + 1) for i in range(check_time)]
    datasets.append(length)
    for pos in datasets:
        # Accumulators must start at zero for every size; without this the
        # first "+=" raised UnboundLocalError.
        ncp = rtime = 0
        print("#" * 30)
        print("size of dataset %d" % pos)
        for _ in range(num_test):
            temp = random.sample(data, pos)
            result, eval_result = mondrian(temp, k, RELAX)
            if DATA_SELECT == "a":
                result = covert_to_raw(result)
            ncp += eval_result[0]
            rtime += eval_result[1]
            # Restore the working data from the untouched copy before the
            # next run (presumably because mondrian may modify records).
            data = copy.deepcopy(data_back)
        ncp /= num_test
        rtime /= num_test
        print("Average NCP %0.2f" % ncp + "%")
        print("Running time %0.2f" % rtime + " seconds")
        print("#" * 30)
Exemple #6
0
def get_result_one(att_trees, data, k=DEFAULT_K):
    "run basic_mondrian for one time, with k=10"
    print "K=%d" % k
    print "Mondrian"
    data_back = copy.deepcopy(data)
    _, eval_result = mondrian(att_trees, data, k)
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + "seconds"
def get_result_one(att_trees, data, k=DEFAULT_K):
    "run basic_mondrian for one time, with k=10"
    print "K=%d" % k
    print "Mondrian"
    result, eval_result = mondrian(att_trees, data, k)
    write_to_file(result)
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + "seconds"
def get_result_one(att_trees, data, k=DEFAULT_K):
    "run basic_mondrian for one time, with k=10"
    print "K=%d" % k
    print "Mondrian"
    data_back = copy.deepcopy(data)
    _, eval_result = mondrian(att_trees, data, k)
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + "seconds"
Exemple #9
0
 def test2_mondrian_relax(self):
     """mondrian(records, 2, True) must report an NCP within 0.05 of 700/54."""
     records = [
         [6, 1, 'haha'],
         [8, 1, 'haha'],
         [8, 1, 'test'],
         [8, 1, 'haha'],
         [8, 1, 'test'],
         [4, 1, 'hha'],
         [4, 2, 'hha'],
         [4, 3, 'hha'],
         [4, 4, 'hha'],
     ]
     _, evaluation = mondrian(records, 2, True)
     expected_ncp = 700.0 / 54
     self.assertTrue(abs(evaluation[0] - expected_ncp) < 0.05)
 def test2_mondrian(self):
     """After init(), mondrian with ATT_TREE and k=2 must give NCP within 0.05 of 100/8."""
     init()
     records = [
         ['6', '1', 'haha'],
         ['6', '1', 'test'],
         ['8', '2', 'haha'],
         ['8', '2', 'test'],
         ['4', '1', 'hha'],
         ['4', '1', 'hha'],
         ['1', '1', 'hha'],
         ['2', '1', 'hha'],
     ]
     _, evaluation = mondrian(ATT_TREE, records, 2)
     expected_ncp = 100.0 / 8
     self.assertTrue(abs(evaluation[0] - expected_ncp) < 0.05)
 def test_mondrian_incompelte(self):
     """Records containing '?' values: NCP for k=2 must be within 0.05 of 200/144."""
     init()
     records = [
         ['6', '?', 'haha'],
         ['6', '?', 'test'],
         ['8', '2', 'haha'],
         ['8', '2', 'test'],
         ['4', '?', 'hha'],
         ['4', '?', 'hha'],
         ['4', '3', 'hha'],
         ['4', '4', 'hha'],
     ]
     _, evaluation = mondrian(ATT_TREE, records, 2)
     expected_ncp = 200.0 / 144
     self.assertTrue(abs(evaluation[0] - expected_ncp) < 0.05)
Exemple #12
0
def get_result_one(data, k=10):
    """
    run mondrian for one time, with k=10
    """
    print "K=%d" % k
    data_back = copy.deepcopy(data)
    _, eval_result = mondrian(data, k, RELAX)
    data = copy.deepcopy(data_back)
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + " seconds"
Exemple #13
0
 def test_mondrian_datetime(self):
     """Run mondrian (k=2, third argument False) on records with a datetime column and print the evaluation."""
     stamp_format = "%Y-%m-%d %H:%M:%S"
     d1 = datetime.strptime("2007-03-04 21:08:12", stamp_format)
     d2 = datetime.strptime("2008-03-04 21:08:12", stamp_format)
     d3 = datetime.strptime("2009-03-04 21:08:12", stamp_format)
     d4 = datetime.strptime("2007-03-05 21:08:12", stamp_format)
     records = [
         [6, d1, 'haha'],
         [8, d1, 'haha'],
         [8, d1, 'test'],
         [8, d1, 'haha'],
         [8, d1, 'test'],
         [4, d1, 'hha'],
         [4, d2, 'hha'],
         [4, d3, 'hha'],
         [4, d4, 'hha'],
     ]
     _, evaluation = mondrian(records, 2, False)
     print(evaluation)
def get_result_one(data, k=10):
    """
    run mondrian for one time, with k=10
    """
    print "K=%d" % k
    data_back = copy.deepcopy(data)
    result, eval_result = mondrian(data, k, RELAX)
    if DATA_SELECT == 'a':
        result = covert_to_raw(result)
    data = copy.deepcopy(data_back)
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + " seconds"
Exemple #15
0
def get_result_one(data, k=10):
    """
    run mondrian for one time, with k=10
    """
    print "K=%d" % k
    data_back = copy.deepcopy(data)
    result, eval_result = mondrian(data, k, RELAX)
    if DATA_SELECT == 'a':
        result = covert_to_raw(result)
    data = copy.deepcopy(data_back)
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + " seconds"
Exemple #16
0
 def test1_mondrian_strict(self):
     """mondrian(rows, 2, False) must report an NCP within 0.05 of 100/12."""
     rows = [[6, 1, 'haha'], [6, 1, 'test'], [8, 2, 'haha'], [8, 2, 'test'],
             [4, 1, 'hha'], [4, 2, 'hha'], [4, 3, 'hha'], [4, 4, 'hha']]
     _, report = mondrian(rows, 2, False)
     deviation = abs(report[0] - 100.0 / 12)
     self.assertTrue(deviation < 0.05)
Exemple #17
0
def get_result_qi(data, k=10):
    """
    change nubmber of QI, whle fixing k and size of dataset
    """
    data_back = copy.deepcopy(data)
    num_data = len(data[0])
    for i in reversed(range(1, num_data)):
        print '#' * 30
        print "Number of QI=%d" % i
        _, eval_result = mondrian(data, k, RELAX, i)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        print "Running time %0.2f" % eval_result[1] + " seconds"
Exemple #18
0
def get_result_k(data):
    """
    change k, whle fixing QD and size of dataset
    """
    data_back = copy.deepcopy(data)
    # for k in [2, 5, 10, 25, 50, 100]:
    for k in range(5, 105, 5):
        print '#' * 30
        print "K=%d" % k
        result, eval_result = mondrian(data, k, RELAX)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        print "Running time %0.2f" % eval_result[1] + " seconds"
Exemple #19
0
 def test2_mondrian(self):
     """After init(), mondrian with ATT_TREE and k=2 must give NCP within 0.05 of 100/8."""
     init()
     rows = [['6', '1', 'haha'], ['6', '1', 'test'], ['8', '2', 'haha'],
             ['8', '2', 'test'], ['4', '1', 'hha'], ['4', '1', 'hha'],
             ['1', '1', 'hha'], ['2', '1', 'hha']]
     _, report = mondrian(ATT_TREE, rows, 2)
     deviation = abs(report[0] - 100.0 / 8)
     self.assertTrue(deviation < 0.05)
 def test_mondrian_incompelte(self):
     """Records containing '?' values: NCP for k=2 must be within 0.05 of 200/144."""
     init()
     rows = [['6', '?', 'haha'], ['6', '?', 'test'], ['8', '2', 'haha'],
             ['8', '2', 'test'], ['4', '?', 'hha'], ['4', '?', 'hha'],
             ['4', '3', 'hha'], ['4', '4', 'hha']]
     _, report = mondrian(ATT_TREE, rows, 2)
     deviation = abs(report[0] - 200.0 / 144)
     self.assertTrue(deviation < 0.05)
Exemple #21
0
def get_result_k(data):
    """
    Vary k while the QI set and the data set size stay fixed.

    Runs Mondrian for k = 1 .. 14 and prints NCP and running time for each
    value; results pass through covert_to_raw when DATA_SELECT is 'a'.
    """
    pristine = copy.deepcopy(data)
    for k in range(1, 15):
        print('#' * 30)
        print("K=%d" % k)
        anonymized, metrics = mondrian(data, k, RELAX)
        if DATA_SELECT == 'a':
            anonymized = covert_to_raw(anonymized)
        # Start the next run from an untouched copy of the input.
        data = copy.deepcopy(pristine)
        ncp_line = "NCP %0.2f" % metrics[0] + "%"
        print(ncp_line)
        time_line = "Running time %0.2f" % metrics[1] + " seconds"
        print(time_line)
Exemple #22
0
def get_result_qi(data, k=10):
    """
    Vary the number of QI while k and the data set size stay fixed.

    Counts down from len(data[0]) - 1 to 1, running Mondrian once per QI
    count, converting the result with covert_to_raw, and printing NCP and
    running time for each run.
    """
    pristine = copy.deepcopy(data)
    column_count = len(data[0])
    for qi_num in range(column_count - 1, 0, -1):
        print('#' * 30)
        print("Number of QI=%d" % qi_num)
        anonymized, metrics = mondrian(data, k, RELAX, qi_num)
        anonymized = covert_to_raw(anonymized)
        # Start the next run from an untouched copy of the input.
        data = copy.deepcopy(pristine)
        ncp_line = "NCP %0.2f" % metrics[0] + "%"
        print(ncp_line)
        time_line = "Running time %0.2f" % metrics[1] + " seconds"
        print(time_line)
Exemple #23
0
def get_result_k(data):
    """
    Vary k while the QI set and the data set size stay fixed.

    Runs Mondrian for k = 5, 10, ..., 100, converts each result with
    covert_to_raw, and prints NCP and running time for each value.
    """
    pristine = copy.deepcopy(data)
    for k in [5 * step for step in range(1, 21)]:
        print('#' * 30)
        print("K=%d" % k)
        anonymized, metrics = mondrian(data, k, RELAX)
        anonymized = covert_to_raw(anonymized)
        # Start the next run from an untouched copy of the input.
        data = copy.deepcopy(pristine)
        ncp_line = "NCP %0.2f" % metrics[0] + "%"
        print(ncp_line)
        time_line = "Running time %0.2f" % metrics[1] + " seconds"
        print(time_line)
Exemple #24
0
def get_result_k(data):
    """
    Vary k while the QI set and the data set size stay fixed.

    Runs Mondrian for k = 5, 10, ..., 100 and prints NCP and running time
    for each value; results pass through covert_to_raw when DATA_SELECT
    is 'a'.
    """
    pristine = copy.deepcopy(data)
    for k in [5 * step for step in range(1, 21)]:
        print('#' * 30)
        print("K=%d" % k)
        anonymized, metrics = mondrian(data, k, RELAX)
        if DATA_SELECT == 'a':
            anonymized = covert_to_raw(anonymized)
        # Start the next run from an untouched copy of the input.
        data = copy.deepcopy(pristine)
        ncp_line = "NCP %0.2f" % metrics[0] + "%"
        print(ncp_line)
        time_line = "Running time %0.2f" % metrics[1] + " seconds"
        print(time_line)
Exemple #25
0
 def test2_mondrian_relax(self):
     """mondrian(rows, 2, True) must report an NCP within 0.05 of 700/54."""
     rows = [[6, 1, 'haha'], [8, 1, 'haha'], [8, 1, 'test'],
             [8, 1, 'haha'], [8, 1, 'test'], [4, 1, 'hha'],
             [4, 2, 'hha'], [4, 3, 'hha'], [4, 4, 'hha']]
     _, report = mondrian(rows, 2, True)
     deviation = abs(report[0] - 700.0 / 54)
     self.assertTrue(deviation < 0.05)
Exemple #26
0
def get_result_k(data):
    """
    change k, whle fixing QD and size of dataset
    """
    data_back = copy.deepcopy(data)
    # for k in [2, 5, 10, 25, 50, 100]:
    for k in range(5, 105, 5):
        print "#" * 30
        print "K=%d" % k
        result, eval_result = mondrian(data, k, RELAX)
        if DATA_SELECT == "a":
            result = covert_to_raw(result)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        print "Running time %0.2f" % eval_result[1] + " seconds"
Exemple #27
0
def get_result_qi(data, k=10):
    """
    change nubmber of QI, whle fixing k and size of dataset
    """
    data_back = copy.deepcopy(data)
    num_data = len(data[0])
    for i in reversed(range(1, num_data)):
        print '#' * 30
        print "Number of QI=%d" % i
        result, eval_result = mondrian(data, k, RELAX, i)
        if DATA_SELECT == 'a':
            result = covert_to_raw(result)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        print "Running time %0.2f" % eval_result[1] + " seconds"
def get_result_one(att_trees, data, k=DEFAULT_K):
    "run mondrian for one time, with k=10"
    print "K=%d" % k
    data_back = copy.deepcopy(data)
    missing_rate(data)
    _, eval_result = mondrian_delete_missing(att_trees, data, k)
    print "Mondrian"
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + "seconds"
    print "Missing Pollution = %.2f %%" % eval_result[2]
    data = copy.deepcopy(data_back)
    _, eval_result = mondrian(att_trees, data, k)
    print "Enhanced Mondrian"
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + "seconds"
    print "Missing Pollution = %.2f %%" % eval_result[2]
def get_result_one(att_trees, data, k=DEFAULT_K):
    "run mondrian for one time, with k=10"
    print "K=%d" % k
    data_back = copy.deepcopy(data)
    missing_rate(data)
    _, eval_result = mondrian_delete_missing(att_trees, data, k)
    print "Mondrian"
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + "seconds"
    print "Missing Pollution = %.2f %%" % eval_result[2]
    data = copy.deepcopy(data_back)
    _, eval_result = mondrian(att_trees, data, k)
    print "Enhanced Mondrian"
    print "NCP %0.2f" % eval_result[0] + "%"
    print "Running time %0.2f" % eval_result[1] + "seconds"
    print "Missing Pollution = %.2f %%" % eval_result[2]
Exemple #30
0
def get_result_one(data, k=10):
    """
    Run Mondrian a single time, write the result out and print its metrics.

    When DATA_SELECT is 'a' the result goes through covert_to_raw; otherwise
    the last field of every record is joined into a comma-separated string.
    The result is then passed to write_to_file.
    """
    print("K=%d" % k)
    pristine = copy.deepcopy(data)
    anonymized, metrics = mondrian(data, k, RELAX)
    if DATA_SELECT != 'a':
        for record in anonymized:
            record[-1] = ','.join(record[-1])
    else:
        # Convert numerical values back to categorical values.
        anonymized = covert_to_raw(anonymized)
    write_to_file(anonymized)
    # Rebind the local name to a clean copy, as the original did.
    data = copy.deepcopy(pristine)
    ncp_line = "NCP %0.2f" % metrics[0] + "%"
    print(ncp_line)
    time_line = "Running time %0.2f" % metrics[1] + " seconds"
    print(time_line)
Exemple #31
0
def get_result_one(data, k=10):  # k: the anonymity parameter
    """
    Run Mondrian a single time (k defaults to 10) and report the outcome.

    The result is converted with covert_to_raw and passed to write_to_file;
    NCP, the process's USS memory in MB, and running time are printed.
    """
    print("K=%d" % k)
    pristine = copy.deepcopy(data)
    anonymized, metrics = mondrian(data, k, RELAX)
    # Convert numerical values back to categorical values.
    anonymized = covert_to_raw(anonymized)
    write_to_file(anonymized)
    # Rebind the local name to a clean copy, as the original did.
    data = copy.deepcopy(pristine)
    ncp_line = "NCP %0.2f" % metrics[0] + "%"
    print(ncp_line)
    # Memory usage of the current process, bytes -> MB.
    process = psutil.Process(os.getpid())
    uss_mb = process.memory_full_info().uss / 1024. / 1024.
    print('内存使用:', uss_mb, 'MB')
    time_line = "Running time %0.2f" % metrics[1] + " seconds"
    print(time_line)
Exemple #32
0
def get_result_one(data, k=10):
    """
    Run Mondrian a single time, write the result out and print its metrics.

    When DATA_SELECT is 'a' the result goes through covert_to_raw; otherwise
    the last field of every record is joined into a comma-separated string.
    The result is then passed to write_to_file.
    """
    print("K=%d" % k)
    pristine = copy.deepcopy(data)
    anonymized, metrics = mondrian(data, k, RELAX)
    if DATA_SELECT != 'a':
        for record in anonymized:
            record[-1] = ','.join(record[-1])
    else:
        # Convert numerical values back to categorical values.
        anonymized = covert_to_raw(anonymized)
    write_to_file(anonymized)
    # Rebind the local name to a clean copy, as the original did.
    data = copy.deepcopy(pristine)
    ncp_line = "NCP %0.2f" % metrics[0] + "%"
    print(ncp_line)
    time_line = "Running time %0.2f" % metrics[1] + " seconds"
    print(time_line)
Exemple #33
0
def KAnonymity(k, dataT):
    """
    Anonymize column-oriented data with Mondrian and return it.

    k: k parameter for k-anonymity.
    dataT: indexable by column; the first len(Quasi) columns are treated as
    quasi-identifiers and the rest, up to len(attrList), are passed through.

    Uses the Mondrian implementation from
    https://github.com/qiyuangong/Mondrian
    """
    dataT = Perturb(DOB_To_Year(dataT))

    qi_count = len(Quasi)
    decoders = []
    columns = []
    # Encode every quasi-identifier value as its rank by frequency
    # (most frequent value gets 0) so Mondrian receives numbers.
    for col in range(qi_count):
        by_frequency = list(dataT[col])
        by_frequency.sort(key=Counter(by_frequency).get, reverse=True)
        ranked = list(unique_everseen(by_frequency))
        encoder = dict(zip(ranked, range(len(ranked))))
        decoders.append(dict(enumerate(ranked)))
        columns.append([encoder[value] for value in dataT[col]])
    for col in range(qi_count, len(attrList)):
        columns.append(dataT[col])

    # Mondrian takes rows, so transpose the columns; False selects the
    # non-relaxed variant.
    rows = map(list, zip(*columns))
    anonymized = mondrian(rows, k, False, QI_num=qi_count)[0]

    resultT = Transpose(anonymized)
    # Decode ranks back to original values; a non-int cell is treated as a
    # collection of ranks and decoded element by element.
    for col in range(qi_count):
        decode = decoders[col]
        for j, cell in enumerate(resultT[col]):
            if type(cell) == int:
                dataT[col][j] = decode[cell]
            else:
                dataT[col][j] = [decode[x] for x in cell]

    for col in range(qi_count, len(attrList)):
        for j, cell in enumerate(resultT[col]):
            dataT[col][j] = cell

    return dataT
Exemple #34
0
def get_result_qi(att_trees, data, k=DEFAULT_K):
    """
    change nubmber of QI, whle fixing k and size of dataset
    """
    data_back = copy.deepcopy(data)
    ls = len(data[0])
    all_ncp = []
    all_rtime = []
    for i in range(1, ls):
        print '#' * 30
        print "Number of QI=%d" % i
        _, eval_result = mondrian(att_trees, data, k, i)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        all_ncp.append(round(eval_result[0], 2))
        print "Running time %0.2f" % eval_result[1] + "seconds"
        all_rtime.append(round(eval_result[1], 2))
    print "All NCP", all_ncp
    print "All Running time", all_rtime
Exemple #35
0
def get_result_qi(att_trees, data, k=DEFAULT_K):
    """
    change nubmber of QI, whle fixing k and size of dataset
    """
    data_back = copy.deepcopy(data)
    ls = len(data[0])
    all_ncp = []
    all_rtime = []
    for i in range(1, ls):
        print '#' * 30
        print "Number of QI=%d" % i
        _, eval_result = mondrian(att_trees, data, k, i)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        all_ncp.append(round(eval_result[0], 2))
        print "Running time %0.2f" % eval_result[1] + "seconds"
        all_rtime.append(round(eval_result[1], 2))
    print "All NCP", all_ncp
    print "All Running time", all_rtime
Exemple #36
0
def get_result_k(att_trees, data):
    """
    change k, whle fixing QD and size of dataset
    """
    data_back = copy.deepcopy(data)
    all_ncp = []
    all_rtime = []
    # for k in range(5, 105, 5):
    for k in [2, 5, 10, 25, 50, 100]:
        print '#' * 30
        print "K=%d" % k
        print "Mondrian"
        _, eval_result = mondrian(att_trees, data, k)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        all_ncp.append(round(eval_result[0], 2))
        print "Running time %0.2f" % eval_result[1] + "seconds"
        all_rtime.append(round(eval_result[1], 2))
    print "All NCP", all_ncp
    print "All Running time", all_rtime
Exemple #37
0
def get_result_k(att_trees, data):
    """
    change k, whle fixing QD and size of dataset
    """
    data_back = copy.deepcopy(data)
    all_ncp = []
    all_rtime = []
    # for k in range(5, 105, 5):
    for k in [2, 5, 10, 25, 50, 100]:
        print '#' * 30
        print "K=%d" % k
        print "Mondrian"
        _, eval_result = mondrian(att_trees, data, k)
        data = copy.deepcopy(data_back)
        print "NCP %0.2f" % eval_result[0] + "%"
        all_ncp.append(round(eval_result[0], 2))
        print "Running time %0.2f" % eval_result[1] + "seconds"
        all_rtime.append(round(eval_result[1], 2))
    print "All NCP", all_ncp
    print "All Running time", all_rtime
def get_result_qi(att_trees, data, k=DEFAULT_K):
    """
    change nubmber of QI, whle fixing k and size of dataset
    """
    data_back = copy.deepcopy(data)
    ls = len(data[0])
    all_ncp = []
    all_rtime = []
    all_pollution = []
    deletion_all_ncp = []
    deletion_all_rtime = []
    for i in range(1, ls):
        if __DEBUG:
            print '#' * 30
            print "Number of QI=%d" % i
            print "Enhanced Mondrian"
        _, eval_result = mondrian(att_trees, data, k, i)
        data = copy.deepcopy(data_back)
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
            print "Missing Pollution = %.2f %%" % eval_result[2] + "%"
            print "Mondrian"
        all_ncp.append(round(eval_result[0], 2))
        all_rtime.append(round(eval_result[1], 2))
        all_pollution.append(round(eval_result[2], 2))
        _, eval_result = mondrian_delete_missing(att_trees, data, k, i)
        data = copy.deepcopy(data_back)
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
            print "Mondrian"
        deletion_all_ncp.append(round(eval_result[0], 2))
        deletion_all_rtime.append(round(eval_result[1], 2))
    print "Mondrian"
    print "All NCP", deletion_all_ncp
    print "All Running time", deletion_all_rtime
    print "Enhanced Mondrian"
    print "All NCP", all_ncp
    print "All Running time", all_rtime
    print "Missing Pollution", all_pollution
def get_result_qi(att_trees, data, k=DEFAULT_K):
    """
    change nubmber of QI, whle fixing k and size of dataset
    """
    data_back = copy.deepcopy(data)
    ls = len(data[0])
    all_ncp = []
    all_rtime = []
    all_pollution = []
    deletion_all_ncp = []
    deletion_all_rtime = []
    for i in range(1, ls):
        if __DEBUG:
            print '#' * 30
            print "Number of QI=%d" % i
            print "Enhanced Mondrian"
        _, eval_result = mondrian(att_trees, data, k, i)
        data = copy.deepcopy(data_back)
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
            print "Missing Pollution = %.2f %%" % eval_result[2] + "%"
            print "Mondrian"
        all_ncp.append(round(eval_result[0], 2))
        all_rtime.append(round(eval_result[1], 2))
        all_pollution.append(round(eval_result[2], 2))
        _, eval_result = mondrian_delete_missing(att_trees, data, k, i)
        data = copy.deepcopy(data_back)
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
            print "Mondrian"
        deletion_all_ncp.append(round(eval_result[0], 2))
        deletion_all_rtime.append(round(eval_result[1], 2))
    print "Mondrian"
    print "All NCP", deletion_all_ncp
    print "All Running time", deletion_all_rtime
    print "Enhanced Mondrian"
    print "All NCP", all_ncp
    print "All Running time", all_rtime
    print "Missing Pollution", all_pollution
def get_result_k(att_trees, data):
    """
    change k, whle fixing QD and size of dataset
    """
    data_back = copy.deepcopy(data)
    all_ncp = []
    all_rtime = []
    all_pollution = []
    deletion_all_ncp = []
    deletion_all_rtime = []
    # for k in range(5, 105, 5):
    for k in [2, 5, 10, 25, 50, 100]:
        if __DEBUG:
            print '#' * 30
            print "K=%d" % k
            print "Enhanced Mondrian"
        _, eval_result = mondrian(att_trees, data, k)
        data = copy.deepcopy(data_back)
        all_ncp.append(round(eval_result[0], 2))
        all_rtime.append(round(eval_result[1], 2))
        all_pollution.append(round(eval_result[2], 2))
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
            print "Missing Pollution = %.2f %%" % eval_result[2]
            print "Mondrian"
        _, eval_result = mondrian_delete_missing(att_trees, data, k)
        data = copy.deepcopy(data_back)
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
        deletion_all_ncp.append(round(eval_result[0], 2))
        deletion_all_rtime.append(round(eval_result[1], 2))
    print "Mondrian"
    print "All NCP", deletion_all_ncp
    print "All Running time", deletion_all_rtime
    print "Enhanced Mondrian"
    print "All NCP", all_ncp
    print "All Running time", all_rtime
    print "Missing Pollution", all_pollution
def get_result_k(att_trees, data):
    """
    change k, whle fixing QD and size of dataset
    """
    data_back = copy.deepcopy(data)
    all_ncp = []
    all_rtime = []
    all_pollution = []
    deletion_all_ncp = []
    deletion_all_rtime = []
    # for k in range(5, 105, 5):
    for k in [2, 5, 10, 25, 50, 100]:
        if __DEBUG:
            print '#' * 30
            print "K=%d" % k
            print "Enhanced Mondrian"
        _, eval_result = mondrian(att_trees, data, k)
        data = copy.deepcopy(data_back)
        all_ncp.append(round(eval_result[0], 2))
        all_rtime.append(round(eval_result[1], 2))
        all_pollution.append(round(eval_result[2], 2))
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
            print "Missing Pollution = %.2f %%" % eval_result[2]
            print "Mondrian"
        _, eval_result = mondrian_delete_missing(att_trees, data, k)
        data = copy.deepcopy(data_back)
        if __DEBUG:
            print "NCP %0.2f" % eval_result[0] + "%"
            print "Running time %0.2f" % eval_result[1] + "seconds"
        deletion_all_ncp.append(round(eval_result[0], 2))
        deletion_all_rtime.append(round(eval_result[1], 2))
    print "Mondrian"
    print "All NCP", deletion_all_ncp
    print "All Running time", deletion_all_rtime
    print "Enhanced Mondrian"
    print "All NCP", all_ncp
    print "All Running time", all_rtime
    print "Missing Pollution", all_pollution
Exemple #42
0
def get_result_dataset(att_trees, data, k=DEFAULT_K, n=10):
    """
    fix k and QI, while changing size of dataset
    n is the proportion nubmber.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    print "K=%d" % k
    joint = 5000
    datasets = []
    check_time = length / joint
    if length % joint == 0:
        check_time -= 1
    for i in range(check_time):
        datasets.append(joint * (i + 1))
    datasets.append(length)
    all_ncp = []
    all_rtime = []
    for pos in datasets:
        ncp = rtime = 0
        print '#' * 30
        print "size of dataset %d" % pos
        for j in range(n):
            temp = random.sample(data, pos)
            result, eval_result = mondrian(att_trees, temp, k)
            ncp += eval_result[0]
            rtime += eval_result[1]
            data = copy.deepcopy(data_back)
        ncp /= n
        rtime /= n
        print "Average NCP %0.2f" % ncp + "%"
        all_ncp.append(round(ncp, 2))
        print "Running time %0.2f" % rtime + "seconds"
        all_rtime.append(round(rtime, 2))
    print '#' * 30
    print "All NCP", all_ncp
    print "All Running time", all_rtime
Exemple #43
0
def get_result_dataset(att_trees, data, k=DEFAULT_K, n=10):
    """
    fix k and QI, while changing size of dataset
    n is the proportion nubmber.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    print "K=%d" % k
    joint = 5000
    datasets = []
    check_time = length / joint
    if length % joint == 0:
        check_time -= 1
    for i in range(check_time):
        datasets.append(joint * (i + 1))
    datasets.append(length)
    all_ncp = []
    all_rtime = []
    for pos in datasets:
        ncp = rtime = 0
        print '#' * 30
        print "size of dataset %d" % pos
        for j in range(n):
            temp = random.sample(data, pos)
            result, eval_result = mondrian(att_trees, temp, k)
            ncp += eval_result[0]
            rtime += eval_result[1]
            data = copy.deepcopy(data_back)
        ncp /= n
        rtime /= n
        print "Average NCP %0.2f" % ncp + "%"
        all_ncp.append(round(ncp, 2))
        print "Running time %0.2f" % rtime + "seconds"
        all_rtime.append(round(rtime, 2))
    print '#' * 30
    print "All NCP", all_ncp
    print "All Running time", all_rtime
Exemple #44
0
 def test2_mondrian_strict(self):
     """mondrian(records, 2, False) must report an NCP within 0.05 of 2300/108."""
     records = [
         [6, 1, 'haha'],
         [8, 1, 'haha'],
         [8, 1, 'test'],
         [8, 1, 'haha'],
         [8, 1, 'test'],
         [4, 1, 'hha'],
         [4, 2, 'hha'],
         [4, 3, 'hha'],
         [4, 4, 'hha'],
     ]
     _, evaluation = mondrian(records, 2, False)
     expected_ncp = 2300.0 / 108
     self.assertTrue(abs(evaluation[0] - expected_ncp) < 0.05)
Exemple #45
0
 def test1_mondrian_relax(self):
     """mondrian(records, 2, True) must report an NCP within 0.05 of 100/12."""
     records = [
         [6, 1, 'haha'],
         [6, 1, 'test'],
         [8, 2, 'haha'],
         [8, 2, 'test'],
         [4, 1, 'hha'],
         [4, 2, 'hha'],
         [4, 3, 'hha'],
         [4, 4, 'hha'],
     ]
     _, evaluation = mondrian(records, 2, True)
     expected_ncp = 100.0 / 12
     self.assertTrue(abs(evaluation[0] - expected_ncp) < 0.05)
def get_result_dataset(att_trees, data, k=DEFAULT_K, n=10):
    """
    fix k and QI, while changing size of dataset
    n is the proportion nubmber.
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    print "K=%d" % k
    joint = 5000
    datasets = []
    check_time = length / joint
    if length % joint == 0:
        check_time -= 1
    for i in range(check_time):
        datasets.append(joint * (i + 1))
    datasets.append(length)
    all_ncp = []
    all_rtime = []
    all_pollution = []
    deletion_all_ncp = []
    deletion_all_rtime = []
    for pos in datasets:
        ncp = rtime = pollution = 0
        if __DEBUG:
            print '#' * 30
            print "size of dataset %d" % pos
            print "Enhanced Mondrian"
        for j in range(n):
            temp = random.sample(data, pos)
            result, eval_result = mondrian(att_trees, temp, k)
            ncp += eval_result[0]
            rtime += eval_result[1]
            pollution += eval_result[2]
            data = copy.deepcopy(data_back)
            # save_to_file((att_trees, temp, result, k, L))
        ncp /= n
        rtime /= n
        pollution /= n
        if __DEBUG:
            print "Average NCP %0.2f" % ncp + "%"
            print "Running time %0.2f" % rtime + "seconds"
            print "Missing Pollution = %.2f %%" % pollution + "%"
            print "Mondrian"
        all_ncp.append(round(ncp, 2))
        all_rtime.append(round(rtime, 2))
        all_pollution.append(round(pollution, 2))
        ncp = rtime = 0
        for j in range(n):
            temp = random.sample(data, pos)
            result, eval_result = mondrian_delete_missing(att_trees, temp, k)
            ncp += eval_result[0]
            rtime += eval_result[1]
            data = copy.deepcopy(data_back)
        ncp /= n
        rtime /= n
        if __DEBUG:
            print "Average NCP %0.2f" % ncp + "%"
            print "Running time %0.2f" % rtime + "seconds"
        deletion_all_ncp.append(round(ncp, 2))
        deletion_all_rtime.append(round(rtime, 2))
    print "Mondrian"
    print "All NCP", deletion_all_ncp
    print "All Running time", deletion_all_rtime
    print "Enhanced Mondrian"
    print "All NCP", all_ncp
    print "All Running time", all_rtime
    print "Missing Pollution", all_pollution
def get_result_missing(att_trees, data, k=DEFAULT_K, n=DEFAULT_K):
    """
    change nubmber of missing, whle fixing k, qi and size of dataset
    """
    data_back = copy.deepcopy(data)
    length = len(data_back)
    qi_len = len(data[0]) - 1
    raw_missing = raw_missing_record = 0
    print "K=%d" % k
    for record in data:
        flag = False
        for value in record:
            if value == '*':
                raw_missing += 1
                flag = True
        if flag:
            raw_missing_record += 1
    # print "Missing Percentage %.2f" % (raw_missing * 100.0 / (length * qi_len)) + '%%'
    # each evaluation varies add 5% missing values
    check_percentage = [5, 10, 25, 50, 75]
    datasets = []
    for p in check_percentage:
        joint = int(0.01 * p * length * qi_len) - raw_missing
        datasets.append(joint)
    all_ncp = []
    all_rtime = []
    all_pollution = []
    deletion_all_ncp = []
    deletion_all_rtime = []
    for i, joint in enumerate(datasets):
        ncp = rtime = pollution = 0.0
        for j in range(n):
            gen_missing_dataset(data, joint)
            if __DEBUG:
                missing_rate(data)
            _, eval_result = mondrian(att_trees, data, k)
            data = copy.deepcopy(data_back)
            ncp += eval_result[0]
            rtime += eval_result[1]
            pollution += eval_result[2]
        ncp /= n
        rtime /= n
        pollution /= n
        if __DEBUG:
            print "check_percentage", check_percentage[i]
            print "Add missing %d" % joint
            print "Average NCP %0.2f" % ncp + "%"
            print "Running time %0.2f" % rtime + "seconds"
            print "Missing Pollution = %.2f" % pollution + "%"
            print '#' * 30
        all_ncp.append(round(ncp, 2))
        all_rtime.append(round(rtime, 2))
        all_pollution.append(round(pollution, 2))
        ncp = rtime = pollution = 0.0
        for j in range(n):
            gen_missing_dataset(data, joint)
            if __DEBUG:
                missing_rate(data)
            _, eval_result = mondrian_delete_missing(att_trees, data, k)
            data = copy.deepcopy(data_back)
            ncp += eval_result[0]
            rtime += eval_result[1]
        ncp /= n
        rtime /= n
        if __DEBUG:
            print "Add missing %d" % joint
            print "Average NCP %0.2f" % ncp + "%"
            print "Running time %0.2f" % rtime + "seconds"
            print "Missing Pollution = %.2f" % pollution + "%"
            print '#' * 30
        deletion_all_ncp.append(round(ncp, 2))
        deletion_all_rtime.append(round(rtime, 2))
    print "Mondrian"
    print "All NCP", deletion_all_ncp
    print "All Running time", deletion_all_rtime
    print "Enhanced Mondrian"
    print "All NCP", all_ncp
    print "All Running time", all_rtime
    print "Missing Pollution", all_pollution
    print '#' * 30
Exemple #48
0
 def test_read_csv_and_anonymise(self):
     """Run mondrian on the records returned by
     utils.read_adult_data.read_data and print the result; nothing is
     asserted."""
     from utils.read_adult_data import read_data as load_adult
     records, _order = load_adult()
     anonymised, _evaluation = mondrian(records, 40, False)
     print(anonymised)
Exemple #49
0
def get_result_one(data, mylist, k, p):
    """Run mondrian once with the given arguments and return its two
    results as a (result, eval_result) tuple."""
    anonymised, evaluation = mondrian(data, mylist, k, p)
    return anonymised, evaluation