예제 #1
0
파일: APA.py 프로젝트: zshwuhan/APA
def APA(att_tree, data, K=10, L=5):
    """Anatomize ``data``, generalize its SA column, then merge groups.

    The last element of every record (``record[-1]``) is treated as the
    SA (transaction) value. ``anatomizer`` is run first to obtain groups of
    record indexes; ``partition`` then generalizes the SA values; finally,
    groups that fail ``check_diversity`` are merged until they pass.

    Args:
        att_tree: taxonomy handed unchanged to ``partition`` and ``get_range``.
        data: list of records. NOTE: the records are modified in place --
            each ``record[-1]`` is overwritten with its generalized value.
        K: forwarded to ``partition``.
        L: forwarded to ``anatomizer``, ``check_diversity`` and ``mergeable``.

    Returns:
        A list with one entry per grouped record: a copy of the record
        followed by the group id, ``1.0 / len(group)`` and the group's
        ``SA_list`` dict (original SA string -> accumulated weight). The same
        ``SA_list`` object is shared by every record of a group.

    Side effects:
        Rebinds the module globals ``gl_att_tree`` and ``gl_data`` and prints
        progress/timing information.
    """
    # Initialization
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    result = []
    suppress = []
    # generalized SA (as str) -> set of original SA strings it replaced
    tran_tree = {}
    print("size of dataset %d" % len(gl_data))
    # Begin Anatomy
    print("Begin Anatomy")
    anatomy_index = anatomizer(gl_data, L)
    # Begin Partition
    trans = [t[-1] for t in gl_data]
    trans_set = partition(att_tree, trans, K)
    for index_list, tran_value in trans_set:
        parent = list_to_str(tran_value, cmp)
        # Explicit membership test instead of a bare ``except`` so that
        # unrelated errors are not silently swallowed.
        if parent not in tran_tree:
            tran_tree[parent] = set()
        for t in index_list:
            # Record the original value *before* it is overwritten below.
            leaf = list_to_str(gl_data[t][-1], cmp)
            tran_tree[parent].add(leaf)
            gl_data[t][-1] = tran_value[:]
    # Merge groups to achieve l-diversity
    residue = []
    grouped_index = []
    for group in anatomy_index:
        if check_diversity(group, L):
            grouped_index.append(group[:])
        else:
            residue.append(group[:])
    while residue:
        g = residue.pop()
        for index, group in enumerate(residue):
            if mergeable(g, group, L):
                grouped_index.append(g + group)
                # Safe to pop while iterating: we leave the loop right away.
                residue.pop(index)
                break
        else:
            # No partner found: add the group's elements to a random group
            # which already satisfies l-diversity.
            if grouped_index:
                seed = random.randrange(len(grouped_index))
                grouped_index[seed] = grouped_index[seed] + g
            else:
                print("Error: group cannot satisfy l-diversity")
                for index in g:
                    suppress.append(gl_data[index])
    if _DEBUG:
        print('NO. of Suppress after Group Merge = %d' % len(suppress))
        print('NO. of groups = %d' % len(grouped_index))
    grouped_result = [[gl_data[index] for index in indexes]
                      for indexes in grouped_index]
    print("--- %s seconds ---" % (time.time() - start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|,
    # SA_list (dict): original SA (str) sets with prob)
    # 1/|group size| and SA_list will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        SA_list = {}
        parent_list = {}
        for t in group:
            parent = list_to_str(t[-1], cmp)
            parent_list[parent] = get_range(att_tree, t[-1])
        # Every original SA covered by one of the group's generalized values
        # receives that value's range divided by the group size.
        for parent, gen_range in parent_list.items():
            # NOTE(review): on Python 2 this is integer division if get_range
            # returns an int -- confirm get_range yields a float.
            share = gen_range / length
            for leaf in tran_tree[parent]:
                if leaf in SA_list:
                    SA_list[leaf] += share
                else:
                    SA_list[leaf] = share
        for t in group:
            temp = t[:]
            temp.append(index)
            temp.append(1.0 / length)
            temp.append(SA_list)
            result.append(temp)
    return result
예제 #2
0
파일: PAA.py 프로젝트: zshwuhan/PAA
def PAA(att_tree, data, K=10, L=5):
    """Generalize the SA column with ``partition``, then anatomize the groups.

    The last element of every record (``record[-1]``) is treated as the
    SA (transaction) value. ``partition`` is run first; each partition becomes
    a group of shallow record copies whose SA is replaced by the generalized
    value, and the list of groups is then handed to ``anatomizer``.

    Args:
        att_tree: taxonomy handed unchanged to ``partition`` and ``get_range``.
        data: list of records; records are shallow-copied before their SA
            value is replaced.
        K: forwarded to ``partition``.
        L: forwarded to ``anatomizer``.

    Returns:
        A list with one entry per record returned by ``anatomizer``: a copy
        of the record followed by the group id, ``1.0 / len(group)`` and the
        group's ``SA_list`` dict (original SA string -> accumulated weight).
        The same ``SA_list`` object is shared by every record of a group.

    Side effects:
        Rebinds the module globals ``gl_att_tree`` and ``gl_data`` and prints
        progress/timing information.
    """
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    # generalized SA (as str) -> set of original SA strings it replaced
    tran_tree = {}
    print("size of dataset %d" % len(gl_data))
    result = []
    trans = [t[-1] for t in gl_data]
    trans_set = partition(att_tree, trans, K)
    grouped_data = []
    for index_list, tran_value in trans_set:
        parent = list_to_str(tran_value, cmp)
        # Explicit membership test instead of a bare ``except`` so that
        # unrelated errors are not silently swallowed.
        if parent not in tran_tree:
            tran_tree[parent] = set()
        gtemp = []
        for t in index_list:
            temp = gl_data[t][:]
            # Remember the original value before replacing it in the copy.
            leaf = list_to_str(temp[-1], cmp)
            tran_tree[parent].add(leaf)
            temp[-1] = tran_value[:]
            gtemp.append(temp)
        grouped_data.append(gtemp)
    print("Begin Anatomy")
    grouped_result = anatomizer(grouped_data, L)
    print("--- %s seconds ---" % (time.time() - start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|,
    # SA_list (dict): original SA (str) sets with prob)
    # 1/|group size| and SA_list will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        SA_list = {}
        parent_list = {}
        for t in group:
            parent = list_to_str(t[-1], cmp)
            parent_list[parent] = get_range(att_tree, t[-1])
        # Every original SA covered by one of the group's generalized values
        # receives that value's range divided by the group size.
        for parent, gen_range in parent_list.items():
            # NOTE(review): on Python 2 this is integer division if get_range
            # returns an int -- confirm get_range yields a float.
            share = gen_range / length
            for leaf in tran_tree[parent]:
                if leaf in SA_list:
                    SA_list[leaf] += share
                else:
                    SA_list[leaf] = share
        for t in group:
            temp = t[:]
            temp.append(index)
            temp.append(1.0 / length)
            temp.append(SA_list)
            result.append(temp)
    return result
예제 #3
0
def APA(att_tree, data, K=10, L=5):
    """Anatomize ``data``, generalize its SA column, then merge groups.

    The last element of every record (``record[-1]``) is treated as the
    SA (transaction) value. ``anatomizer`` is run first to obtain groups of
    record indexes; ``partition`` then generalizes the SA values; finally,
    groups that fail ``check_diversity`` are merged until they pass.

    Args:
        att_tree: taxonomy handed unchanged to ``partition`` and ``get_range``.
        data: list of records. NOTE: the records are modified in place --
            each ``record[-1]`` is overwritten with its generalized value.
        K: forwarded to ``partition``.
        L: forwarded to ``anatomizer``, ``check_diversity`` and ``mergeable``.

    Returns:
        A list with one entry per grouped record: a copy of the record
        followed by the group id, ``1.0 / len(group)`` and the group's
        ``SA_list`` dict (original SA string -> accumulated weight). The same
        ``SA_list`` object is shared by every record of a group.

    Side effects:
        Rebinds the module globals ``gl_att_tree`` and ``gl_data`` and prints
        progress/timing information.
    """
    # Initialization
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    result = []
    suppress = []
    # generalized SA (as str) -> set of original SA strings it replaced
    tran_tree = {}
    print("size of dataset %d" % len(gl_data))
    # Begin Anatomy
    print("Begin Anatomy")
    anatomy_index = anatomizer(gl_data, L)
    # Begin Partition
    trans = [t[-1] for t in gl_data]
    trans_set = partition(att_tree, trans, K)
    for index_list, tran_value in trans_set:
        parent = list_to_str(tran_value, cmp)
        # Explicit membership test instead of a bare ``except`` so that
        # unrelated errors are not silently swallowed.
        if parent not in tran_tree:
            tran_tree[parent] = set()
        for t in index_list:
            # Record the original value *before* it is overwritten below.
            leaf = list_to_str(gl_data[t][-1], cmp)
            tran_tree[parent].add(leaf)
            gl_data[t][-1] = tran_value[:]
    # Merge groups to achieve l-diversity
    residue = []
    grouped_index = []
    for group in anatomy_index:
        if check_diversity(group, L):
            grouped_index.append(group[:])
        else:
            residue.append(group[:])
    while residue:
        g = residue.pop()
        for index, group in enumerate(residue):
            if mergeable(g, group, L):
                grouped_index.append(g + group)
                # Safe to pop while iterating: we leave the loop right away.
                residue.pop(index)
                break
        else:
            # No partner found: add the group's elements to a random group
            # which already satisfies l-diversity.
            if grouped_index:
                seed = random.randrange(len(grouped_index))
                grouped_index[seed] = grouped_index[seed] + g
            else:
                print("Error: group cannot satisfy l-diversity")
                for index in g:
                    suppress.append(gl_data[index])
    if _DEBUG:
        print('NO. of Suppress after Group Merge = %d' % len(suppress))
        print('NO. of groups = %d' % len(grouped_index))
    grouped_result = [[gl_data[index] for index in indexes]
                      for indexes in grouped_index]
    print("--- %s seconds ---" % (time.time() - start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|,
    # SA_list (dict): original SA (str) sets with prob)
    # 1/|group size| and SA_list will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        SA_list = {}
        parent_list = {}
        for t in group:
            parent = list_to_str(t[-1], cmp)
            parent_list[parent] = get_range(att_tree, t[-1])
        # Every original SA covered by one of the group's generalized values
        # receives that value's range divided by the group size.
        for parent, gen_range in parent_list.items():
            # NOTE(review): on Python 2 this is integer division if get_range
            # returns an int -- confirm get_range yields a float.
            share = gen_range / length
            for leaf in tran_tree[parent]:
                if leaf in SA_list:
                    SA_list[leaf] += share
                else:
                    SA_list[leaf] = share
        for t in group:
            temp = t[:]
            temp.append(index)
            temp.append(1.0 / length)
            temp.append(SA_list)
            result.append(temp)
    return result