def m_generalization(att_trees, data, k=10, l=5): """Using partition_for_transaction to anonymize SA (transaction) partition, while applying anatomy to separate QID and SA return (result, eval_result) result is 2-dimensional list eval_result is a tuple (rncp, tncp, rtime) """ global ATT_TREES, DATA ATT_TREES = att_trees DATA = data start_time = time.time() if _DEBUG: print "size of dataset %d" % len(data) result = [] trans = [t[-1] for t in data] trans_set, tncp = partition(att_trees[-1], trans, k) partition_data = [] for ttemp in trans_set: (index_list, tran_value) = ttemp for t in index_list: DATA[t][-1] = tran_value[:] partition_data.append(DATA[t][:]) if _DEBUG: print "Begin Mondrian" result, rncp = mondrian_l_diversity(ATT_TREES, partition_data, l) rtime = float(time.time() - start_time) if _DEBUG: print "Total running time = %.2f seconds" % rtime # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|, Group SA domain) # 1/|group size|, Group SA domain will be used in evaluation return (result, (rncp, tncp, rtime))
def Separation_Gen(att_trees, data, k=10, l=5): """Using partition_for_transaction to anonymize SA (transaction) partition, while applying anatomy to separate QID and SA return (result, eval_result) result is 2-dimensional list eval_result is a tuple (rncp, tncp, rtime) """ global ATT_TREES, DATA ATT_TREES = att_trees DATA = data start_time = time.time() if _DEBUG: print "size of dataset %d" % len(data) result = [] # copy transaction part of data to trans trans = [t[-1] for t in data] # anonymize transaction part with partition algorithm trans_set, sa_ncp = partition(att_trees[-1], trans, k) partition_data = [] for ttemp in trans_set: (index_list, tran_value) = ttemp for t in index_list: DATA[t][-1] = tran_value[:] partition_data.append(DATA[t][:]) if _DEBUG: print "Begin Mondrian" # anonymize qid and sa part with mondrian_l_diversity result, qid_ncp = mondrian_l_diversity(ATT_TREES, partition_data, l) rtime = float(time.time() - start_time) if _DEBUG: print "Total running time = %.2f seconds" % rtime # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|, Group SA domain) # 1/|group size|, Group SA domain will be used in evaluation return (result, (qid_ncp, sa_ncp, rtime))
def get_result_dataset(att_tree, data, k=10, num_test=10): """ fix k, while changing size of dataset num_test is the test nubmber. """ data_back = copy.deepcopy(data) length = len(data_back) joint = 5000 dataset_num = length / joint if length % joint == 0: dataset_num += 1 for i in range(1, dataset_num + 1): pos = i * joint ncp = rtime = 0 if pos > length: continue print '#' * 30 print "size of dataset %d" % pos for j in range(num_test): temp = random.sample(data, pos) _, eval_result = partition(att_tree, temp, k) ncp += eval_result[0] rtime += eval_result[1] data = copy.deepcopy(data_back) ncp /= num_test rtime /= num_test print "Average NCP %0.2f" % ncp + "%" print "Running time %0.2f" % rtime + " seconds" print '#' * 30
def get_result_one(att_tree, data, k=10): """ run partition for one time, with k=10 """ print "K=%d" % k _, eval_result = partition(att_tree, data, k) print "NCP %0.2f" % eval_result[0] + "%" print "Running time %0.2f" % eval_result[1] + " seconds"
def get_result_k(att_tree, data): """ change k, whle fixing size of dataset """ data_back = copy.deepcopy(data) # for k in range(5, 105, 5): for k in [2, 5, 10, 25, 50, 100]: print '#' * 30 print "K=%d" % k result, eval_result = partition(att_tree, data, k) data = copy.deepcopy(data_back) print "NCP %0.2f" % eval_result[0] + "%" print "Running time %0.2f" % eval_result[1] + " seconds"
def test_case_from_paper(self):
    """Verify partition(k=2) on the example transactions from the paper.

    The expected result contains duplicate entries, so the comparison is
    done on sorted lists: the original set-based comparison collapsed
    duplicates and therefore never checked multiplicity.
    """
    init_tree()
    trans = [
        ["a1"],
        ["a1", "a2"],
        ["b1", "b2"],
        ["b1", "b2"],
        ["a1", "a2", "b2"],
        ["a1", "a2", "b2"],
        ["a1", "a2", "b1", "b2"],
    ]
    result, _ = partition(ATT_TREE, trans, 2)
    flat = [list_to_str(t) for t in result]
    expected = ["A", "A", "a1;a2;B", "a1;a2;B", "a1;a2;B", "b1;b2", "b1;b2"]
    self.assertEqual(sorted(flat), sorted(expected))
def PAA(att_tree, data, K=10, L=5):
    """Using Partition to anonymize SA (transaction) partition, while applying Anatomize to separate QID and SA

    Steps:
      1. k-anonymize the transaction (last) attribute with partition().
      2. Group records by their generalized transaction value.
      3. Run anatomizer() over the groups for l-diversity.
      4. Append (GroupID, 1/|group size|, SA prob dict) to every record.
    """
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    # maps generalized transaction (str) -> set of original leaf transactions (str)
    tran_tree = {}
    print "size of dataset %d" % len(gl_data)
    result = []
    # transaction part (last column) of every record
    trans = [t[-1] for t in gl_data]
    # NOTE(review): unlike m_generalization/Separation_Gen, the return value
    # is used directly rather than unpacked as (trans_set, ncp) — confirm this
    # partition() variant returns only the partition list.
    trans_set = partition(att_tree, trans, K)
    grouped_data = []
    for ttemp in trans_set:
        (index_list, tran_value) = ttemp
        parent = list_to_str(tran_value, cmp)
        # EAFP: create the leaf set the first time this parent is seen
        try:
            tran_tree[parent]
        except:
            tran_tree[parent] = set()
        gtemp = []
        for t in index_list:
            temp = gl_data[t][:]
            leaf = list_to_str(temp[-1], cmp)
            tran_tree[parent].add(leaf)
            # replace original transaction with its generalized value
            temp[-1] = tran_value[:]
            gtemp.append(temp)
        grouped_data.append(gtemp)
    print "Begin Anatomy"
    grouped_result = anatomizer(grouped_data, L)
    print("--- %s seconds ---" % (time.time()-start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|, SA_list (dict) :original SA (str) sets with prob)
    # 1/|group size|, original SA sets with prob (dict) will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        leaf_list = []
        SA_list = {}
        # maps generalized parent (str) -> its generalization range
        parent_list = {}
        for t in group:
            parent = list_to_str(t[-1], cmp)
            gen_range = get_range(att_tree, t[-1])
            leaf_list = leaf_list + list(tran_tree[parent])
            parent_list[parent] = gen_range
        # all transactions covered by this group
        leaf_list = list(set(leaf_list))
        # pdb.set_trace()
        # accumulate per-leaf probability mass within this group
        # NOTE(review): `parent_list[p]/length` uses '/' — under Python 2 this
        # truncates if get_range() returns an int; confirm it yields a float.
        for temp in leaf_list:
            for p in parent_list.keys():
                if temp in tran_tree[p]:
                    try:
                        SA_list[temp] += parent_list[p]/length
                    except:
                        SA_list[temp] = parent_list[p]/length
        # pdb.set_trace()
        for t in group:
            temp = t[:]
            temp.append(index)        # GroupID
            temp.append(1.0/length)   # 1/|group size|
            temp.append(SA_list)      # shared per-group probability dict
            result.append(temp)
    return result
def APA(att_tree, data, K=10, L=5):
    """Using Partition to anonymize SA (transaction) partition, while applying Anatomizer to separate QID and SA

    Steps:
      1. Run anatomizer() on the raw data for candidate groups.
      2. k-anonymize the transaction (last) attribute with partition().
      3. Merge anatomy groups until each satisfies l-diversity,
         suppressing records only when no merge is possible.
      4. Append (GroupID, 1/|group size|, SA prob dict) to every record.
    """
    # Initialization
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    result = []
    suppress = []
    # maps generalized transaction (str) -> set of original leaf transactions (str)
    tran_tree = {}
    print "size of dataset %d" % len(gl_data)
    # Begin Anatomy
    print "Begin Anatomy"
    anatomy_index = anatomizer(gl_data, L)
    # Begin Partition
    trans = [t[-1] for t in gl_data]
    # NOTE(review): return value used directly, not unpacked as
    # (trans_set, ncp) — confirm this partition() variant's contract.
    trans_set = partition(att_tree, trans, K)
    for ttemp in trans_set:
        (index_list, tran_value) = ttemp
        parent = list_to_str(tran_value, cmp)
        # EAFP: create the leaf set the first time this parent is seen
        try:
            tran_tree[parent]
        except:
            tran_tree[parent] = set()
        for t in index_list:
            leaf = list_to_str(gl_data[t][-1], cmp)
            tran_tree[parent].add(leaf)
            # replace original transaction with its generalized value in place
            gl_data[t][-1] = tran_value[:]
    # pdb.set_trace()
    # Merge groups to achieve l-diversity
    residue = []
    grouped_index = []
    for group in anatomy_index:
        if check_diversity(group, L):
            grouped_index.append(group[:])
        else:
            residue.append(group[:])
    while len(residue) > 0:
        g = residue.pop()
        for index, group in enumerate(residue):
            if mergeable(g, group, L):
                g = g + group
                grouped_index.append(g)
                residue.pop(index)
                break
        else:
            # no mergeable partner found (for/else):
            # add group element to random group, which already satisfies l-diversity
            if len(grouped_index) > 0:
                seed = random.randrange(len(grouped_index))
                grouped_index[seed] = grouped_index[seed] + g
            else:
                # nothing to merge into: suppress the whole group
                print "Error: group cannot satisfy l-diversity"
                for index in g:
                    suppress.append(gl_data[index])
    if _DEBUG:
        print 'NO. of Suppress after Group Merge = %d' % len(suppress)
        print 'NO. of groups = %d' % len(grouped_index)
    # materialize index groups into record groups
    grouped_result = []
    for indexes in grouped_index:
        gtemp = []
        for index in indexes:
            gtemp.append(gl_data[index])
        grouped_result.append(gtemp)
    print("--- %s seconds ---" % (time.time()-start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|, SA_list (dict) :original SA (str) sets with prob)
    # 1/|group size|, original SA sets with prob (dict) will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        leaf_list = []
        SA_list = {}
        # maps generalized parent (str) -> its generalization range
        parent_list = {}
        for t in group:
            parent = list_to_str(t[-1], cmp)
            gen_range = get_range(att_tree, t[-1])
            leaf_list = leaf_list + list(tran_tree[parent])
            parent_list[parent] = gen_range
        # all transactions covered by this group
        leaf_list = list(set(leaf_list))
        # pdb.set_trace()
        # accumulate per-leaf probability mass within this group
        # NOTE(review): `parent_list[p]/length` uses '/' — under Python 2 this
        # truncates if get_range() returns an int; confirm it yields a float.
        for temp in leaf_list:
            for p in parent_list.keys():
                if temp in tran_tree[p]:
                    try:
                        SA_list[temp] += parent_list[p]/length
                    except:
                        SA_list[temp] = parent_list[p]/length
        # pdb.set_trace()
        for t in group:
            temp = t[:]
            temp.append(index)        # GroupID
            temp.append(1.0/length)   # 1/|group size|
            temp.append(SA_list)      # shared per-group probability dict
            result.append(temp)
    return result
# NOTE(review): this is a duplicate definition of APA (identical except for
# whitespace around operators). Being defined later, it SHADOWS the earlier
# APA above — confirm which copy is intended and delete the other.
def APA(att_tree, data, K=10, L=5):
    """Using Partition to anonymize SA (transaction) partition, while applying Anatomizer to separate QID and SA

    Steps:
      1. Run anatomizer() on the raw data for candidate groups.
      2. k-anonymize the transaction (last) attribute with partition().
      3. Merge anatomy groups until each satisfies l-diversity,
         suppressing records only when no merge is possible.
      4. Append (GroupID, 1/|group size|, SA prob dict) to every record.
    """
    # Initialization
    global gl_att_tree, gl_data
    gl_att_tree = att_tree
    gl_data = data
    start_time = time.time()
    result = []
    suppress = []
    # maps generalized transaction (str) -> set of original leaf transactions (str)
    tran_tree = {}
    print "size of dataset %d" % len(gl_data)
    # Begin Anatomy
    print "Begin Anatomy"
    anatomy_index = anatomizer(gl_data, L)
    # Begin Partition
    trans = [t[-1] for t in gl_data]
    # NOTE(review): return value used directly, not unpacked as
    # (trans_set, ncp) — confirm this partition() variant's contract.
    trans_set = partition(att_tree, trans, K)
    for ttemp in trans_set:
        (index_list, tran_value) = ttemp
        parent = list_to_str(tran_value, cmp)
        # EAFP: create the leaf set the first time this parent is seen
        try:
            tran_tree[parent]
        except:
            tran_tree[parent] = set()
        for t in index_list:
            leaf = list_to_str(gl_data[t][-1], cmp)
            tran_tree[parent].add(leaf)
            # replace original transaction with its generalized value in place
            gl_data[t][-1] = tran_value[:]
    # pdb.set_trace()
    # Merge groups to achieve l-diversity
    residue = []
    grouped_index = []
    for group in anatomy_index:
        if check_diversity(group, L):
            grouped_index.append(group[:])
        else:
            residue.append(group[:])
    while len(residue) > 0:
        g = residue.pop()
        for index, group in enumerate(residue):
            if mergeable(g, group, L):
                g = g + group
                grouped_index.append(g)
                residue.pop(index)
                break
        else:
            # no mergeable partner found (for/else):
            # add group element to random group, which already satisfies l-diversity
            if len(grouped_index) > 0:
                seed = random.randrange(len(grouped_index))
                grouped_index[seed] = grouped_index[seed] + g
            else:
                # nothing to merge into: suppress the whole group
                print "Error: group cannot satisfy l-diversity"
                for index in g:
                    suppress.append(gl_data[index])
    if _DEBUG:
        print 'NO. of Suppress after Group Merge = %d' % len(suppress)
        print 'NO. of groups = %d' % len(grouped_index)
    # materialize index groups into record groups
    grouped_result = []
    for indexes in grouped_index:
        gtemp = []
        for index in indexes:
            gtemp.append(gl_data[index])
        grouped_result.append(gtemp)
    print("--- %s seconds ---" % (time.time() - start_time))
    # transform data format (QID1,.., QIDn, SA set, GroupID, 1/|group size|, SA_list (dict) :original SA (str) sets with prob)
    # 1/|group size|, original SA sets with prob (dict) will be used in evaluation
    for index, group in enumerate(grouped_result):
        length = len(group)
        leaf_list = []
        SA_list = {}
        # maps generalized parent (str) -> its generalization range
        parent_list = {}
        for t in group:
            parent = list_to_str(t[-1], cmp)
            gen_range = get_range(att_tree, t[-1])
            leaf_list = leaf_list + list(tran_tree[parent])
            parent_list[parent] = gen_range
        # all transactions covered by this group
        leaf_list = list(set(leaf_list))
        # pdb.set_trace()
        # accumulate per-leaf probability mass within this group
        # NOTE(review): `parent_list[p] / length` uses '/' — under Python 2 this
        # truncates if get_range() returns an int; confirm it yields a float.
        for temp in leaf_list:
            for p in parent_list.keys():
                if temp in tran_tree[p]:
                    try:
                        SA_list[temp] += parent_list[p] / length
                    except:
                        SA_list[temp] = parent_list[p] / length
        # pdb.set_trace()
        for t in group:
            temp = t[:]
            temp.append(index)          # GroupID
            temp.append(1.0 / length)   # 1/|group size|
            temp.append(SA_list)        # shared per-group probability dict
            result.append(temp)
    return result