def check_L_diversity(partition):
    """Check if partition satisfies l-diversity.

    Return True if it satisfies, False otherwise.
    """
    sa_dict = {}
    if len(partition) < GL_L:
        return False
    if isinstance(partition, Partition):
        records_set = partition.member
    else:
        records_set = partition
    num_record = len(records_set)
    for record in records_set:
        sa_value = list_to_str(record[-1])
        try:
            sa_dict[sa_value] += 1
        except KeyError:
            sa_dict[sa_value] = 1
    if len(sa_dict) < GL_L:
        return False
    for sa in sa_dict:
        # if any SA value appears in more than |T|/l records,
        # the partition does not satisfy l-diversity
        if sa_dict[sa] > 1.0 * num_record / GL_L:
            return False
    return True

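# Minimal usage sketch (not part of the original module): the toy records
# below are invented; GL_L and list_to_str are the module globals used above.
def _demo_check_L_diversity():
    # four records, SA values split 2/2, so no value exceeds |T|/l = 4/2
    toy = [['30', ['flu']], ['40', ['cold']],
           ['50', ['flu']], ['60', ['cold']]]
    return check_L_diversity(toy)  # True when GL_L == 2
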
def build_SA_bucket(data):
    """Build SA buckets and a heap of buckets sorted by
    the number of records in each bucket.
    """
    buckets = {}
    bucket_heap = []
    # rt data carries SA as a list, relational data as a single value
    is_rt_data = len(data) > 0 and isinstance(data[0][-1], list)
    # assign records to SA buckets
    for record in data:
        if is_rt_data:
            sa_value = list_to_str(record[-1])
        else:
            sa_value = record[-1]
        try:
            buckets[sa_value].append(record)
        except KeyError:
            buckets[sa_value] = [record]
    # randomly shuffle the records in each bucket, so pops are random
    for key in buckets.keys():
        random.shuffle(buckets[key])
    # group stage: each round chooses the l largest buckets, then pops
    # an element from each of them to form a group.
    # We use a heap to keep the buckets sorted.
    for i, bucketed_record in enumerate(buckets.values()):
        # push negated sizes so the largest bucket is on top of the min-heap
        length = len(bucketed_record) * -1
        if length == 0:
            continue
        heapq.heappush(bucket_heap, (length, SABucket(bucketed_record, i)))
    return buckets, bucket_heap

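# Sketch of how (buckets, bucket_heap) is typically consumed in the group
# stage described above. This is an illustration, not original code, and it
# assumes SABucket exposes a pop_element() that removes and returns one
# record -- that method name is an assumption.
def _demo_group_stage(bucket_heap, l):
    groups = []
    while len(bucket_heap) >= l:
        # pop the l largest buckets (sizes are stored negated)
        picked = [heapq.heappop(bucket_heap) for _ in range(l)]
        groups.append([b.pop_element() for (_, b) in picked])
        for (length, b) in picked:
            if length + 1 < 0:
                # bucket still has records; push back with its new size
                heapq.heappush(bucket_heap, (length + 1, b))
    return groups
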
def distribute_data(bucket, buckets, pick_value):
    """Distribute records from the parent bucket to the split buckets
    according to the records' elements.
    """
    if len(buckets) == 0:
        print "Error: buckets is empty!"
        return
    data_index = bucket.member_index[:]
    for temp in data_index:
        gen_list = []
        for t in DATA[temp]:
            treelist = PARENT_LIST[t]
            try:
                pos = treelist.index(pick_value)
                # if covered by pick_value, replace it with the value
                # one level closer to the leaf
                if pos > 0:
                    gen_list.append(treelist[pos - 1])
                else:
                    print "Error: pick node is a leaf, which cannot be split"
            except ValueError:
                continue
        # deduplicate; list_to_str sorts to ensure a canonical key
        gen_list = list(set(gen_list))
        str_value = list_to_str(gen_list)
        try:
            buckets[str_value].member_index.append(temp)
        except KeyError:
            print "Error: cannot find key."
            pdb.set_trace()

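# Hypothetical sketch (not original code) of the PARENT_LIST layout that
# distribute_data relies on: PARENT_LIST[item] is the generalization path
# from the item itself up to the root, so treelist[pos - 1] is one level
# more specific than pick_value.
_EXAMPLE_PARENT_LIST = {
    'apple': ['apple', 'fruit', 'food', '*'],
    'bread': ['bread', 'grain', 'food', '*'],
}
# with pick_value == 'food', 'apple' maps to 'fruit' and 'bread' to 'grain'
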
def NCP(mid):
    """Compute NCP (Normalized Certainty Penalty)
    when generalizing a record to mid.
    """
    ncp = 0.0
    list_key = list_to_str(mid)
    try:
        return NCP_CACHE[list_key]
    except KeyError:
        pass
    # only QI attributes contribute; the SA value (last element,
    # of type []) is excluded
    for i in range(QI_LEN):
        # a single value (one leaf) has zero width, so it adds no penalty
        width = 0.0
        if IS_CAT[i] is False:
            try:
                float(mid[i])
            except ValueError:
                # 'low,high' interval: width is the interval length
                temp = mid[i].split(',')
                width = float(temp[1]) - float(temp[0])
        else:
            # categorical: width is the number of leaves under the node
            width = len(ATT_TREES[i][mid[i]]) * 1.0
        width /= QI_RANGE[i]
        ncp += width
    NCP_CACHE[list_key] = ncp
    return ncp

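# Worked example of the penalty above (illustrative numbers): a numeric QI
# generalized to '30,40' over a full range QI_RANGE[i] = 100 costs
# (40 - 30) / 100 = 0.1; a categorical QI generalized to a subtree covering
# 4 of 20 leaves costs 4 / 20 = 0.2; NCP(mid) is the sum of these
# per-attribute normalized widths.
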
def check_L_diversity(partition):
    """Check if partition satisfies distinct l-diversity.

    Return True if it satisfies, False otherwise.
    """
    sensitive_attribute_dict = {}
    if len(partition) < GLOBAL_L_VALUE:
        return False
    if isinstance(partition, Partition):
        records_set = partition.member
    else:
        records_set = partition
    for record in records_set:
        try:
            sensitive_attribute_value = list_to_str(record[-1])
        except AttributeError:
            sensitive_attribute_value = record[-1]
        try:
            sensitive_attribute_dict[sensitive_attribute_value] += 1
        except KeyError:
            sensitive_attribute_dict[sensitive_attribute_value] = 1
    # distinct l-diversity: at least l different SA values in the partition
    return len(sensitive_attribute_dict) >= GLOBAL_L_VALUE

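# Contrast with the frequency-bounded variant above (toy counts): with l = 2
# and SA counts {flu: 3, cold: 1}, this distinct check passes (2 distinct
# values), while the first check_L_diversity rejects the partition because
# 3 > |T| / l = 4 / 2.
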
def balance_partitions(parent_bucket, buckets, K, pick_value):
    """Handle buckets with fewer than K records.
    """
    global RESULT
    left_over = []
    for k, t in buckets.items():
        if len(t.member_index) < K:
            # move the records of buckets with fewer than K elements
            # to the left_over partition
            left_over.extend(t.member_index[:])
            del buckets[k]
    if len(left_over) == 0:
        # the left_over partition is empty, skip the balance step
        return
    # re-distribute the transactions with the least information gain from
    # buckets larger than K to left_over, to ensure that the number of
    # records in left_over reaches K.
    # flag denotes whether the re-distribution succeeded.
    flag = True
    while len(left_over) < K:
        # each iteration picks the least-information-gain transaction
        # from the buckets larger than K
        check_list = [t for t in buckets.values() if len(t.member_index) > K]
        if len(check_list) == 0:
            flag = False
            break
        min_ig = float('inf')
        min_key = (0, 0)
        for i, temp in enumerate(check_list):
            for j, t in enumerate(temp.member_index):
                ig = trans_information_gain(DATA[t], pick_value)
                if ig < min_ig:
                    min_ig = ig
                    min_key = (i, j)
        left_over.append(check_list[min_key[0]].member_index[min_key[1]])
        del check_list[min_key[0]].member_index[min_key[1]]
    if flag is not True:
        # if flag is False, the re-distribution was unsuccessful, so pop the
        # minimum-information-gain bucket and merge it with left_over.
        # The popped bucket has more than K records, so left_over will
        # end up larger than K.
        parent_bucket.splitable = False
        try:
            min_ig = float('inf')
            min_key = ''
            for k, t in buckets.items():
                ig = information_gain(t, pick_value)
                if ig < min_ig:
                    min_ig = ig
                    min_key = k
            left_over.extend(buckets[min_key].member_index[:])
            del buckets[min_key]
        except KeyError:
            print "Error: buckets is empty"
            pdb.set_trace()
    parent_bucket.member_index = left_over[:]
    str_value = list_to_str(parent_bucket.value)
    buckets[str_value] = parent_bucket

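# Illustrative walk-through (invented sizes): with K = 3 and buckets of
# sizes {A: 1, B: 5, C: 2}, A and C are dissolved into left_over (3 records,
# already >= K), so the while loop never runs; left_over then becomes the
# parent bucket and is re-inserted under the parent's generalized value.
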
def check_diversity(data):
    """Count the distinct SA values in the dataset.
    """
    sa_dict = {}
    for record in data:
        try:
            sa_value = list_to_str(record[-1])
        except AttributeError:
            sa_value = record[-1]
        try:
            sa_dict[sa_value] += 1
        except KeyError:
            sa_dict[sa_value] = 1
    return len(sa_dict)

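# Quick sketch (toy data invented for illustration):
#   check_diversity([['30', ['flu']], ['40', ['cold']], ['50', ['flu']]])
# returns 2, since the distinct SA values are 'flu' and 'cold'.
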
def get_tran_range(att_tree, tran):
    """Return the {leaf: probability} coverage of a transaction,
    cached per stringified transaction.
    """
    temp = list_to_str(tran)
    try:
        return COVER_DICT[-1][temp]
    except KeyError:
        pass
    cover_dict = dict()
    for item in tran:
        prob = 1.0
        leaf_num = len(att_tree[item])
        if leaf_num > 0:
            # an internal node spreads its probability
            # uniformly over its leaves
            prob = prob / leaf_num
            for t in att_tree[item].leaf.keys():
                cover_dict[t] = prob
        else:
            # a leaf item covers itself with probability 1
            cover_dict[item] = prob
    COVER_DICT[-1][temp] = cover_dict
    return cover_dict

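# Example (hypothetical tree): if 'fruit' has the leaves {apple, pear, plum,
# peach}, then get_tran_range(tree, ['fruit', 'bread']) yields
# {'apple': 0.25, 'pear': 0.25, 'plum': 0.25, 'peach': 0.25, 'bread': 1.0},
# i.e. a generalized item counts as a uniform distribution over its leaves.
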
def pick_node(bucket):
    """Find the split node with the largest information gain,
    then split bucket into sub-buckets according to this node.
    """
    buckets = {}
    result_list = []
    max_ig = -float('inf')
    max_value = ''
    check_list = [t for t in bucket.value if t not in bucket.split_list]
    for t in check_list:
        if len(ATT_TREES[t].child) != 0:
            ig = information_gain(bucket, t)
            if ig > max_ig:
                max_ig = ig
                max_value = t
    # begin to expand the node on pick_value
    if max_value == '':
        print "Error: no splittable node left!"
        return ('', {})
    # get the index of max_value
    index = bucket.value.index(max_value)
    child_value = [t.value for t in ATT_TREES[max_value].child]
    # enumerate all non-empty subsets of the children of max_value
    for i in range(1, len(child_value) + 1):
        temp = combinations(child_value, i)
        result_list.extend([list(t) for t in temp])
    # generate the child buckets
    child_level = bucket.level[:]
    child_value = bucket.value[:]
    now_level = bucket.level[index] + 1
    del child_level[index]
    del child_value[index]
    for temp in result_list:
        temp_level = child_level[:]
        temp_value = child_value[:]
        for t in temp:
            temp_level.insert(index, now_level)
            temp_value.insert(index, t)
        str_value = list_to_str(temp)
        buckets[str_value] = Bucket([], temp_value, temp_level)
    bucket.split_list.append(max_value)
    return (max_value, buckets)

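# Example of the subset expansion above: if max_value has the children
# ['a', 'b', 'c'], result_list enumerates all 2**3 - 1 = 7 non-empty subsets
# ['a'], ['b'], ['c'], ['a', 'b'], ['a', 'c'], ['b', 'c'], ['a', 'b', 'c'],
# and one child bucket is created per subset.
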
def average_relative_error_1m(att_trees, data, result, qd=DEFAULT_QD, s=DEFAULT_S):
    """Return the average relative error (ARE) of the anonymized microdata.
    qd denotes the query dimensionality, s the selectivity of the query.
    """
    if _DEBUG:
        print "qd=%d s=%d" % (qd, s)
        print "size of raw data %d" % len(data)
        print "size of result data %d" % len(result)
    gen_data = get_result_cover_1m(att_trees, result)
    are = 0.0
    len_att = len(att_trees)
    blist = []
    att_roots = [t['*'] for t in att_trees]
    att_cover = [t.cover.keys() for t in att_roots]
    SA_set = {}
    # remove duplicate SA sets, keyed by their str values
    for temp in data:
        str_temp = list_to_str(temp[-1])
        if str_temp not in SA_set:
            SA_set[str_temp] = temp[-1]
    att_cover[-1] = SA_set.values()
    seed = math.pow(s * 1.0 / 100, 1.0 / (qd + 1))
    # transform the generalized result to coverage and compute b
    for i in range(len_att):
        blist.append(int(math.ceil(len(att_roots[i]) * seed)))
    if _DEBUG:
        print "b %s" % blist
    # QUERY_TIME is normally 1000, but 1000 queries take more than 10h,
    # so we limit the number of queries (e.g. to 100)
    zeroare, turn = 0, 0
    for turn in range(1, QUERY_TIME + 1):
        value_select = []
        # select qd QI attributes at random
        att_select = random.sample(range(0, len_att - 1), qd)
        # append the SA, so len(att_select) == qd + 1
        att_select.append(len_att - 1)
        if _DEBUG:
            print "ARE %d" % turn
            print "Att select %s" % att_select
        for i in range(qd + 1):
            index = att_select[i]
            value_select.append(random.sample(att_cover[index], blist[index]))
        act = count_query_1m(data, att_select, value_select)
        if act != 0:
            est = est_query_1m(gen_data, att_select, value_select)
            are += abs(act - est) * 1.0 / act
        else:
            zeroare += 1
        if turn - zeroare == FAST_BREAK:
            break
    if _DEBUG:
        print "Times = %d when query on microdata is zero" % zeroare
    if turn == zeroare:
        print "Error: all act == 0"
        return 0
    return are / (turn - zeroare)

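# How b is derived (restating the arithmetic above): a query over qd + 1
# attributes with overall selectivity s% selects a fraction
# seed = (s / 100.0) ** (1.0 / (qd + 1)) of each attribute's domain; e.g.
# s = 5, qd = 3 gives seed = 0.05 ** 0.25 ~ 0.47, and
# b_i = ceil(|domain_i| * seed). ARE itself averages |act - est| / act over
# the queries with act != 0.
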
def average_relative_error(att_trees, data, result, qd=DEFAULT_QD, s=DEFAULT_S):
    """Return the average relative error (ARE) of the anonymized microdata.
    qd denotes the query dimensionality, s the selectivity of the query.
    """
    if _DEBUG:
        print "qd=%d s=%d" % (qd, s)
        print "size of raw data %d" % len(data)
        print "size of result data %d" % len(result)
    gen_data = get_result_cover(att_trees, result)
    are = 0.0
    len_att = len(att_trees)
    blist = []
    att_roots = [t['*'] for t in att_trees]
    att_cover = [t.cover.keys() for t in att_roots]
    SA_set = {}
    # remove duplicate SA sets, keyed by their str values
    for temp in data:
        str_temp = list_to_str(temp[-1])
        if str_temp not in SA_set:
            SA_set[str_temp] = temp[-1]
    att_cover[-1] = SA_set.values()
    seed = math.pow(s * 1.0 / 100, 1.0 / (qd + 1))
    # transform the generalized result to coverage and compute b
    for i in range(len_att):
        blist.append(int(math.ceil(len(att_roots[i]) * seed)))
    if _DEBUG:
        print "b %s" % blist
    # QUERY_TIME is normally 1000, but 1000 queries take more than 10h,
    # so we limit the number of queries (e.g. to 100)
    zeroare, turn = 0, 0
    for turn in range(1, QUERY_TIME + 1):
        value_select = []
        # select qd QI attributes at random
        att_select = random.sample(range(0, len_att - 1), qd)
        # append the SA, so len(att_select) == qd + 1
        att_select.append(len_att - 1)
        if _DEBUG:
            print "ARE %d" % turn
            print "Att select %s" % att_select
        for i in range(qd + 1):
            index = att_select[i]
            value_select.append(random.sample(att_cover[index], blist[index]))
        act = count_query(data, att_select, value_select)
        if act != 0:
            est = est_query(gen_data, att_select, value_select)
            are += abs(act - est) * 1.0 / act
        else:
            zeroare += 1
        if turn - zeroare == FAST_BREAK:
            break
    if _DEBUG:
        print "Times = %d when query on microdata is zero" % zeroare
    if turn == zeroare:
        print "Error: all act == 0"
        return 0
    return are / (turn - zeroare)

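# Sketch (not in the original source): average_relative_error and
# average_relative_error_1m differ only in the helper functions they call
# (get_result_cover/count_query/est_query vs. their _1m variants), so a
# caller could dispatch once, e.g.:
def evaluate_are(att_trees, data, result, one_m=False, qd=DEFAULT_QD, s=DEFAULT_S):
    # select the matching implementation defined above
    if one_m:
        return average_relative_error_1m(att_trees, data, result, qd, s)
    return average_relative_error(att_trees, data, result, qd, s)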