Example #1
def check_L_diversity(partition):
    """Check whether the partition satisfies l-diversity.
    Return True if it does, False otherwise.
    """
    sa_dict = {}
    if len(partition) < GL_L:
        return False
    if isinstance(partition, Partition):
        records_set = partition.member
    else:
        records_set = partition
    num_record = len(records_set)
    for record in records_set:
        sa_value = list_to_str(record[-1])
        try:
            sa_dict[sa_value] += 1
        except KeyError:
            sa_dict[sa_value] = 1
    if len(sa_dict) < GL_L:
        return False
    for sa in sa_dict:
        # if any SA value appears in more than |T|/l records,
        # the partition does not satisfy l-diversity
        if sa_dict[sa] > 1.0 * num_record / GL_L:
            return False
    return True
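A minimal usage sketch of the frequency check above; `GL_L`, `Partition`, and `list_to_str` below are hypothetical stand-ins for the module globals this function expects:

# hypothetical stand-ins for the globals used by check_L_diversity
GL_L = 2

class Partition(object):
    def __init__(self, member):
        self.member = member

def list_to_str(value, sep=';'):
    # canonical string key for a (possibly list-valued) SA
    if isinstance(value, list):
        return sep.join(sorted(value))
    return str(value)

# four records; the last element is the sensitive attribute (SA)
records = [['34', 'M', 'flu'],
           ['36', 'M', 'cancer'],
           ['29', 'F', 'flu'],
           ['41', 'F', 'cancer']]
# 2 distinct SA values, each appearing 2 <= 4/2 times, so this passes
print(check_L_diversity(records))  # True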
Example #2
def build_SA_bucket(data):
    """
    Build SA buckets and a heap of buckets sorted by record count.
    """
    buckets = {}
    bucket_heap = []
    # the data is transactional if the SA position holds a list
    is_transaction = isinstance(data[0][-1], list)
    # assign each record to the bucket of its SA value
    for record in data:
        if is_transaction:
            sa_value = list_to_str(record[-1])
        else:
            # relational data
            sa_value = record[-1]
        try:
            buckets[sa_value].append(record)
        except KeyError:
            buckets[sa_value] = [record]
    # shuffle the records within each bucket so pop() is random
    for key in buckets:
        random.shuffle(buckets[key])
    # group stage: each round picks the l largest buckets and pops one
    # element from each to form a group; a heap keeps the buckets sorted
    for i, bucketed_record in enumerate(buckets.values()):
        # push the negated length so the min-heap acts as a max-heap
        length = len(bucketed_record) * -1
        if length == 0:
            continue
        heapq.heappush(bucket_heap, (length, SABucket(bucketed_record, i)))
    return buckets, bucket_heap
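Negating the length turns Python's min-heap into a max-heap over bucket sizes. A self-contained sketch of the grouping round this enables, with l = 2 and the SA value added as a tie-breaker:

import heapq
import random

# toy buckets: SA value -> record ids
buckets = {'flu': [1, 2, 3], 'cancer': [4, 5], 'asthma': [6]}
heap = []
for sa, records in buckets.items():
    random.shuffle(records)  # make pop random, as above
    heapq.heappush(heap, (-len(records), sa, records))

# one grouping round with l = 2: pop the two largest buckets, take one
# record from each, then push the shrunken buckets back
group = []
for _ in range(2):
    neg_len, sa, records = heapq.heappop(heap)
    group.append(records.pop())
    if records:
        heapq.heappush(heap, (-len(records), sa, records))
print(group)  # one record from 'flu' and one from 'cancer'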
def distribute_data(bucket, buckets, pick_value):
    """Distribute records from the parent bucket to the split buckets
    according to the records' elements.
    """
    if len(buckets) == 0:
        print("Error: buckets is empty!")
        return
    data_index = bucket.member_index[:]
    for temp in data_index:
        gen_list = []
        for t in DATA[temp]:
            treelist = PARENT_LIST[t]
            try:
                pos = treelist.index(pick_value)
            except ValueError:
                # pick_value does not cover this item
                continue
            if pos > 0:
                # covered: replace with the one-level-more-specific value
                gen_list.append(treelist[pos - 1])
            else:
                print("Error: pick node is a leaf, which cannot be split")
        gen_list = list(set(gen_list))
        # list_to_str sorts, ensuring a canonical key order
        str_value = list_to_str(gen_list)
        try:
            buckets[str_value].member_index.append(temp)
        except KeyError:
            print("Error: cannot find key.")
            pdb.set_trace()
def NCP(mid):
    """Compute the NCP (Normalized Certainty Penalty) of generalizing
    a record to mid.
    """
    ncp = 0.0
    list_key = list_to_str(mid)
    try:
        return NCP_CACHE[list_key]
    except KeyError:
        pass
    # iterate over the QI attributes only, excluding the SA value
    # (the last element, of list type)
    for i in range(QI_LEN):
        width = 0.0
        if not IS_CAT[i]:
            # numeric attribute: a single value has width 0,
            # an interval 'low,high' has width high - low
            try:
                float(mid[i])
            except ValueError:
                temp = mid[i].split(',')
                width = float(temp[1]) - float(temp[0])
        else:
            # categorical attribute: width is the number of leaves covered
            width = len(ATT_TREES[i][mid[i]]) * 1.0
        width /= QI_RANGE[i]
        ncp += width
    NCP_CACHE[list_key] = ncp
    return ncp
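A worked sketch of the per-attribute width normalization, with hypothetical values: one numeric QI generalized to an interval over a range of 60, and one categorical QI whose node covers 3 of 12 leaves:

# numeric QI: generalized to the interval '25,35', attribute range 60
low, high = (float(v) for v in '25,35'.split(','))
numeric_width = (high - low) / 60.0      # 10 / 60 ~ 0.167

# categorical QI: the generalized node covers 3 of the 12 leaves
categorical_width = 3 / 12.0             # 0.25

ncp = numeric_width + categorical_width  # per-record penalty
print(round(ncp, 3))                     # 0.417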
def check_L_diversity(partition):
    """Check whether the partition satisfies distinct l-diversity.
    Unlike the frequency-based variant above, this version only requires
    at least l distinct SA values. Return True if it does, False otherwise.
    """
    sensitive_attribute_dict = {}
    if len(partition) < GLOBAL_L_VALUE:
        return False
    if isinstance(partition, Partition):
        records_set = partition.member
    else:
        records_set = partition
    for record in records_set:
        try:
            sensitive_attribute_value = list_to_str(record[-1])
        except AttributeError:
            sensitive_attribute_value = record[-1]
        try:
            sensitive_attribute_dict[sensitive_attribute_value] += 1
        except KeyError:
            sensitive_attribute_dict[sensitive_attribute_value] = 1
    return len(sensitive_attribute_dict) >= GLOBAL_L_VALUE
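The two `check_L_diversity` variants can disagree: the distinct version only counts values, while the frequency version also bounds how often each value appears. A toy case with l = 2:

# 4 records with SA values: 'flu' three times, 'cancer' once
sa_values = ['flu', 'flu', 'flu', 'cancer']
l = 2

distinct_ok = len(set(sa_values)) >= l
# frequency rule: no SA value may appear in more than |T|/l records
freq_ok = max(sa_values.count(v) for v in set(sa_values)) <= len(sa_values) / l
print(distinct_ok, freq_ok)  # True False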
def balance_partitions(parent_bucket, buckets, K, pick_value):
    """Handle buckets with fewer than K records.
    """
    global RESULT
    left_over = []
    for k, t in list(buckets.items()):
        if len(t.member_index) < K:
            # move records of buckets with fewer than K elements
            # to the left_over partition
            left_over.extend(t.member_index[:])
            del buckets[k]
    if len(left_over) == 0:
        # the left_over bucket is empty, skip the balance step
        return
    # re-distribute the transactions with the least information gain from
    # buckets larger than K to left_over, to ensure that the number of
    # records in left_over reaches K;
    # flag denotes whether the re-distribution succeeded
    flag = True
    while len(left_over) < K:
        # each iteration moves the least-information-gain transaction
        # from a bucket larger than K
        check_list = [t for t in buckets.values() if len(t.member_index) > K]
        if len(check_list) == 0:
            flag = False
            break
        min_ig = float('inf')
        min_key = (0, 0)
        for i, temp in enumerate(check_list):
            for j, t in enumerate(temp.member_index):
                ig = trans_information_gain(DATA[t], pick_value)
                if ig < min_ig:
                    min_ig = ig
                    min_key = (i, j)
        left_over.append(check_list[min_key[0]].member_index[min_key[1]])
        del check_list[min_key[0]].member_index[min_key[1]]
    if not flag:
        # the split failed, so pop a bucket from buckets and merge it with
        # left_over; the popped bucket is larger than K, so left_over ends
        # up larger than K
        parent_bucket.splitable = False
        try:
            min_ig = float('inf')
            min_key = ''
            for k, t in buckets.items():
                ig = information_gain(t, pick_value)
                if ig < min_ig:
                    min_ig = ig
                    min_key = k
            left_over.extend(buckets[min_key].member_index[:])
            del buckets[min_key]
        except KeyError:
            print("Error: buckets is empty")
            pdb.set_trace()
    parent_bucket.member_index = left_over[:]
    str_value = list_to_str(parent_bucket.value)
    buckets[str_value] = parent_bucket
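The nested search above finds the single cheapest record to move; a self-contained sketch of the same pattern, with a hypothetical `ig()` standing in for `trans_information_gain()`:

# toy buckets: key -> record ids; ig() is a hypothetical cost function
def ig(record_id):
    return record_id % 5

buckets = {'a': [11, 7, 3], 'b': [10, 4]}
# flatten to (cost, key, position) and pick the cheapest record
cost, key, pos = min((ig(r), k, i)
                     for k, recs in buckets.items()
                     for i, r in enumerate(recs))
left_over = [buckets[key].pop(pos)]
print(left_over)  # [10]: the record with the least information gain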
def check_diversity(data):
    """
    Count the distinct SA values in the dataset.
    """
    sa_dict = {}
    for record in data:
        try:
            sa_value = list_to_str(record[-1])
        except AttributeError:
            sa_value = record[-1]
        try:
            sa_dict[sa_value] += 1
        except KeyError:
            sa_dict[sa_value] = 1
    return len(sa_dict)
def check_diversity(data):
    """
    Count the distinct SA values in the dataset.
    """
    sensitive_attribute_dict = {}
    for record in data:
        try:
            sensitive_attribute_value = list_to_str(record[-1])
        except AttributeError:
            sensitive_attribute_value = record[-1]
        try:
            sensitive_attribute_dict[sensitive_attribute_value] += 1
        except KeyError:
            sensitive_attribute_dict[sensitive_attribute_value] = 1
    return len(sensitive_attribute_dict)
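Both variants just count distinct keys; an equivalent sketch using `collections.Counter`, assuming the same `list_to_str` helper:

from collections import Counter

def check_diversity_counter(data):
    # one pass: build the SA key for each record, then count distinct keys
    keys = (list_to_str(r[-1]) if isinstance(r[-1], list) else r[-1]
            for r in data)
    return len(Counter(keys))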
def get_tran_range(att_tree, tran):
    """Map each leaf value covered by the transaction to the probability
    that the generalized item represents it; results are memoized.
    """
    temp = list_to_str(tran)
    try:
        return COVER_DICT[temp]
    except KeyError:
        pass
    cover_dict = dict()
    for item in tran:
        prob = 1.0
        leaf_num = len(att_tree[item])
        if leaf_num > 0:
            # an internal node covers its leaf_num leaves uniformly
            prob = prob / leaf_num
            for t in att_tree[item].leaf.keys():
                cover_dict[t] = prob
        else:
            # the item is already a leaf
            cover_dict[item] = prob
    # cache under the same key used for the lookup above
    COVER_DICT[temp] = cover_dict
    return cover_dict
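A self-contained sketch of the coverage computation, with a hypothetical `Node` class standing in for the attribute-tree nodes (`len(node)` is the leaf count, `node.leaf` the leaf dict):

COVER_DICT = {}  # memoization cache expected by get_tran_range

def list_to_str(value, sep=';'):
    return sep.join(sorted(value))

class Node(object):
    # hypothetical generalization-hierarchy node
    def __init__(self, leaf):
        self.leaf = leaf  # leaf name -> node; empty dict for a leaf
    def __len__(self):
        return len(self.leaf)

att_tree = {
    'fruit': Node({'apple': {}, 'pear': {}}),  # internal node, 2 leaves
    'salt': Node({}),                          # leaf node
}
# ['fruit', 'salt'] covers each fruit leaf with probability 1/2
# and the salt leaf with probability 1
print(get_tran_range(att_tree, ['fruit', 'salt']))
# {'apple': 0.5, 'pear': 0.5, 'salt': 1.0}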
def pick_node(bucket):
    """Find the split node with the largest information gain,
    then split the bucket into child buckets according to that node.
    """
    buckets = {}
    result_list = []
    max_ig = float('-inf')
    max_value = ""
    check_list = [t for t in bucket.value if t not in bucket.split_list]
    for t in check_list:
        if len(ATT_TREES[t].child) != 0:
            ig = information_gain(bucket, t)
            if ig > max_ig:
                max_ig = ig
                max_value = t
    # begin to expand the node on pick_value
    if max_value == "":
        print("Error: list empty!!")
        return ("", {})
    # get the index of max_value
    index = bucket.value.index(max_value)
    child_value = [t.value for t in ATT_TREES[max_value].child]
    # enumerate every non-empty subset of the children
    for i in range(1, len(child_value) + 1):
        temp = combinations(child_value, i)
        temp = [list(t) for t in temp]
        result_list.extend(temp)
    # generate the child buckets
    child_level = bucket.level[:]
    child_value = bucket.value[:]
    now_level = bucket.level[index] + 1
    del child_level[index]
    del child_value[index]
    for temp in result_list:
        temp_level = child_level[:]
        temp_value = child_value[:]
        for t in temp:
            temp_level.insert(index, now_level)
            temp_value.insert(index, t)
        str_value = list_to_str(temp)
        buckets[str_value] = Bucket([], temp_value, temp_level)
    bucket.split_list.append(max_value)
    return (max_value, buckets)
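The loop above creates one candidate child bucket per non-empty subset of the picked node's children; a standalone look at what `itertools.combinations` yields for three children:

from itertools import combinations

child_value = ['A', 'B', 'C']
result_list = []
for i in range(1, len(child_value) + 1):
    result_list.extend(list(t) for t in combinations(child_value, i))
print(result_list)
# [['A'], ['B'], ['C'], ['A', 'B'], ['A', 'C'], ['B', 'C'], ['A', 'B', 'C']]
# 2**3 - 1 = 7 candidate buckets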
Example #16
def average_relative_error_1m(att_trees,
                              data,
                              result,
                              qd=DEFAULT_QD,
                              s=DEFAULT_S):
    """Return the average relative error of the anonymized microdata;
    qd denotes the query dimensionality, s the selectivity of the query.
    """
    if _DEBUG:
        print("qd=%d s=%d" % (qd, s))
        print("size of raw data %d" % len(data))
        print("size of result data %d" % len(result))
    gen_data = get_result_cover_1m(att_trees, result)
    are = 0.0
    len_att = len(att_trees)
    blist = []
    att_roots = [t['*'] for t in att_trees]
    att_cover = [list(t.cover.keys()) for t in att_roots]
    SA_set = {}
    # remove duplicate SA sets, keyed by their string form
    for temp in data:
        str_temp = list_to_str(temp[-1])
        try:
            SA_set[str_temp]
        except KeyError:
            SA_set[str_temp] = temp[-1]
    att_cover[-1] = list(SA_set.values())
    seed = math.pow(s * 1.0 / 100, 1.0 / (qd + 1))
    # transform the generalized result to coverage and compute b
    for i in range(len_att):
        blist.append(int(math.ceil(len(att_roots[i]) * seed)))
    if _DEBUG:
        print("b %s" % blist)
    # the query count is normally 1000, but 1000 queries take more
    # than 10 hours, so we limit the count to 100
    zeroare, turn = 0, 0
    for turn in range(1, QUERY_TIME + 1):
        value_select = []
        # select the QI attributes
        att_select = random.sample(range(0, len_att - 1), qd)
        # append the SA, so len(att_select) == qd + 1
        att_select.append(len_att - 1)
        if _DEBUG:
            print("ARE %d" % turn)
            print("Att select %s" % att_select)
        for i in range(qd + 1):
            index = att_select[i]
            temp = random.sample(att_cover[index], blist[index])
            value_select.append(temp)
        act = count_query_1m(data, att_select, value_select)
        if act != 0:
            est = est_query_1m(gen_data, att_select, value_select)
            are += abs(act - est) * 1.0 / act
        else:
            zeroare += 1
        if turn - zeroare == FAST_BREAK:
            break
    if _DEBUG:
        print("Times = %d when query on microdata is zero" % zeroare)
    if turn == zeroare:
        print("Error: all act == 0")
        return 0
    return are / (turn - zeroare)
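The selectivity math above spreads the target selectivity s over the qd + 1 queried attributes; a quick worked example of the seed and b computation with hypothetical domain sizes:

import math

s, qd = 5, 2                # 5% selectivity over qd + 1 = 3 attributes
seed = math.pow(s / 100.0, 1.0 / (qd + 1))
print(round(seed, 3))       # 0.368: per-attribute coverage fraction

# each attribute i selects ceil(len(att_roots[i]) * seed) values, so the
# product of the per-attribute fractions approximates s / 100
domain_sizes = [30, 12, 8]  # hypothetical attribute domain sizes
blist = [int(math.ceil(n * seed)) for n in domain_sizes]
print(blist)                # [12, 5, 3]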
def average_relative_error(att_trees, data, result, qd=DEFAULT_QD, s=DEFAULT_S):
    """Return the average relative error of the anonymized microdata;
    qd denotes the query dimensionality, s the selectivity of the query.
    """
    if _DEBUG:
        print("qd=%d s=%d" % (qd, s))
        print("size of raw data %d" % len(data))
        print("size of result data %d" % len(result))
    gen_data = get_result_cover(att_trees, result)
    are = 0.0
    len_att = len(att_trees)
    blist = []
    att_roots = [t['*'] for t in att_trees]
    att_cover = [list(t.cover.keys()) for t in att_roots]
    SA_set = {}
    # remove duplicate SA sets, keyed by their string form
    for temp in data:
        str_temp = list_to_str(temp[-1])
        try:
            SA_set[str_temp]
        except KeyError:
            SA_set[str_temp] = temp[-1]
    att_cover[-1] = list(SA_set.values())
    seed = math.pow(s * 1.0 / 100, 1.0 / (qd + 1))
    # transform the generalized result to coverage and compute b
    for i in range(len_att):
        blist.append(int(math.ceil(len(att_roots[i]) * seed)))
    if _DEBUG:
        print("b %s" % blist)
    # the query count is normally 1000, but 1000 queries take more
    # than 10 hours, so we limit the count to 100
    zeroare, turn = 0, 0
    for turn in range(1, QUERY_TIME + 1):
        value_select = []
        # select the QI attributes
        att_select = random.sample(range(0, len_att - 1), qd)
        # append the SA, so len(att_select) == qd + 1
        att_select.append(len_att - 1)
        if _DEBUG:
            print("ARE %d" % turn)
            print("Att select %s" % att_select)
        for i in range(qd + 1):
            index = att_select[i]
            temp = random.sample(att_cover[index], blist[index])
            value_select.append(temp)
        act = count_query(data, att_select, value_select)
        if act != 0:
            est = est_query(gen_data, att_select, value_select)
            are += abs(act - est) * 1.0 / act
        else:
            zeroare += 1
        if turn - zeroare == FAST_BREAK:
            break
    if _DEBUG:
        print("Times = %d when query on microdata is zero" % zeroare)
    if turn == zeroare:
        print("Error: all act == 0")
        return 0
    return are / (turn - zeroare)