Example #1
import logging
import random

import toolkit  # project-local helpers: str2num, apply_bin_range, binrange_binary, ...


def cliff_core(data, percentage, obj_as_binary, handled_obj=False):
    """
    data has no header, only containing the record attributes
    :return the cliffed data INDICES(part of the input data)
    """

    ## the data needs at least 50 rows; otherwise everything is kept
    if len(data) < 50:
        logging.debug("not enough data to cliff; returning the whole dataset")
        return range(len(data))

    classes = map(toolkit.str2num, zip(*data)[-1])

    ## Binary objective (e.g. the phishing data): 1, -1
    if not handled_obj:
        if obj_as_binary:
            classes = [1 if i > 0 else -1 for i in classes]
        else:
            classes = toolkit.apply_bin_range(classes)

    data_power = list()  # will be a 2D list (list of lists)

    for col in zip(*data):
        col = map(toolkit.str2num, col)

        ## Convert binary to a range
        E = toolkit.binrange_binary(col)
        data_power.append(power(col, classes, E))  # power() is defined elsewhere in this module

    data_power = map(list, zip(*data_power))  # transposing the data power
    row_sum = [sum(row) for row in data_power]

    index = range(len(data))
    zips = zip(data, classes, row_sum, index)

    output = list()
    for cls in set(classes):
        matched = filter(lambda z: z[1] == cls, zips)
        random.shuffle(matched)  # randomize order so the stable sort breaks ties randomly
        matched = sorted(matched, key=lambda z: z[2], reverse=True)

        if len(matched) < 5:
            output.extend([m[3] for m in matched])  # all saved
            continue

        for i in range(int(len(matched) * percentage)):
            output.append(matched[i][3])
    return sorted(output)
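
A minimal usage sketch for cliff_core (the data is hypothetical, and it assumes the project-local toolkit helpers accept numeric values). Tables with fewer than 50 rows are returned whole, so the sketch builds 60 rows; percentage=0.4 keeps roughly the top 40% most "powerful" rows of each class:

import random

data = [[random.random(), random.random(), random.choice([-1, 1])]
        for _ in range(60)]  # two numeric attributes, class label last

kept = cliff_core(data, percentage=0.4, obj_as_binary=True)  # sorted row indices
pruned = [data[i] for i in kept]
print(len(pruned))  # roughly 40% of the rows survive, per class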
Example #2
import math
import random

import toolkit  # project-local helpers: str2num, apply_bin_range, attr_norm, euclidean_dist


def morph(attribute_names,
          data_matrix,
          independent_attrs,
          objective_attr,
          objective_as_binary=False,
          data_has_normalized=False,
          alpha=0.15,
          beta=0.35):
    """
    morph is a instance mutation which can shake the instance within the class boundary
    :param attribute_names: the names of attributes, should match the data_matrix
    :param data_matrix: original data
    :param independent_attrs: set up the independent attributes in the dataset. Note: 'name', 'id', etc. might not be
        considered as independent attributes
    :param objective_attr: marking which attribute is the objective to be considered
    :param objective_as_binary: signal to set up whether treat the objective as a binary attribute. Default: False
    :param data_has_normalized: telling whether the data matrix has been normalized.
    :param alpha: morph algorithm parameter
    :param beta: morph algorithm parameter
    :return:
    """

    dataset_t = map(list, zip(*data_matrix))
    dataset = list()
    classes = list()
    for d, a in zip(dataset_t, attribute_names):
        if a in independent_attrs:
            dataset.append(d)
        if a == objective_attr:
            classes = list(d)

    dataset = map(list, zip(*dataset))
    dataset = [map(toolkit.str2num, row) for row in dataset]  # str to numeric
    classes = map(toolkit.str2num, classes)

    if objective_as_binary:
        classes = [1 if i > 0 else 0 for i in classes]
    else:
        classes = toolkit.apply_bin_range(classes)

    # remember which columns are ints, for a better representation of the output table
    is_int = [type(i) is int for i in dataset[0]]

    if data_has_normalized:
        # add two instances (all zeros and all ones) so that the normalization and
        # de-normalization process does not damage the original data
        dataset.append([0] * len(dataset[0]))
        dataset.append([1] * len(dataset[0]))
    # dataset transposed mode begins...
    dataset = map(list, zip(*dataset))  # transpose: rows -> attribute columns
    norm_funcs = []
    denorm_funcs = []

    # normalizing each attribute column
    for attr_index, attr_elements in enumerate(dataset):
        f1, f2 = toolkit.attr_norm(attr_elements)
        norm_funcs.append(f1)
        denorm_funcs.append(f2)
        dataset[attr_index] = map(f1, attr_elements)
    # dataset row mode recovered...
    dataset = map(list, zip(*dataset))  # transpose back to rows

    for row_index in range(len(classes)):  # for each real row; the padding rows (if any) stay unshaken
        row = dataset[row_index]
        heterogeneous_index = [
            i for i in range(len(classes)) if classes[i] != classes[row_index]
        ]
        boundary_dist = min(toolkit.euclidean_dist(row, dataset[heg])
                            for heg in heterogeneous_index)
        boundary_dist /= math.sqrt(len(independent_attrs) - 1)
        for i in range(len(row)):
            dataset[row_index][i] += (boundary_dist * random.uniform(alpha, beta)
                                      * random.choice([1, -1]))  # shake
    # dataset transposed mode begins...
    dataset = map(list, zip(*dataset))  # transpose.
    for attr_index, attr_elements in enumerate(dataset):
        # scale each attribute column back to its original range
        dataset[attr_index] = map(denorm_funcs[attr_index], attr_elements)
        for i in range(len(dataset[attr_index])):
            if is_int[attr_index]:
                dataset[attr_index][i] = int(round(dataset[attr_index][i]))  # round back to int where needed
            else:
                dataset[attr_index][i] = round(dataset[attr_index][i], 4)
    morphed = map(list, zip(*dataset))  # recover the original row mode and finish
    # !!morph done!!
    if data_has_normalized:
        morphed = morphed[:-2]  # drop the two padding instances added earlier

    res = list()
    for x, dm in zip(morphed, data_matrix):
        row = list()
        tmp = 0
        for attri, attr in enumerate(attribute_names):
            if attr in independent_attrs:
                row.append(x[tmp])
                tmp += 1
            elif attr == objective_attr:
                row.append(toolkit.str2num(dm[attri]))
            else:
                row.append(dm[attri])
        res.append(row)

    return res
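
A minimal sketch of calling morph on a hypothetical four-row table. 'id' is not listed as independent, so it is passed through untouched, while 'f1' and 'f2' are shaken within their class boundary. Note that morph needs at least two distinct classes present, since each row's boundary distance is the minimum distance to a row of another class:

names = ['id', 'f1', 'f2', 'bug']
matrix = [['a', '1.0', '2.0', '0'],
          ['b', '1.1', '2.2', '0'],
          ['c', '5.0', '6.0', '1'],
          ['d', '5.2', '6.1', '1']]

shaken = morph(names, matrix,
               independent_attrs=['f1', 'f2'],
               objective_attr='bug',
               objective_as_binary=True)
for row in shaken:
    print(row)  # same shape as matrix; 'id' and 'bug' columns unchanged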
Example #3
import copy
import random

import toolkit  # project-local helpers (str2num, attr_norm, normalize_cols_for_table, ...)
import CLIFF    # provides cliff_core (see Example #1)
import LeaF     # provides find_distinct_distance and whether_add_to_private_cache
import MORPH    # provides simplify_morph


def add_to_bin(attribute_names,
               try2add_data_matrix,
               independent_attrs,
               objective_attr,
               objective_as_binary=False,
               cliff_percentage=0.4,
               morph_alpha=0.15,
               morph_beta=0.35,
               passing_bin=None):
    """
    LACE2 data paring engine
    :param attribute_names: the names of attributes, should match the data_matrix
    :param try2add_data_matrix: the data anyone is holding
    :param independent_attrs: set up the independent attributes in the dataset. Note: 'name', 'id', etc. might not be
        considered as independent attributes
    :param objective_attr: marking which attribute is the objective to be considered
    :param objective_as_binary: signal to set up whether treat the objective as a binary attribute. Default: False
    :param cliff_percentage: prune rate
    :param morph_alpha:  parameter 1 in morph, defining the shaking degree
    :param morph_beta: parameter 2 in morph, defining the shaking degree
    :param passing_bin: the data get from someone else. Set None if no passing data
    :return: the new passing_bin.
        NOTE: the result must be assigned to another variable. The parameter pointer will NOT be changed
    """
    if passing_bin is not None:
        assert passing_bin[0] == attribute_names, \
            "the newly added table must share the same header as the existing BIN data"
    else:
        passing_bin = [attribute_names]

    my = list()
    others = list()

    # prepare for the core independent+dependent dataset
    for attr in independent_attrs:
        col = zip(*try2add_data_matrix)[attribute_names.index(attr)]
        col = map(toolkit.str2num, col)
        my.append(col)

        if len(passing_bin) > 1:  # the bin already holds data rows beyond the header
            other_col = zip(*passing_bin[1:])[attribute_names.index(attr)]
            other_col = map(toolkit.str2num, other_col)
            others.append(other_col)

    classes = zip(*try2add_data_matrix)[attribute_names.index(objective_attr)]
    obj = classes[:]  # keep the raw objective values
    other_classes = zip(*passing_bin[1:])[attribute_names.index(objective_attr)] \
        if len(passing_bin) > 1 else []
    classes = map(toolkit.str2num, classes)
    other_classes = map(toolkit.str2num, other_classes)

    if objective_as_binary:
        classes = [1 if i > 0 else 0 for i in classes]
        other_classes = [1 if i > 0 else 0 for i in other_classes]
    else:
        classes = toolkit.apply_bin_range(classes)
        other_classes = toolkit.apply_bin_range(other_classes)

    my.append(classes)
    others.append(other_classes)
    my = map(list, zip(*my))

    protected_line = copy.deepcopy(my[0])  # saving the data formats!
    others = map(list, zip(*others))

    # normalization process
    norm_funcs, denorm_funcs = list(), list()
    for col in map(list, zip(*my + others)):
        f1, f2 = toolkit.attr_norm(col)
        norm_funcs.append(f1)
        denorm_funcs.append(f2)
    cache = list()

    # normalizing my
    uni_my = list()
    my = map(list, zip(*my))
    for funi, col in enumerate(my[:-1]):
        uni_my.append(map(norm_funcs[funi], col))
    uni_my.append(my[-1])
    my = map(list, zip(*uni_my))

    if len(passing_bin) < 2:
        cache = CLIFF.cliff_core(my,
                                 cliff_percentage,
                                 objective_as_binary,
                                 handled_obj=True)
    else:
        # normalizing others
        uni_others = list()
        others = map(list, zip(*others))
        for funi, col in enumerate(others[:-1]):
            uni_others.append(map(norm_funcs[funi], col))
        uni_others.append(others[-1])
        others = map(list, zip(*uni_others))

        to_submits = CLIFF.cliff_core(my,
                                      cliff_percentage,
                                      objective_as_binary,
                                      handled_obj=True)
        bins = others

        fetch_num = min(len(my) + len(others), 100)
        sampled = random.sample(my + others, fetch_num)
        sampled_obj = zip(*sampled)[-1]
        sampled = toolkit.normalize_cols_for_table([row[:-1] for row in sampled])
        sampled = [i + [j] for i, j in zip(sampled, sampled_obj)]

        inter_class_dist = LeaF.find_distinct_distance(sampled)
        for test in to_submits:
            if LeaF.whether_add_to_private_cache(my[test], bins,
                                                 inter_class_dist):
                cache.append(test)
                # bins.append(my[test])

    if len(cache) == 0:
        return passing_bin

    cache_data = [my[i] for i in cache]
    cache_obj = [obj[i] for i in cache]
    cache_data = MORPH.simplify_morph(cache_data + others, morph_alpha,
                                      morph_beta)[:len(cache_data)]

    for at, i in enumerate(cache):
        h = try2add_data_matrix[i]
        new = cache_data[at]
        new = [func(d) for func, d in zip(denorm_funcs, new)]  # de-normalize
        c = cache_obj[at]

        row = list()
        new_c = 0
        for h_c, attr in enumerate(attribute_names):
            if attr == objective_attr:
                row.append(c)
                continue
            if attr in independent_attrs:
                row.append(new[new_c])
                new_c += 1
                continue
            else:
                row.append(h[h_c])
        row = map(str, row)
        passing_bin.append(row)
    return passing_bin
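
A minimal sketch of two hypothetical parties using add_to_bin, assuming the project-local toolkit, CLIFF, LeaF and MORPH modules are importable. The first call seeds the bin; the second prunes, filters and morphs the new rows against it before sharing:

names = ['id', 'f1', 'f2', 'bug']
mine = [['r%d' % i, str(i), str(i * 2), str(i % 2)] for i in range(60)]
theirs = [['s%d' % i, str(i + 1), str(i * 3), str((i + 1) % 2)] for i in range(60)]

bin1 = add_to_bin(names, mine, ['f1', 'f2'], 'bug',
                  objective_as_binary=True, cliff_percentage=0.4)
bin2 = add_to_bin(names, theirs, ['f1', 'f2'], 'bug',
                  objective_as_binary=True, passing_bin=bin1)
print(len(bin2) - 1)  # header excluded: rows the two parties agreed to share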
Example #4
import copy
import random

import toolkit  # project-local helpers (str2num, attr_norm, normalize_cols_for_table, ...)
import CLIFF    # provides cliff_core (see Example #1)
import LeaF     # provides find_distinct_distance and whether_add_to_private_cache
import MORPH    # provides simplify_morph


def add_to_bin(attribute_names,
               try2add_data_matrix,
               independent_attrs,
               objective_attr,
               objective_as_binary=False,
               cliff_percentage=0.4,
               morph_alpha=0.15,
               morph_beta=0.35,
               passing_bin=None):
    """
    LACE2 data paring engine
    :param attribute_names: the names of attributes, should match the data_matrix
    :param try2add_data_matrix: the data anyone is holding
    :param independent_attrs: set up the independent attributes in the dataset. Note: 'name', 'id', etc. might not be
        considered as independent attributes
    :param objective_attr: marking which attribute is the objective to be considered
    :param objective_as_binary: signal to set up whether treat the objective as a binary attribute. Default: False
    :param cliff_percentage: prune rate
    :param morph_alpha:  parameter 1 in morph, defining the shaking degree
    :param morph_beta: parameter 2 in morph, defining the shaking degree
    :param passing_bin: the data get from someone else. Set None if no passing data
    :return: the new passing_bin.
        NOTE: the result must be assigned to another variable. The parameter pointer will NOT be changed
    """
    if passing_bin is not None:
        # assert with a message, not a (condition, message) tuple -- a tuple is always truthy
        assert passing_bin[0] == attribute_names, \
            "the newly added table must share the same header as the existing BIN data"
    else:
        passing_bin = [attribute_names]

    my = list()
    others = list()

    # prepare for the core independent+dependent dataset
    for attr in independent_attrs:
        col = zip(*try2add_data_matrix)[attribute_names.index(attr)]
        my.append(col)
        if len(passing_bin) > 1:  # the bin already holds data rows beyond the header
            other_col = zip(*passing_bin[1:])[attribute_names.index(attr)]
            others.append(other_col)

    classes = zip(*try2add_data_matrix)[attribute_names.index(objective_attr)]
    other_classes = zip(*passing_bin[1:])[attribute_names.index(objective_attr)] \
        if len(passing_bin) > 1 else []
    classes = map(toolkit.str2num, classes)
    other_classes = map(toolkit.str2num, other_classes)

    if objective_as_binary:
        classes = [1 if i > 0 else 0 for i in classes]
        other_classes = [1 if i > 0 else 0 for i in other_classes]
    else:
        classes = toolkit.apply_bin_range(classes)
        other_classes = toolkit.apply_bin_range(other_classes)

    my.append(classes)
    others.append(other_classes)
    a, b = min(classes + other_classes), max(classes + other_classes)  # for rescaling the shared objective back later
    my = map(lambda col: map(toolkit.str2num, col), my)
    my = map(list, zip(*my))

    protected_line = copy.deepcopy(my[0])  # saving the data formats!
    others = map(lambda col: map(toolkit.str2num, col), others)
    others = map(list, zip(*others))

    # get the **important** LEAF distance
    fetch_num = min(len(my) + len(others), 100)
    sampled = random.sample(my + others, fetch_num)
    sampled = toolkit.normalize_cols_for_table([row[:-1] for row in sampled])
    inter_class_dist = LeaF.find_distinct_distance(sampled)

    # normalization process
    norm_funcs, denorm_funcs = list(), list()
    for col in map(list, zip(*my + others)):
        f1, f2 = toolkit.attr_norm(col)
        norm_funcs.append(f1)
        denorm_funcs.append(f2)
    cache = list()

    # normalizing my
    uni_my = list()
    my = map(list, zip(*my))
    for funi, col in enumerate(my[:-1]):
        uni_my.append(map(norm_funcs[funi], col))
    uni_my.append(my[-1])
    my = map(list, zip(*uni_my))

    if len(passing_bin) < 2:
        cache = CLIFF.cliff_core(my,
                                 cliff_percentage,
                                 objective_as_binary,
                                 handled_obj=True)
    else:
        # normalizing others
        uni_others = list()
        others = map(list, zip(*others))
        for funi, col in enumerate(others[:-1]):
            uni_others.append(map(norm_funcs[funi], col))
        uni_others.append(others[-1])
        others = map(list, zip(*uni_others))

        to_submits = CLIFF.cliff_core(my,
                                      cliff_percentage,
                                      objective_as_binary,
                                      handled_obj=True)
        bins = others
        for test in to_submits:
            if LeaF.whether_add_to_private_cache(my[test], bins,
                                                 inter_class_dist):
                cache.append(test)
                bins.append(my[test])

    if len(cache) == 0:  # nothing survived the CLIFF/LeaF filtering (same guard as in Example #3)
        return passing_bin

    cache_data = [my[i] for i in cache]
    cache_data = MORPH.simplify_morph(cache_data, morph_alpha, morph_beta)

    # remove normalization of cache
    cache_t = list()
    for funi, col in enumerate(zip(*cache_data)[:-1]):
        p = map(denorm_funcs[funi], col)
        if type(protected_line[funi]) is not int:
            p = [round(i, 4) for i in p]
        else:
            p = [int(i) for i in p]
        cache_t.append(p)

    # rescale the objective col
    tmp = [toolkit.str2num(row[attribute_names.index(objective_attr)])
           for row in try2add_data_matrix]
    m, M = min(tmp), max(tmp)

    tmp = [m + (i - a) * (M - m) / float(b - a) for i in zip(*cache_data)[-1]]  # float() avoids integer division
    if type(m) is int:
        tmp = map(int, tmp)
    else:
        tmp = map(lambda x: round(x, 4), tmp)
    cache_t.append(tmp)

    cache_data = map(list, zip(*cache_t))

    care_attr_index = dict()  # attribute name -> column index within a cache_data row
    for i, attr in enumerate(independent_attrs + [objective_attr]):
        care_attr_index[attr] = i

    for line_number, toreplace in zip(cache, cache_data):
        row = list()
        for orgi, attr in enumerate(attribute_names):
            if attr in care_attr_index:
                row.append(toreplace[care_attr_index[attr]])
            else:
                row.append(try2add_data_matrix[line_number][orgi])
        passing_bin.append(row)

    # convert every record back to strings
    for recordi in range(len(passing_bin)):
        record = passing_bin[recordi]
        passing_bin[recordi] = map(str, record)

    return passing_bin
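
This variant lends itself to a LACE2-style sharing round, where each party receives the bin from the previous one and adds whatever it is willing to submit. A minimal sketch with three hypothetical parties holding random tables (the data and the fake_matrix helper are made up for illustration):

import random

names = ['id', 'f1', 'f2', 'bug']

def fake_matrix(tag, n=60):  # hypothetical helper producing random string rows
    return [[tag + str(i),
             str(round(random.random(), 3)),
             str(round(random.random(), 3)),
             str(random.choice([0, 1]))] for i in range(n)]

shared = None
for matrix in [fake_matrix('a'), fake_matrix('b'), fake_matrix('c')]:
    shared = add_to_bin(names, matrix, ['f1', 'f2'], 'bug',
                        objective_as_binary=True, passing_bin=shared)
print(len(shared) - 1)  # rows the three parties agreed to share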