Example #1
    def __init__(self, before_db, after_db, independent_attrs):
        self.before_db = before_db
        self.after_db = after_db
        self.__sensitive_attrs = []
        self.attrs = independent_attrs

        # load the before database
        with open(before_db, 'r') as f:
            reader = csv.reader(f)
            before_attrs = next(reader)
            self.before_all_data = []
            for line in reader:
                row = [toolkit.str2num(v) for a, v in zip(before_attrs, line) if a in independent_attrs]
                self.before_all_data.append(row)

        # load the after database
        with open(after_db, 'r') as f:
            reader = csv.reader(f)
            after_attrs = next(reader)
            self.after_all_data = []
            for line in reader:
                row = [toolkit.str2num(v) for a, v in zip(after_attrs, line) if a in independent_attrs]
                self.after_all_data.append(row)

        # discretize the attributes:
        # determine the bin ranges for each independent attribute
        self.bin_ranges = dict()
        for attr, col in zip(independent_attrs, zip(*self.before_all_data)):
            self.bin_ranges[attr] = toolkit.binrange(col)
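
Every example on this page leans on the project's toolkit helpers, which the page itself never shows. A minimal sketch of the two used most, str2num and binrange, under the assumption that binrange returns ascending bin upper bounds ending at the column maximum; the real implementation may bin differently:

def str2num(s):
    """Best-effort conversion of a csv cell to int or float (assumption)."""
    try:
        return int(s)
    except ValueError:
        try:
            return float(s)
        except ValueError:
            return s  # non-numeric cells pass through unchanged

def binrange(col, bins=5):
    """Hypothetical equal-frequency binning: return ascending bin upper
    bounds, the last one being the column maximum."""
    col = sorted(col)
    step = max(len(col) // bins, 1)
    uppers = [col[min(i * step, len(col) - 1)] for i in range(1, bins + 1)]
    return sorted(set(uppers))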
Example #2
    def __init__(self, before_db, after_db):
        self.before_db = before_db
        self.after_db = after_db
        self.__sensitive_attrs = []

        # load the before database
        with open(before_db, 'r') as f:
            reader = csv.reader(f)
            self.before_attrs = next(reader)
            self.before_all_data = []
            for line in reader:
                self.before_all_data.append(line)
            self.before_all_data = [map(toolkit.str2num, row) for row in self.before_all_data]  # str to numeric

        # load the after database
        with open(after_db, 'r') as f:
            reader = csv.reader(f)
            self.after_attrs = next(reader)
            self.after_all_data = []
            for line in reader:
                self.after_all_data.append(line)
            self.after_all_data = [map(toolkit.str2num, row) for row in self.after_all_data]  # str to numeric

        # discretize the attributes:
        # determine the bin ranges for each attribute (the class column is skipped)
        self.bin_ranges = dict()
        for attr in self.after_attrs[:-1]:
            col_index = self.before_attrs.index(attr)
            col = [original_data_row[col_index] for original_data_row in self.before_all_data]
            self.bin_ranges[attr] = toolkit.binrange(col)
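
Neither constructor's class header is captured by this page; the two versions differ mainly in that Example #1 filters columns down to independent_attrs while reading, whereas Example #2 keeps every column and resolves indices afterwards. A hypothetical driver, assuming the __init__ above belongs to a class named DatabasePair (an invented name):

# hypothetical usage: pair a raw table with its processed counterpart
pair = DatabasePair('db4school/raw/ant.csv', 'db4school/train/ant.csv')  # 'ant' is invented
for attr, uppers in pair.bin_ranges.items():
    print attr, uppers  # ascending bin upper bounds per attribute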
Example #3
def data_set_split(model):
    """
    split the data into a testing set and a training (non-testing) set
    NOTE: the dependent variable is handled (discretized) here.
    :param model: name of the model
    """
    # load the original data
    with open('db4school/raw/' + model + '.csv', 'r') as db:
        reader = csv.reader(db)
        head = next(reader)
        all_original_data = []
        for line in reader:
            all_original_data.append(line)

    # discretize the dependent variable
    classes = [i[-1] for i in all_original_data]  # last column in the original csv file
    classes = map(toolkit.str2num, classes)
    if 0 in classes:
        # binarize the classification: 0 stays 0, anything else becomes 1
        classes = [int(bool(int(c))) for c in classes]
    else:
        # otherwise discretize the class values into bin indices
        from toolkit import binrange
        slot = binrange(classes)
        tmp_c = list()
        for c in classes:
            # the bin index is the number of upper bounds this value exceeds
            cursor = 0
            for i in slot:
                if c > i:
                    cursor += 1
                else:
                    break
            tmp_c.append(cursor)
        classes = tmp_c

    for l, c in zip(all_original_data, classes):
        l[-1] = c
    # discretization done

    # split the data body
    random.shuffle(all_original_data)
    split_point = int(len(all_original_data) * (1 - settings.test_set_ratio))

    # write the train set
    with open('db4school/train/' + model + '.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows([head])
        writer.writerows(all_original_data[0:split_point])

    # write the test set
    with open('db4school/test/' + model + '.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows([head])
        writer.writerows(all_original_data[split_point:])
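
A hedged driver for the splitter, assuming settings.test_set_ratio is configured and the db4school/raw, db4school/train and db4school/test directories exist; the model names are invented:

import random
random.seed(47)  # assumption: pin the shuffle so splits are reproducible
for model in ['ant', 'camel']:  # hypothetical model names
    data_set_split(model)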
Example #4
def cliff_core(data, percentage, obj_as_binary, handled_obj=False):
    """
    data has no header, only containing the record attributes
    :return the cliffed data INDICES(part of the input data)
    """

    if len(data) < 50:
        logging.debug("not enough data to cliff; returning the whole dataset")
        return range(len(data))

    # percentage /= 100 if percentage > 1 else 1

    classes = map(toolkit.str2num, zip(*data)[-1])  # last column holds the class labels

    if not handled_obj:
        if obj_as_binary:
            classes = [1 if i > 0 else 0 for i in classes]
        else:
            classes = toolkit.apply_bin_range(classes)

    data_power = list()  # will be 2D list (list of list)

    for col in zip(*data):
        col = map(toolkit.str2num, col)
        E = toolkit.binrange(col)
        data_power.append(power(col, classes, E))

    data_power = map(list, zip(*data_power))  # transposing the data power
    row_sum = [sum(row) for row in data_power]

    index = range(len(data))
    zips = zip(data, classes, row_sum, index)

    output = list()
    for cls in set(classes):
        matched = filter(lambda z: z[1] == cls, zips)
        random.shuffle(matched)
        matched = sorted(matched, key=lambda z: z[2], reverse=True)

        if len(matched) < 5:
            output.extend([m[3] for m in matched])  # all saved
            continue

        for i in range(int(len(matched) * percentage)):
            output.append(matched[i][3])
    return sorted(output)
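
cliff_core depends on a power helper that the page never shows. A rough sketch of the CLIFF-style scoring idea, assuming each value's score reflects how strongly its bin selects for the row's own class; the project's actual formula may differ:

def power(col, classes, bin_uppers):
    """Hypothetical per-row score: P(bin | class)^2 / P(bin)."""
    def bin_of(v):
        for i, upper in enumerate(bin_uppers):
            if v <= upper:
                return i
        return len(bin_uppers) - 1  # guard: values above the last bound

    bins = [bin_of(v) for v in col]
    n = float(len(bins))
    scores = []
    for b, c in zip(bins, classes):
        bin_count = sum(1 for x in bins if x == b)
        both = sum(1 for x, y in zip(bins, classes) if x == b and y == c)
        class_count = sum(1 for y in classes if y == c)
        like = both / float(class_count)  # P(bin | class)
        scores.append(like ** 2 / (bin_count / n))
    return scores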
Example #5
def cliff_core(data):
    """
    data has no header, only containing the record attributes
    :return: the cliffed rows (a subset of the input data)
    """

    if len(data) < 50:
        logging.debug("not enough data to cliff; returning the whole dataset")
        return data

    percentage = settings.CLIFF_percentage
    percentage /= 100.0 if percentage > 1 else 1  # normalize a 0-100 input to a 0-1 fraction (100.0 avoids Py2 integer division)

    classes = map(toolkit.str2num, zip(*data)[-1])  # last column holds the class labels
    data_power = list()  # will be 2D list (list of list)
    for col in zip(*data):
        col = map(toolkit.str2num, col)
        E = toolkit.binrange(col)
        data_power.append(power(col, classes, E))

    data_power = map(list, zip(*data_power))  # transposing the data power
    row_sum = [sum(row) for row in data_power]

    zips = zip(data, classes, row_sum)

    output = list()
    for cls in set(classes):
        matched = filter(lambda z: z[1] == cls, zips)
        random.shuffle(matched)
        matched = sorted(matched, key=lambda z: z[2], reverse=True)

        if len(matched) < 5:
            output.extend([m[0] for m in matched])  # all saved
            continue

        for i in range(int(len(matched)*percentage)):
            output.append(matched[i][0])

    return output
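
Compared with Example #4, this variant reads its percentage from settings and returns the surviving rows themselves rather than their indices. A hypothetical call, assuming toolkit.load_csv(folder, model) returns a (header, rows) pair as it does in apriori_cmpr below:

attrs, rows = toolkit.load_csv('db4school/train', 'ant')  # 'ant' is an invented model name
reduced = cliff_core(rows)
print len(rows), '->', len(reduced)  # e.g. 100 -> 40 when CLIFF_percentage is 40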
Example #6
def apriori_cmpr(model, org_folder, ptz_folder):
    """
    Note:
    ignore the class attribute. just focus on the independent attributes
    :param model:
    :param org_folder:
    :param ptz_folder:
    :return:
    """
    # load the data sets
    org_attrs, org_data = toolkit.load_csv(org_folder, model)
    org_data = map(lambda r: map(toolkit.str2num, r), org_data)

    ptz_attrs, ptz_data = toolkit.load_csv(ptz_folder, model)
    ptz_data = map(lambda r: map(toolkit.str2num, r), ptz_data)
    ptz_data = toolkit.del_col_in_table(ptz_data, -1)

    # delete the columns that are not in settings.record_attrs
    attributes = settings.record_attrs
    org_dataT = map(list, zip(*org_data))
    org_dataT = [col for col, a1 in zip(org_dataT, org_attrs) if a1 in attributes]
    org_data = map(list, zip(*org_dataT))

    # discretize the data
    # translate each continuous attribute into an 'attr:level' tag

    dis_org_data = []
    dis_ptz_data = []
    # ranges_dict = dict()  # for backup

    for attr_name, col1, col2 in zip(attributes, zip(*org_data), zip(*ptz_data)):
        col1 = list(col1)
        col2 = list(col2)

        col = col1 + col2  # NOTE: pool the two data sets so they share bin ranges
        ranges = toolkit.binrange(col)
        # ranges_dict[attr_name] = ranges

        tags = []
        for element in col1:
            for cursor, upper_bound in enumerate(ranges):
                if upper_bound >= element:
                    break
            # lower_bound = ranges[max(cursor-1, 0)]
            # mid = (upper_bound + lower_bound) / 2
            # if type(mid) is float:
            #     mid = round(mid, 2)
            #
            # tags.append(attr_name+':' + str(mid))
            tags.append(attr_name + ':' + str(cursor))
        dis_org_data.append(tags)

        tags = []
        for element in col2:
            for cursor, upper_bound in enumerate(ranges):
                if upper_bound >= element:
                    break
            tags.append(attr_name + ':' + str(cursor))
        dis_ptz_data.append(tags)

    dis_org_data = map(list, zip(*dis_org_data))
    dis_ptz_data = map(list, zip(*dis_ptz_data))

    logging.info("Database discretization done.")

    org_iter = dataset_iter(dis_org_data)
    ptz_iter = dataset_iter(dis_ptz_data)

    items_org, rules_org = runApriori(org_iter, settings.apriori_min_support, settings.apriori_min_confidence)
    items_ptz, rules_ptz = runApriori(ptz_iter, settings.apriori_min_support, settings.apriori_min_confidence)

    return items_org, items_ptz, rules_org, rules_ptz, dis_org_data, dis_ptz_data
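
dataset_iter and runApriori come from an Apriori implementation that is not shown on this page; runApriori is called with the common (transactions, min_support, min_confidence) signature. A minimal sketch of what dataset_iter might look like, assuming the miner expects one frozenset of items per transaction:

def dataset_iter(dataset):
    """Hypothetical adapter: yield each discretized row as a frozenset of
    'attr:level' tags, i.e. one market-basket transaction per record."""
    for row in dataset:
        yield frozenset(row)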