def __init__(self, before_db, after_db, independent_attrs):
    self.before_db = before_db
    self.after_db = after_db
    self.__sensitive_attrs = []
    self.attrs = independent_attrs

    # load the before database
    with open(before_db, 'r') as f:
        reader = csv.reader(f)
        before_attrs = next(reader)
        self.before_all_data = []
        for line in reader:
            row = [toolkit.str2num(v) for a, v in zip(before_attrs, line) if a in independent_attrs]
            self.before_all_data.append(row)

    # load the after database
    with open(after_db, 'r') as f:
        reader = csv.reader(f)
        after_attrs = next(reader)
        self.after_all_data = []
        for line in reader:
            row = [toolkit.str2num(v) for a, v in zip(after_attrs, line) if a in independent_attrs]
            self.after_all_data.append(row)

    # discretize the attributes: determine the bin_ranges from the "before" data
    self.bin_ranges = dict()
    for attr, col in zip(independent_attrs, zip(*self.before_all_data)):
        self.bin_ranges[attr] = toolkit.binrange(col)
def __init__(self, before_db, after_db):
    self.before_db = before_db
    self.after_db = after_db
    self.__sensitive_attrs = []

    # load the before database
    with open(before_db, 'r') as f:
        reader = csv.reader(f)
        self.before_attrs = next(reader)
        self.before_all_data = []
        for line in reader:
            self.before_all_data.append(line)
    self.before_all_data = [map(toolkit.str2num, row) for row in self.before_all_data]  # str to numeric

    # load the after database
    with open(after_db, 'r') as f:
        reader = csv.reader(f)
        self.after_attrs = next(reader)
        self.after_all_data = []
        for line in reader:
            self.after_all_data.append(line)
    self.after_all_data = [map(toolkit.str2num, row) for row in self.after_all_data]  # str to numeric

    # discretize the attributes: determine the bin_ranges from the "before" data
    # (skip the last column of the "after" header, which is the dependent attribute)
    self.bin_ranges = dict()
    for attr in self.after_attrs[:-1]:
        temp = self.before_attrs.index(attr)
        col = [original_data_row[temp] for original_data_row in self.before_all_data]
        self.bin_ranges[attr] = toolkit.binrange(col)
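# The two loader variants above follow the same pattern: read the "before" and "after"
# CSVs, coerce fields to numbers with toolkit.str2num, and precompute per-attribute bin
# ranges from the "before" data. The sketch below shows how such a comparator might be
# instantiated. This is a hedged illustration only: the class name DataComparator, the
# "ptz_out" folder, and the attribute names are assumptions, not names from this repository.

# from comparator import DataComparator   # ASSUMED module/class name, for illustration

independent_attrs = ['attr_a', 'attr_b', 'attr_c']   # placeholder attribute names

comparator = DataComparator(
    'db4school/raw/student.csv',       # "before" (original) database
    'db4school/ptz_out/student.csv',   # "after" (privatized) database -- folder name assumed
    independent_attrs,
)

# bin_ranges maps each independent attribute to the bin boundaries computed from
# the "before" data; these boundaries drive the later discretization steps.
for attr in sorted(comparator.bin_ranges):
    print('%s: %s' % (attr, comparator.bin_ranges[attr]))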
def data_set_split(model):
    """
    Split the data into a testing set and a non-testing (training) set.
    NOTE: the dependent variable is handled (discretized) here.
    :param model: name of the model
    """
    # load the original data
    with open('db4school/raw/' + model + '.csv', 'r') as db:
        reader = csv.reader(db)
        head = next(reader)
        all_original_data = []
        for line in reader:
            all_original_data.append(line)

    # discretize the dependent variable
    classes = [i[-1] for i in all_original_data]  # last column in the original csv file
    classes = map(toolkit.str2num, classes)
    if 0 in classes:
        # binarize the classification
        classes = [int(bool(int(c))) for c in classes]
    else:
        from toolkit import binrange
        slot = binrange(classes)
        tmp_c = list()
        for c in classes:
            cursor = 0
            for i in slot:
                if c > i:
                    cursor += 1
                else:
                    break
            tmp_c.append(cursor)
        classes = tmp_c

    for l, c in zip(all_original_data, classes):
        l[-1] = c
    # discretization done

    # split the data body
    random.shuffle(all_original_data)
    line = int(len(all_original_data) * (1 - settings.test_set_ratio))

    # write the train set
    with open('db4school/train/' + model + '.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows([head])
        writer.writerows(all_original_data[0:line])

    # write the test set
    with open('db4school/test/' + model + '.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows([head])
        writer.writerows(all_original_data[line:])
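# data_set_split writes its output straight to db4school/train/ and db4school/test/,
# so a caller only passes the model (data set) name. A minimal sketch follows, assuming
# the function is importable and that settings.test_set_ratio is a fraction such as 0.2;
# the module name and the data set name 'student' are assumptions for illustration.

# from data_util import data_set_split   # ASSUMED module name
import csv

data_set_split('student')   # reads db4school/raw/student.csv

# afterwards db4school/train/student.csv and db4school/test/student.csv exist,
# with the last (dependent) column discretized into class labels
with open('db4school/test/student.csv', 'r') as f:
    rows = list(csv.reader(f))
print('test set size: %d records' % (len(rows) - 1))   # minus the header row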
def cliff_core(data, percentage, obj_as_binary, handled_obj=False):
    """
    data has no header, only containing the record attributes
    :return: the cliffed data INDICES (part of the input data)
    """
    if len(data) < 50:
        logging.debug("not enough data to cliff. returning the whole dataset")
        return range(len(data))

    # percentage /= 100 if percentage > 1 else 1
    classes = map(toolkit.str2num, zip(*data)[-1])

    if not handled_obj:
        if obj_as_binary:
            classes = [1 if i > 0 else 0 for i in classes]
        else:
            classes = toolkit.apply_bin_range(classes)

    data_power = list()  # will be a 2D list (list of list)
    for col in zip(*data):
        col = map(toolkit.str2num, col)
        E = toolkit.binrange(col)
        data_power.append(power(col, classes, E))
    data_power = map(list, zip(*data_power))  # transposing the data power

    row_sum = [sum(row) for row in data_power]
    index = range(len(data))

    zips = zip(data, classes, row_sum, index)

    output = list()
    for cls in set(classes):
        matched = filter(lambda z: z[1] == cls, zips)
        random.shuffle(matched)
        matched = sorted(matched, key=lambda z: z[2], reverse=True)

        if len(matched) < 5:
            output.extend([m[3] for m in matched])  # all saved
            continue

        for i in range(int(len(matched) * percentage)):
            output.append(matched[i][3])
    return sorted(output)
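# This variant of cliff_core returns row indices rather than rows, so callers can slice
# the original table (or a parallel structure) themselves. Below is a hedged usage sketch;
# the module name and the toy data are assumptions, and it presumes power() and toolkit
# are available wherever cliff_core is defined.

# from CLIFF import cliff_core   # ASSUMED module name
import random

# toy table of strings, as the CSV-loaded data would be: two numeric attributes
# plus a last column used as the (already discretized) objective
toy = [['%0.3f' % random.random(), '%0.3f' % random.random(), random.choice(['0', '1', '2'])]
       for _ in range(200)]

# keep the 40% most "powerful" rows of each class; handled_obj=True says the
# objective column is already binned, so no further binarization is applied
kept = cliff_core(toy, percentage=0.4, obj_as_binary=False, handled_obj=True)

reduced = [toy[i] for i in kept]   # the returned indices select rows from the original table
print('kept %d of %d rows' % (len(reduced), len(toy)))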
def cliff_core(data):
    """
    data has no header, only containing the record attributes
    :return: the cliffed data (part of the input data)
    """
    if len(data) < 50:
        logging.debug("not enough data to cliff. returning the whole dataset")
        return data

    percentage = float(settings.CLIFF_percentage)  # float() guards against integer division below
    percentage /= 100 if percentage > 1 else 1

    classes = map(toolkit.str2num, zip(*data)[-1])

    data_power = list()  # will be a 2D list (list of list)
    for col in zip(*data):
        col = map(toolkit.str2num, col)
        E = toolkit.binrange(col)
        data_power.append(power(col, classes, E))
    data_power = map(list, zip(*data_power))  # transposing the data power

    row_sum = [sum(row) for row in data_power]

    zips = zip(data, classes, row_sum)

    output = list()
    for cls in set(classes):
        matched = filter(lambda z: z[1] == cls, zips)
        random.shuffle(matched)
        matched = sorted(matched, key=lambda z: z[2], reverse=True)

        if len(matched) < 5:
            output.extend([m[0] for m in matched])  # all saved
            continue

        for i in range(int(len(matched) * percentage)):
            output.append(matched[i][0])
    return output
def apriori_cmpr(model, org_folder, ptz_folder):
    """
    Note: ignore the class attribute; just focus on the independent attributes.
    :param model:
    :param org_folder:
    :param ptz_folder:
    :return:
    """
    # load the data sets
    org_attrs, org_data = toolkit.load_csv(org_folder, model)
    org_data = map(lambda r: map(toolkit.str2num, r), org_data)
    ptz_attrs, ptz_data = toolkit.load_csv(ptz_folder, model)
    ptz_data = map(lambda r: map(toolkit.str2num, r), ptz_data)
    ptz_data = toolkit.del_col_in_table(ptz_data, -1)

    # delete the useless columns
    attributes = settings.record_attrs
    org_dataT = map(list, zip(*org_data))
    org_dataT = [col for col, a1 in zip(org_dataT, org_attrs) if a1 in attributes]
    org_data = map(list, zip(*org_dataT))

    # discretize the data:
    # translate each continuous attribute into 'attr:level' tags
    dis_org_data = []
    dis_ptz_data = []
    # ranges_dict = dict()  # for backup

    for attr_name, col1, col2 in zip(attributes, zip(*org_data), zip(*ptz_data)):
        col1 = list(col1)
        col2 = list(col2)
        col = col1 + col2  # NOTE: put the two data sets together
        ranges = toolkit.binrange(col)
        # ranges_dict[attr_name] = ranges

        tags = []
        for element in col1:
            for cursor, upper_bound in enumerate(ranges):
                if upper_bound >= element:
                    break
            # lower_bound = ranges[max(cursor - 1, 0)]
            # mid = (upper_bound + lower_bound) / 2
            # if type(mid) is float:
            #     mid = round(mid, 2)
            # tags.append(attr_name + ':' + str(mid))
            tags.append(attr_name + ':' + str(cursor))
        dis_org_data.append(tags)

        tags = []
        for element in col2:
            for cursor, upper_bound in enumerate(ranges):
                if upper_bound >= element:
                    break
            tags.append(attr_name + ':' + str(cursor))
        dis_ptz_data.append(tags)

    dis_org_data = map(list, zip(*dis_org_data))
    dis_ptz_data = map(list, zip(*dis_ptz_data))

    logging.info("Database discretization done.")

    org_iter = dataset_iter(dis_org_data)
    ptz_iter = dataset_iter(dis_ptz_data)

    items_org, rules_org = runApriori(org_iter, settings.apriori_min_support, settings.apriori_min_confidence)
    items_ptz, rules_ptz = runApriori(ptz_iter, settings.apriori_min_support, settings.apriori_min_confidence)

    return items_org, items_ptz, rules_org, rules_ptz, dis_org_data, dis_ptz_data
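# apriori_cmpr returns the frequent item sets and association rules mined from the original
# and the privatized data, plus the discretized tables they were mined from, which lets a
# caller compare how much rule structure survives privatization. A hedged sketch follows;
# the module name, the data set name, and the 'ptz_out' folder are assumptions -- only the
# db4school/raw/ path appears elsewhere in this code.

# from apriori import apriori_cmpr   # ASSUMED module name

items_org, items_ptz, rules_org, rules_ptz, _, _ = apriori_cmpr(
    'student',            # model / data set name (assumed)
    'db4school/raw',      # folder holding the original csv
    'db4school/ptz_out',  # folder holding the privatized csv (assumed name)
)

# coarse comparison of what the two data sets yield under the same support/confidence settings
print('original:   %d frequent item sets, %d rules' % (len(items_org), len(rules_org)))
print('privatized: %d frequent item sets, %d rules' % (len(items_ptz), len(rules_ptz)))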