import copy
import csv
import logging
import math
import random

import numpy

import settings
import toolkit


def find_distinct_distance(normalized_data_set):
    """
    Find the median of the distances which can distinguish the data.
    :param normalized_data_set: dataset to calculate. The dataset must be normalized
    :return: float -- the distinguishing distance
    """
    n = len(normalized_data_set)
    # find out all the classes that occur in the dataset
    classes = list(set([i[-1] for i in normalized_data_set]))
    assert len(classes) > 1, "unfortunately, all the selected data are in the same class."

    # map each class to the indices of its rows
    classes_index = dict()
    for c in classes:
        classes_index[c] = [
            index for index, i in enumerate(normalized_data_set) if i[-1] == c
        ]

    # for each row, record the distance to the nearest row of a different class
    distances = []
    for data in normalized_data_set:
        c = data[-1]
        diff_class_data_indices = [
            index for index in range(n) if index not in classes_index[c]
        ]
        distances.append(
            min([
                toolkit.euclidean_dist(data, normalized_data_set[index])
                for index in diff_class_data_indices
            ]))

    median_dist = numpy.median(numpy.array(distances))
    logging.debug("The distinguishing distance is %f" % median_dist)
    return median_dist
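
if __name__ == "__main__":
    # Minimal usage sketch for find_distinct_distance. The toy rows below are
    # made up for illustration; real callers pass data already normalized to
    # [0, 1], with a numeric class label in the last column.
    _toy = [
        [0.1, 0.2, 0],
        [0.2, 0.1, 0],
        [0.8, 0.9, 1],
        [0.9, 0.8, 1],
    ]
    print("distinguishing distance:", find_distinct_distance(_toy))
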
def whether_add_to_private_cache(data_instance, existed_cache,
                                 distinguish_distance):
    """
    Decide whether an instance is far enough from every cached instance.
    :param data_instance: candidate row; this should be normalized
    :param existed_cache: the rows cached so far; these should be normalized
    :param distinguish_distance: threshold, e.g. from find_distinct_distance()
    :return: True if the instance should be added to the cache
    """
    # TODO whether need to check the class?!
    for data in existed_cache:
        if toolkit.euclidean_dist(data_instance, data) <= distinguish_distance:
            return False
    return True
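
if __name__ == "__main__":
    # Sketch of building a "private cache" with the two helpers above: keep a
    # row only if it is farther than the distinguishing distance from every
    # row cached so far. Reuses the illustrative _toy rows from the sketch
    # above; the cache-building loop itself is an assumption, not project API.
    _dist = find_distinct_distance(_toy)
    _cache = []
    for _row in _toy:
        if whether_add_to_private_cache(_row, _cache, _dist):
            _cache.append(_row)
    print("kept %d of %d rows" % (len(_cache), len(_toy)))
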
def simplify_morph(data, alpha, beta):
    """
    Same as MORPH, but requires that
    1) data is pre-processed: no header, class label in the last column
    2) data has been normalized
    """
    classes = [row[-1] for row in data]
    data = [i for i in data]  # shallow copy; the rows are still shared with the caller
    for row_index, row in enumerate(data):  # for each row
        # indices of all rows that belong to a different class
        heterogeneous_index = [
            i for i in range(len(classes)) if classes[i] != classes[row_index]
        ]
        boundary_dist = min([
            toolkit.euclidean_dist(row, data[heg])
            for heg in heterogeneous_index
        ])
        boundary_dist /= math.sqrt(len(data[0]) - 2)
        for i in range(len(row) - 1):  # skip the class label in the last column
            data[row_index][i] += boundary_dist * random.uniform(
                alpha, beta) * random.choice([1, -1])  # shake
    return data
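
if __name__ == "__main__":
    # simplify_morph sketch: rows must be normalized, with no header and a
    # numeric class label in the last column. alpha/beta bound the random
    # nudge as a fraction of the distance to the nearest foreign-class row.
    # The rows are shaken in place, hence the defensive deepcopy of _toy.
    print(simplify_morph(copy.deepcopy(_toy), alpha=0.15, beta=0.35))
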
def morph(attribute_names,
          data_matrix,
          independent_attrs,
          objective_attr,
          objective_as_binary=False,
          data_has_normalized=False,
          alpha=0.15,
          beta=0.35):
    """
    morph is an instance mutator which shakes each instance within its class boundary
    :param attribute_names: the names of the attributes; must match the data_matrix
    :param data_matrix: the original data
    :param independent_attrs: the independent attributes in the dataset.
        Note: 'name', 'id', etc. might not be considered independent attributes
    :param objective_attr: which attribute is the objective to be considered
    :param objective_as_binary: whether to treat the objective as a binary attribute. Default: False
    :param data_has_normalized: whether the data matrix has already been normalized
    :param alpha: morph algorithm parameter
    :param beta: morph algorithm parameter
    :return: the morphed data matrix
    """
    # pick out the independent columns and the objective column
    dataset_t = [list(col) for col in zip(*data_matrix)]
    dataset = list()
    classes = list()
    for d, a in zip(dataset_t, attribute_names):
        if a in independent_attrs:
            dataset.append(d)
        if a == objective_attr:
            classes = list(d)
    dataset = [list(row) for row in zip(*dataset)]
    dataset = [[toolkit.str2num(x) for x in row] for row in dataset]  # str to numeric
    classes = [toolkit.str2num(c) for c in classes]
    if objective_as_binary:
        classes = [1 if i > 0 else 0 for i in classes]
    else:
        classes = toolkit.apply_bin_range(classes)

    # remember which columns hold ints, for better representation of the output table
    is_int = [type(i) is int for i in dataset[0]]

    if data_has_normalized:
        # add two instances (all zeros and all ones) so that the normalization and
        # de-normalization process does not damage the original data
        dataset.append([0] * len(dataset[0]))
        dataset.append([1] * len(dataset[0]))

    '''dataset transposed mode begins...'''
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    norm_funcs = []
    denorm_funcs = []
    # normalizing
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        f1, f2 = toolkit.attr_norm(attr_elements)
        norm_funcs.append(f1)
        denorm_funcs.append(f2)
        dataset[attr_index] = [f1(x) for x in attr_elements]
    '''dataset mode recover...'''
    dataset = [list(row) for row in zip(*dataset)]  # transpose again

    # shake each original row; the two sentinel rows (if any) carry no class
    # label, so they are left alone and dropped after de-normalization
    for row_index in range(len(classes)):  # for each row
        row = dataset[row_index]
        heterogeneous_index = [
            i for i in range(len(classes)) if classes[i] != classes[row_index]
        ]
        boundary_dist = min([
            toolkit.euclidean_dist(row, dataset[heg])
            for heg in heterogeneous_index
        ])
        boundary_dist /= math.sqrt(len(independent_attrs) - 1)
        for i in range(len(row)):
            dataset[row_index][i] += boundary_dist * random.uniform(
                alpha, beta) * random.choice([1, -1])  # shake

    '''dataset transposed mode begins...'''
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        # scale back to the original range
        dataset[attr_index] = [denorm_funcs[attr_index](x) for x in attr_elements]
        for i in range(len(dataset[attr_index])):
            if is_int[attr_index]:
                dataset[attr_index][i] = int(round(dataset[attr_index][i]))  # round when needed
            else:
                dataset[attr_index][i] = round(dataset[attr_index][i], 4)
    morphed = [list(row) for row in zip(*dataset)]  # recover the original mode and finish
    '''!!morph done!!'''

    if data_has_normalized:
        morphed = morphed[:-2]  # drop the two sentinel rows

    # stitch the morphed independent columns back into the original matrix
    res = list()
    for x, dm in zip(morphed, data_matrix):
        row = list()
        tmp = 0
        for attri, attr in enumerate(attribute_names):
            if attr in independent_attrs:
                row.append(x[tmp])
                tmp += 1
            elif attr == objective_attr:
                row.append(toolkit.str2num(dm[attri]))
            else:
                row.append(dm[attri])
        res.append(row)
    return res
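
if __name__ == "__main__":
    # morph sketch on a tiny made-up table. The column names, the choice of
    # independent attributes, and the objective column are illustrative only;
    # note that 'name' is left out of independent_attrs and is therefore
    # passed through untouched.
    _names = ['name', 'x', 'y', 'bug']
    _rows = [
        ['a', '1', '2', '0'],
        ['b', '2', '1', '0'],
        ['c', '8', '9', '1'],
        ['d', '9', '8', '1'],
    ]
    for _r in morph(_names, _rows, independent_attrs=['x', 'y'],
                    objective_attr='bug', objective_as_binary=True):
        print(_r)
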
def MORPH(database,
          db_folder='not_from_csv_file',
          write_out_folder=None,
          db_has_normalized=False,
          effect_scope=None):
    """
    MORPH is an instance mutator which shakes each instance within its class boundary
    :param database: the original data
    :param db_folder: where the database is fetched from. By default, the database is a list of lists
    :param write_out_folder: the folder to write to. None means no writing
    :param db_has_normalized: whether the database has been normalized
    :param effect_scope: the [start, end) scope in the database to be morphed.
        Any data beyond the scope will remain the same. Default: the whole database
    :return: the morphed data
    """
    alpha = settings.MORPH_alpha
    beta = settings.MORPH_beta

    # load the database
    if db_folder != 'not_from_csv_file':
        with open(db_folder + '/' + database + '.csv', 'r') as db:
            reader = csv.reader(db)
            attributes = next(reader)  # including the last one -- the class tag
            dataset = [line for line in reader]
        dataset = [[toolkit.str2num(x) for x in row] for row in dataset]  # str to numeric
    else:
        dataset = database
        attributes = ['foo'] * len(dataset[0])

    # resolve the effect scope without mutating a (shared) default argument
    scope_start, scope_end = effect_scope if effect_scope is not None else [0, -1]
    if scope_end < 0:
        scope_end += len(dataset) + 1
    # back up the rows outside the effect scope; the docstring promises that
    # they remain the same, so they are restored after morphing
    backup_rows = {
        i: copy.deepcopy(dataset[i])
        for i in range(len(dataset)) if not scope_start <= i < scope_end
    }

    if db_has_normalized:
        # add two instances (all zeros and all ones) so that the normalization and
        # de-normalization process does not damage the original data
        dataset.append([0] * len(dataset[0]))
        dataset.append([1] * len(dataset[0]))

    # remember which columns hold ints, for better representation of the output table
    is_int = [type(i) is int for i in dataset[0]]
    classes = [row[-1] for row in dataset]  # fetch the classes
    dataset = [row[:-1] for row in dataset]  # separate the raw data and the classes

    '''dataset transposed mode begins...'''
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    norm_funcs = []
    denorm_funcs = []
    # normalizing
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        f1, f2 = toolkit.attr_norm(attr_elements)
        norm_funcs.append(f1)
        denorm_funcs.append(f2)
        dataset[attr_index] = [f1(x) for x in attr_elements]
    '''dataset mode recover...'''
    dataset = [list(row) for row in zip(*dataset)]  # transpose again

    for row_index, row in enumerate(dataset):  # for each row
        heterogeneous_index = [
            i for i in range(len(classes)) if classes[i] != classes[row_index]
        ]
        boundary_dist = min([
            toolkit.euclidean_dist(row, dataset[heg])
            for heg in heterogeneous_index
        ])
        boundary_dist /= math.sqrt(len(attributes) - 1)
        for i in range(len(row)):
            dataset[row_index][i] += boundary_dist * random.uniform(
                alpha, beta) * random.choice([1, -1])  # shake

    '''dataset transposed mode begins...'''
    dataset = [list(col) for col in zip(*dataset)]  # transpose
    for attr_index, attr_elements in enumerate(dataset):  # for each attribute's elements
        # scale back to the original range
        dataset[attr_index] = [denorm_funcs[attr_index](x) for x in attr_elements]
        for i in range(len(dataset[attr_index])):
            if is_int[attr_index]:
                dataset[attr_index][i] = int(round(dataset[attr_index][i]))  # round when needed
    morphed = [list(row) for row in zip(*dataset)]  # recover the original mode and finish
    '''!!MORPH done!!'''
    # re-attach the class column
    for row_index in range(len(morphed)):
        morphed[row_index].append(classes[row_index])
    if db_has_normalized:
        morphed = morphed[:-2]  # drop the two sentinel rows

    # recover the un-morphed data; this happens before the header row is
    # inserted so that the backed-up indices still line up
    for row_index, backup in backup_rows.items():
        morphed[row_index] = backup

    # write out and return the results
    if db_folder != 'not_from_csv_file':
        morphed.insert(0, attributes)
    if write_out_folder:
        with open(write_out_folder + '/' + database + '.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerows(morphed)
    return morphed
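
if __name__ == "__main__":
    # MORPH sketch in its list-of-lists mode (db_folder left at its
    # 'not_from_csv_file' default): numeric rows with the class label last,
    # reusing the illustrative _toy rows from the first sketch. alpha/beta
    # come from the project's settings module; effect_scope defaults to the
    # whole dataset, so every row gets morphed. The deepcopy is defensive.
    for _r in MORPH(copy.deepcopy(_toy)):
        print(_r)
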