def create_matrix(self, data, label):
    """
    Build a coding matrix bottom-up by repeatedly merging two label groups.

    Every label starts in its own group. On each round all pairs of groups
    are scored with ``Criterion.agg_score`` (using
    ``Criterion.max_distance_score``); the pair with the smallest score
    yields one matrix column (+1 for the first group's labels, -1 for the
    second's, 0 elsewhere) and is then merged into a single group placed at
    the front of the group list.

    :param data: samples, forwarded to ``_get_data_subset``
    :param label: label of every sample
    :return: ``(matrix, index)``; ``matrix`` has one row per unique label
        and one column per merge (``None`` when there are fewer than two
        unique labels), ``index`` maps each label to its row
    """
    unique_labels = np.unique(label)
    index = {name: row for row, name in enumerate(unique_labels)}
    row_count = len(unique_labels)
    groups = [[name] for name in unique_labels]

    columns = []
    # Deliberately initialised outside the loop: when no pair beats the
    # running minimum, the previously selected pair is what gets used.
    best_first = []
    best_second = []

    while len(groups) > 1:
        lowest = np.inf
        for pos, first in enumerate(groups[:-1]):
            for second in groups[pos + 1:]:
                first_data, first_label = _get_data_subset(
                    data, label, first)
                second_data, second_label = _get_data_subset(
                    data, label, second)
                current = Criterion.agg_score(
                    first_data, first_label,
                    second_data, second_label,
                    score=Criterion.max_distance_score)
                if current < lowest:
                    lowest, best_first, best_second = current, first, second

        column = np.zeros((row_count, 1))
        for name in best_first:
            column[index[name]] = 1
        for name in best_second:
            column[index[name]] = -1
        columns.append(column)

        merged = best_first + best_second
        groups.remove(best_first)
        groups.remove(best_second)
        groups.insert(0, merged)

    matrix = np.hstack(columns) if columns else None
    return matrix, index
def create_matrix(self, data, label):
    """
    Build a coding matrix top-down by recursively splitting the label set.

    Starting from the full set of unique labels, each pending set is split
    into two parts by ``sffs``. Every split contributes one column: +1 for
    labels in the first part, -1 for labels in the second, 0 for labels not
    involved. Parts holding more than one label are queued for further
    splitting (breadth-first).

    :param data: samples, forwarded to ``_get_data_subset``
    :param label: label of every sample
    :return: ``(matrix, index)`` where ``index`` maps each label to its
        row in ``matrix``
    """
    unique_labels = np.unique(label)
    index = {name: row for row, name in enumerate(unique_labels)}

    columns = []
    pending = [unique_labels]
    while pending:
        current_set = pending.pop(0)
        subset_data, subset_label = _get_data_subset(
            data, label, current_set)
        positive, negative = sffs(subset_data, subset_label)

        column = np.zeros((len(index), 1))
        for name in positive:
            column[index[name]] = 1
        for name in negative:
            column[index[name]] = -1
        columns.append(column)

        # Only parts that can still be divided go back on the queue.
        for part in (positive, negative):
            if len(part) > 1:
                pending.append(part)

    matrix = np.hstack(columns) if columns else None
    return matrix, index
def create_matrix(self, train_data, train_label, validate_data,
                  validate_label, estimator, **param):
    """
    Build a coding matrix top-down, choosing each split by validation score.

    For every pending label set, all ways of picking ``ceil(n / 2)`` labels
    as the positive side are tried. For each candidate an
    ``estimator(**param)`` is fitted on the training samples (+1 / -1
    targets) and scored on the validation samples; the candidate with the
    highest score (later candidates win ties) becomes a matrix column and
    its fitted estimator is kept. Sides with more than one label are queued
    for further splitting.

    :param train_data: training samples
    :param train_label: label of every training sample
    :param validate_data: validation samples
    :param validate_label: label of every validation sample
    :param estimator: callable returning an object with ``fit(X, y)``
        (which must return the fitted object) and ``score(X, y)``
    :param param: keyword arguments forwarded to ``estimator``
    :return: ``(matrix, index, predictors, predictor_weights)``; ``index``
        maps each label to its row in ``matrix``, ``predictors`` holds the
        selected estimator per column and ``predictor_weights`` the value
        of ``_estimate_weight(1 - best_score)`` per column
    """
    index = {l: i for i, l in enumerate(np.unique(train_label))}
    matrix = None
    predictors = []
    predictor_weights = []
    labels_to_divide = [np.unique(train_label)]

    while len(labels_to_divide) > 0:
        label_set = labels_to_divide.pop(0)
        label_count = len(label_set)
        # ceil(label_count / 2) in pure integer arithmetic. The former
        # ``np.int(np.ceil(...))`` fails on NumPy >= 1.24, where the
        # ``np.int`` alias no longer exists.
        positive_size = (label_count + 1) // 2
        groups = combinations(range(label_count), positive_size)

        score_result = 0
        est_result = None
        # NOTE(review): if no candidate reaches ``score >= 0`` (e.g. a NaN
        # score), the ``*_variety_result`` names keep whatever an earlier
        # iteration left in them - confirm scores are always >= 0.
        for group in groups:
            class_1_variety = np.array([label_set[i] for i in group])
            class_2_variety = np.array(
                [l for l in label_set if l not in class_1_variety])

            # Fit a binary (+1 / -1) estimator for this candidate split.
            class_1_data, _ = _get_data_subset(
                train_data, train_label, class_1_variety)
            class_2_data, _ = _get_data_subset(
                train_data, train_label, class_2_variety)
            class_1_cla = np.ones(len(class_1_data))
            class_2_cla = -np.ones(len(class_2_data))
            train_d = np.vstack((class_1_data, class_2_data))
            train_c = np.hstack((class_1_cla, class_2_cla))
            est = estimator(**param).fit(train_d, train_c)

            # Assemble the matching validation set.
            class_1_data, _ = _get_data_subset(
                validate_data, validate_label, class_1_variety)
            class_2_data, _ = _get_data_subset(
                validate_data, validate_label, class_2_variety)
            class_1_cla = np.ones(len(class_1_data))
            class_2_cla = -np.ones(len(class_2_data))
            validation_d = np.array([])
            validation_c = np.array([])
            try:
                validation_d = np.vstack((class_1_data, class_2_data))
                validation_c = np.hstack((class_1_cla, class_2_cla))
            except Exception:
                # Best effort: when the two sides cannot be stacked, fall
                # back to whichever side actually has samples.
                if len(class_1_data) > 0:
                    validation_d = class_1_data
                    validation_c = class_1_cla
                elif len(class_2_data) > 0:
                    validation_d = class_2_data
                    validation_c = class_2_cla

            if validation_d.shape[0] > 0 and validation_d.shape[1] > 0:
                score = est.score(validation_d, validation_c)
            else:
                # No usable validation samples: use a fixed fallback score.
                score = 0.8

            if score >= score_result:
                score_result = score
                est_result = est
                class_1_variety_result = class_1_variety
                class_2_variety_result = class_2_variety

        new_col = np.zeros((len(index), 1))
        for i in class_1_variety_result:
            new_col[index[i]] = 1
        for i in class_2_variety_result:
            new_col[index[i]] = -1

        if matrix is None:
            matrix = copy.copy(new_col)
        else:
            matrix = np.hstack((matrix, new_col))

        predictors.append(est_result)
        predictor_weights.append(_estimate_weight(1 - score_result))

        if len(class_1_variety_result) > 1:
            labels_to_divide.append(class_1_variety_result)
        if len(class_2_variety_result) > 1:
            labels_to_divide.append(class_2_variety_result)

    return matrix, index, predictors, predictor_weights
def create_matrix(self, data, label):
    """
    Build a real-valued coding matrix from SFFS splits and class centres.

    Each pending label set is split in two by ``sffs`` (scored with
    ``Criterion.max_center_distance_score``). For both sides the mean
    sample (centre) is computed, and for every label the fraction of its
    samples lying at least as close to their own side's centre as to the
    other side's centre is measured. That fraction is stored in the new
    column: positive for labels of the first side, negated for labels of
    the second side, 0 for labels not involved. Sides with more than one
    label are queued for further splitting.

    :param data: samples, forwarded to ``_get_data_subset``
    :param label: label of every sample
    :return: ``(matrix, index)`` where ``index`` maps each label to its
        row in ``matrix``
    """

    def _hits_and_totals(samples, sample_labels, own_center, rival_center,
                         varieties):
        """Per label: samples nearer the own centre, and total samples."""
        nearer_own = [
            euclidean_distance(x, own_center)
            <= euclidean_distance(x, rival_center)
            for x in samples
        ]
        hits = {k: 0 for k in varieties}
        for y in sample_labels[nearer_own]:
            hits[y] += 1
        as_list = list(sample_labels)
        totals = {k: as_list.count(k) for k in varieties}
        return hits, totals

    unique_labels = np.unique(label)
    index = {name: row for row, name in enumerate(unique_labels)}

    columns = []
    pending = [unique_labels]
    while pending:
        current_set = pending.pop(0)
        subset_data, subset_label = _get_data_subset(
            data, label, current_set)
        positive, negative = sffs(
            subset_data, subset_label,
            score=Criterion.max_center_distance_score)

        pos_data, pos_label = _get_data_subset(data, label, positive)
        neg_data, neg_label = _get_data_subset(data, label, negative)
        pos_center = np.average(pos_data, axis=0)
        neg_center = np.average(neg_data, axis=0)

        pos_hits, pos_totals = _hits_and_totals(
            pos_data, pos_label, pos_center, neg_center, positive)
        neg_hits, neg_totals = _hits_and_totals(
            neg_data, neg_label, neg_center, pos_center, negative)

        pos_ratio = {k: pos_hits[k] / pos_totals[k] for k in positive}
        neg_ratio = {k: -neg_hits[k] / neg_totals[k] for k in negative}

        column = np.zeros((len(index), 1))
        for name, value in pos_ratio.items():
            column[index[name]] = value
        for name, value in neg_ratio.items():
            column[index[name]] = value
        columns.append(column)

        for part in (positive, negative):
            if len(part) > 1:
                pending.append(part)

    matrix = np.hstack(columns) if columns else None
    return matrix, index
def sffs(data, labels, judge_score=Criterion.divide_score, **param):
    """
    Sequential floating forward searching(SFFS) method

    Alternates a forward step, which adds to the target subset the single
    label giving the best ``judge_score``, with a backward step, which
    keeps dropping one label from the target subset for as long as that
    beats the score of the latest accepted step. Every accepted subset is
    recorded, and subsets already recorded are never evaluated again.

    :param data: data
    :param labels: label
    :param judge_score: a callable object to evaluate the score for partition
    :param param: params for judge_score
    :return: partition labels, as a pair ``(target labels, other labels)``
    """
    unique_label = np.unique(labels)
    target_label = []
    other_label = []
    best_target_label = []
    best_other_label = []
    score_list = {}  # step number -> score accepted at that step
    target_label_list = []  # target subset accepted at each step, in order
    target_label_list_pre_len = 0
    K = 0
    pre_K = None

    while True:
        # ---- forward step: try adding each label not yet in the target ----
        best_score = -np.inf
        update_flag = 0
        for label in unique_label:
            if label in target_label:
                continue
            # Build the candidate as a new list so that skipping it below
            # cannot leave ``label`` stuck inside ``target_label`` (the
            # old append / continue / pop sequence skipped the pop).
            candidate = target_label + [label]
            if candidate in target_label_list:
                continue
            other_label = [
                item for item in unique_label if item not in candidate
            ]
            target_data, target_labels = _get_data_subset(
                data, labels, candidate)
            other_data, other_labels = _get_data_subset(
                data, labels, other_label)
            score = judge_score(target_data, target_labels,
                                other_data, other_labels, **param)
            if score > best_score:
                best_score = score
                best_target_label = list(candidate)
                best_other_label = list(other_label)
                update_flag = 1

        if not update_flag:
            break
        score_list[K + 1] = best_score
        K = K + 1
        target_label = list(best_target_label)
        target_label_list.append(list(target_label))
        other_label = list(best_other_label)

        # ---- backward step: drop labels while that improves the score ----
        while True:
            best_score = -np.inf
            if len(target_label) < 2:
                break
            for label in target_label:
                target_label_temp = list(target_label)
                target_label_temp.remove(label)
                if target_label_temp in target_label_list:
                    continue
                other_label_temp = list(other_label)
                other_label_temp.append(label)
                target_data, target_labels = _get_data_subset(
                    data, labels, target_label_temp)
                other_data, other_labels = _get_data_subset(
                    data, labels, other_label_temp)
                # Forward ``param`` here too, so both steps score
                # partitions with the same options.
                score = judge_score(target_data, target_labels,
                                    other_data, other_labels, **param)
                if score > best_score:
                    best_score = score
                    best_target_label = list(target_label_temp)
                    best_other_label = list(other_label_temp)
            if K - 1 > 0 and best_score > score_list[K]:
                score_list[K + 1] = best_score
                target_label = list(best_target_label)
                target_label_list.append(list(target_label))
                other_label = list(best_other_label)
                K = K + 1
            else:
                break

        # ---- stopping rules ----
        # Stop once the score gained since the previous round is negligible.
        if K - 1 > 0 and score_list[K] - score_list[pre_K] < 0.00001:
            break
        # Stop when at most one label would remain outside the target.
        if len(target_label) >= len(unique_label) - 1:
            break
        # NOTE(review): this compares against len(target_label_list) but
        # the value stored below is len(target_label) - confirm which one
        # was intended; left unchanged to keep termination behaviour.
        if target_label_list_pre_len == len(target_label_list):
            break
        target_label_list_pre_len = len(target_label)
        pre_K = K

    if pre_K is None:
        # The search stopped during its first round, so no earlier round
        # exists to fall back on: return the current working partition.
        return target_label, other_label
    chosen = target_label_list[pre_K - 1]
    return chosen, [label for label in unique_label if label not in chosen]