예제 #1
0
 def create_matrix(self, data, label):
     """Build a coding matrix by merging label groups bottom-up.

     Every distinct label starts as its own group. On each round all pairs
     of groups are scored with ``Criterion.agg_score`` (using
     ``Criterion.max_distance_score``); the pair with the lowest score is
     merged and one column is appended to the matrix: rows of the first
     group's labels are set to 1, rows of the second group's labels to -1,
     all other rows stay 0. The merged group is put at the front of the
     group list and the process repeats until one group is left.

     :param data: samples, forwarded unchanged to ``_get_data_subset``
     :param label: per-sample labels, forwarded to ``_get_data_subset``
     :return: ``(matrix, index)`` where ``index`` maps each distinct label
         to its row number and ``matrix`` has one row per label and one
         column per merge; ``matrix`` is ``None`` when there are fewer
         than two distinct labels (no merge happens)
     :raises ValueError: if no pair of groups yields a score below
         ``np.inf`` (for example when every score is NaN)
     """
     unique_labels = np.unique(label)
     # One label -> row mapping serves both the returned index and the
     # column construction below.
     index = {l: i for i, l in enumerate(unique_labels)}
     groups = [[x] for x in unique_labels]
     columns = []
     while len(groups) > 1:
         best_score = np.inf
         best_pair = None
         for i in range(len(groups) - 1):
             # The first operand only depends on i, so extract it once per
             # outer iteration instead of once per (i, j) pair.
             class_1_data, class_1_label = _get_data_subset(
                 data, label, groups[i])
             for j in range(i + 1, len(groups)):
                 class_2_data, class_2_label = _get_data_subset(
                     data, label, groups[j])
                 score = Criterion.agg_score(
                     class_1_data,
                     class_1_label,
                     class_2_data,
                     class_2_label,
                     score=Criterion.max_distance_score)
                 # Strict '<' keeps the first pair on ties.
                 if score < best_score:
                     best_score = score
                     best_pair = (i, j)
         if best_pair is None:
             # Without this guard a stale (or empty) pair from an earlier
             # round would be used and fail later in an obscure way.
             raise ValueError(
                 'no pair of label groups produced a score below infinity')
         first, second = best_pair
         class_1_variety = groups[first]
         class_2_variety = groups[second]
         new_col = np.zeros((len(index), 1))
         for member in class_1_variety:
             new_col[index[member]] = 1
         for member in class_2_variety:
             new_col[index[member]] = -1
         columns.append(new_col)
         # Remove by position (higher index first so the lower one stays
         # valid), then put the merged group at the front.
         groups.pop(second)
         groups.pop(first)
         groups.insert(0, class_1_variety + class_2_variety)
     matrix = np.hstack(columns) if columns else None
     return matrix, index
예제 #2
0
 def create_matrix(self, data, label):
     """Build a coding matrix by repeatedly splitting label sets with SFFS.

     Starting from the set of all distinct labels, each pending label set
     is split in two by ``sffs``. Every split contributes one column: rows
     of the first part are 1, rows of the second part are -1, all other
     rows are 0. Parts holding more than one label are queued for a
     further split.

     :param data: samples, forwarded to ``_get_data_subset``
     :param label: per-sample labels, forwarded to ``_get_data_subset``
     :return: ``(matrix, index)``; ``index`` maps each distinct label to
         its row and ``matrix`` has one column per split
     """
     index = {name: row for row, name in enumerate(np.unique(label))}
     pending = [np.unique(label)]
     columns = []
     while pending:
         current_set = pending.pop(0)
         subset_data, subset_labels = _get_data_subset(
             data, label, current_set)
         positive_part, negative_part = sffs(subset_data, subset_labels)

         column = np.zeros((len(index), 1))
         for name in positive_part:
             column[index[name]] = 1
         for name in negative_part:
             column[index[name]] = -1
         columns.append(column)

         # Only parts that can still be divided go back on the queue.
         for part in (positive_part, negative_part):
             if len(part) > 1:
                 pending.append(part)
     return np.hstack(columns), index
예제 #3
0
 def create_matrix(self, train_data, train_label, validate_data,
                   validate_label, estimator, **param):
     """Build a coding matrix by estimator-scored binary splits.

     Starting from the set of all distinct training labels, each pending
     label set is split in two. Every way of choosing
     ``ceil(len(label_set) / 2)`` labels as the positive side is tried: an
     ``estimator(**param)`` is fitted on the training samples of both
     sides (targets 1 / -1) and scored on the validation samples of the
     same labels. The split with the highest score wins (the last one on
     ties) and contributes one column (1 / -1 / 0), its fitted estimator
     and a weight ``_estimate_weight(1 - score)``. Sides holding more than
     one label are queued for a further split.

     :param train_data: training samples
     :param train_label: training labels
     :param validate_data: validation samples
     :param validate_label: validation labels
     :param estimator: callable; ``estimator(**param)`` must provide
         ``fit(X, y)`` returning the fitted object and ``score(X, y)``
     :param param: keyword arguments forwarded to ``estimator``
     :return: ``(matrix, index, predictors, predictor_weights)``
     """
     index = {l: i for i, l in enumerate(np.unique(train_label))}
     columns = []
     predictors = []
     predictor_weights = []
     labels_to_divide = [np.unique(train_label)]
     while labels_to_divide:
         label_set = labels_to_divide.pop(0)
         label_count = len(label_set)
         # Built-in int: the np.int alias was removed in NumPy 1.24.
         groups = combinations(range(label_count),
                               int(np.ceil(label_count / 2)))
         score_result = 0
         est_result = None
         for group in groups:
             class_1_variety = np.array([label_set[i] for i in group])
             class_2_variety = np.array(
                 [l for l in label_set if l not in class_1_variety])

             # Fit on the training samples of the two sides.
             class_1_data, class_1_label = _get_data_subset(
                 train_data, train_label, class_1_variety)
             class_2_data, class_2_label = _get_data_subset(
                 train_data, train_label, class_2_variety)
             class_1_cla = np.ones(len(class_1_data))
             class_2_cla = -np.ones(len(class_2_data))
             train_d = np.vstack((class_1_data, class_2_data))
             train_c = np.hstack((class_1_cla, class_2_cla))
             est = estimator(**param).fit(train_d, train_c)

             # Score on the validation samples of the same labels.
             class_1_data, class_1_label = _get_data_subset(
                 validate_data, validate_label, class_1_variety)
             class_2_data, class_2_label = _get_data_subset(
                 validate_data, validate_label, class_2_variety)
             class_1_cla = np.ones(len(class_1_data))
             class_2_cla = -np.ones(len(class_2_data))
             validation_d = np.array([])
             validation_c = np.array([])
             try:
                 validation_d = np.vstack((class_1_data, class_2_data))
                 validation_c = np.hstack((class_1_cla, class_2_cla))
             except ValueError:
                 # Stacking fails when the two sides do not have matching
                 # shapes (e.g. one side has no validation samples); fall
                 # back to whichever side has data.
                 if len(class_1_data) > 0:
                     validation_d = class_1_data
                     validation_c = class_1_cla
                 elif len(class_2_data) > 0:
                     validation_d = class_2_data
                     validation_c = class_2_cla
             if validation_d.shape[0] > 0 and validation_d.shape[1] > 0:
                 score = est.score(validation_d, validation_c)
             else:
                 # No validation samples for this split: use a fixed
                 # fallback score.
                 score = 0.8
             # '>=' means a later split replaces an earlier one on ties.
             if score >= score_result:
                 score_result = score
                 est_result = est
                 class_1_variety_result = class_1_variety
                 class_2_variety_result = class_2_variety

         new_col = np.zeros((len(index), 1))
         for i in class_1_variety_result:
             new_col[index[i]] = 1
         for i in class_2_variety_result:
             new_col[index[i]] = -1
         columns.append(new_col)
         predictors.append(est_result)
         predictor_weights.append(_estimate_weight(1 - score_result))
         if len(class_1_variety_result) > 1:
             labels_to_divide.append(class_1_variety_result)
         if len(class_2_variety_result) > 1:
             labels_to_divide.append(class_2_variety_result)
     matrix = np.hstack(columns)
     return matrix, index, predictors, predictor_weights
예제 #4
0
 def create_matrix(self, data, label):
     """Build a ratio-valued coding matrix from SFFS splits.

     Each pending label set is split in two by ``sffs`` (scored with
     ``Criterion.max_center_distance_score``). For both sides the mean
     sample (center) is computed; a sample counts as a hit when it is at
     least as close to its own side's center as to the other one. The
     column entry of a label is its hit count divided by its sample count,
     positive for the first side and negated for the second side; rows of
     labels outside the split stay 0. Sides holding more than one label
     are queued for a further split.

     :param data: samples, forwarded to ``_get_data_subset``
     :param label: per-sample labels, forwarded to ``_get_data_subset``
     :return: ``(matrix, index)``; ``index`` maps each distinct label to
         its row and ``matrix`` has one column per split
     """

     def _hits_and_totals(variety, samples, sample_labels, own_center,
                          rival_center):
         # Per label: how many samples lie no farther from their own
         # center than from the rival center, and how many samples exist.
         near_own = [
             euclidean_distance(x, own_center) <=
             euclidean_distance(x, rival_center)
             for x in samples
         ]
         hits = {k: 0 for k in variety}
         for y in sample_labels[near_own]:
             hits[y] += 1
         as_list = list(sample_labels)
         totals = {k: as_list.count(k) for k in variety}
         return hits, totals

     index = {name: row for row, name in enumerate(np.unique(label))}
     pending = [np.unique(label)]
     columns = []
     while pending:
         current_set = pending.pop(0)
         subset_data, subset_labels = _get_data_subset(
             data, label, current_set)
         side_1, side_2 = sffs(
             subset_data,
             subset_labels,
             score=Criterion.max_center_distance_score)

         side_1_data, side_1_labels = _get_data_subset(data, label, side_1)
         side_2_data, side_2_labels = _get_data_subset(data, label, side_2)
         center_1 = np.average(side_1_data, axis=0)
         center_2 = np.average(side_2_data, axis=0)

         hits_1, totals_1 = _hits_and_totals(
             side_1, side_1_data, side_1_labels, center_1, center_2)
         hits_2, totals_2 = _hits_and_totals(
             side_2, side_2_data, side_2_labels, center_2, center_1)

         column = np.zeros((len(index), 1))
         for k in side_1:
             column[index[k]] = hits_1[k] / totals_1[k]
         for k in side_2:
             # Second side is coded with the negated ratio.
             column[index[k]] = -hits_2[k] / totals_2[k]
         columns.append(column)

         for part in (side_1, side_2):
             if len(part) > 1:
                 pending.append(part)
     return np.hstack(columns), index
예제 #5
0
파일: SFFS.py 프로젝트: YannisYxn/TCGA-ECOC
def sffs(data, labels, judge_score=Criterion.divide_score, **param):
    """
    Sequential floating forward searching (SFFS) over label partitions.

    Each outer round first adds to the target group the label whose
    addition gives the highest ``judge_score`` (forward step), then keeps
    removing labels from the target group while a removal scores higher
    than the current best (backward step). Target groups that were already
    selected once are never evaluated again. The search stops when the
    score gain over the previous round is below 1e-5, when at most one
    label is left outside the target group, or when no new candidate
    exists.

    :param data: data
    :param labels: label of every sample in ``data``
    :param judge_score: callable
        ``judge_score(target_data, target_labels, other_data,
        other_labels, **param)`` returning a score; higher is better
    :param param: keyword arguments forwarded to every ``judge_score`` call
    :return: ``(target_labels, other_labels)``. When at least one full
        round completed, this is the target group recorded at the end of
        the last completed round and its complement; otherwise the current
        target group and its complement.
    """
    min_gain = 0.00001
    target_label = []
    other_label = []
    best_target_label = []
    best_other_label = []
    unique_label = np.unique(labels)
    score_list = {}           # step number K -> score of the group chosen
    target_label_list = []    # every target group selected so far
    target_label_list_pre_len = 0
    K = 0
    pre_K = None              # K at the end of the last completed round
    while True:
        # Forward step: try adding each label that is still outside.
        best_score = -np.inf
        improved = False
        for label in unique_label:
            if label in target_label:
                continue
            # Work on a fresh candidate list so that skipping it cannot
            # leave the trial label behind in target_label.
            candidate = target_label + [label]
            if candidate in target_label_list:
                continue
            other_label = [l for l in unique_label if l not in candidate]
            target_data, target_labels = _get_data_subset(
                data, labels, candidate)
            other_data, other_labels = _get_data_subset(
                data, labels, other_label)
            score = judge_score(target_data, target_labels, other_data,
                                other_labels, **param)
            if score > best_score:
                best_score = score
                best_target_label = list(candidate)
                best_other_label = list(other_label)
                improved = True
        if not improved:
            break
        K = K + 1
        score_list[K] = best_score
        target_label = list(best_target_label)
        target_label_list.append(list(target_label))
        other_label = list(best_other_label)

        # Backward step: drop labels while that beats the current score.
        while len(target_label) >= 2:
            best_score = -np.inf
            for label in target_label:
                target_label_temp = list(target_label)
                target_label_temp.remove(label)
                if target_label_temp in target_label_list:
                    continue
                other_label_temp = list(other_label)
                other_label_temp.append(label)
                target_data, target_labels = _get_data_subset(
                    data, labels, target_label_temp)
                other_data, other_labels = _get_data_subset(
                    data, labels, other_label_temp)
                # Same scoring arguments as the forward step, so the two
                # kinds of score are comparable.
                score = judge_score(target_data, target_labels, other_data,
                                    other_labels, **param)
                if score > best_score:
                    best_score = score
                    best_target_label = list(target_label_temp)
                    best_other_label = list(other_label_temp)
            if K - 1 > 0 and best_score > score_list[K]:
                K = K + 1
                score_list[K] = best_score
                target_label = list(best_target_label)
                target_label_list.append(list(target_label))
                other_label = list(best_other_label)
            else:
                break

        # Termination checks for this round.
        if K - 1 > 0 and score_list[K] - score_list[pre_K] < min_gain:
            break
        if len(target_label) >= len(unique_label) - 1:
            break
        if target_label_list_pre_len == len(target_label_list):
            break
        # Compare like with like: number of recorded groups per round.
        target_label_list_pre_len = len(target_label_list)
        pre_K = K

    if pre_K is None:
        # No round completed; fall back to the current state.
        return target_label, other_label
    chosen = target_label_list[pre_K - 1]
    return chosen, [label for label in unique_label if label not in chosen]