Example #1
# Assumed imports for this snippet (the original module is not shown);
# `vectorize` is taken to be EDeN's graph vectorizer.
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import RFECV
from eden.graph import vectorize

def get_feature_scaling(graphs,
                        targets,
                        decomposition_funcs=None,
                        preprocessors=None,
                        nbits=11,
                        threshold=0.25):

    x = vectorize(graphs,
                  decomposition_funcs,
                  preprocessors=preprocessors,
                  nbits=nbits,
                  seed=1)
    estimator = SGDClassifier(penalty='elasticnet', tol=1e-3)
    fs = RFECV(estimator, step=.1, cv=3)
    fs.fit(x, targets)
    importances = fs.inverse_transform(fs.estimator_.coef_).reshape(-1)
    signs = np.sign(importances)
    importances = np.absolute(importances)
    importances = importances / np.max(importances)
    # Percentile-based thresholding: zero out the least important features
    th = np.percentile(importances, threshold * 100)
    signs[importances < th] = 0
    importances[importances < th] = 0
    return importances, signs
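
Setting the graph-specific vectorize step aside, the core pattern above (RFECV around an elastic-net SGD classifier, coefficients mapped back to the full feature space, rescaled, then percentile-thresholded) works on any feature matrix. A minimal self-contained sketch on synthetic data (make_classification stands in for the vectorized graphs and is not part of the original):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import RFECV

# Synthetic stand-in for the vectorized graphs.
x, targets = make_classification(n_samples=200, n_features=50,
                                 n_informative=5, random_state=1)

fs = RFECV(SGDClassifier(penalty='elasticnet', tol=1e-3), step=.1, cv=3)
fs.fit(x, targets)

# Map coefficients of the surviving features back to the full feature space.
importances = fs.inverse_transform(fs.estimator_.coef_).reshape(-1)
signs = np.sign(importances)
importances = np.absolute(importances) / np.max(np.absolute(importances))

# Zero out everything below the 25th percentile of importance.
th = np.percentile(importances, 25)
signs[importances < th] = 0
importances[importances < th] = 0
print(np.count_nonzero(importances), "features kept")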
Example #2
    # Assumes module-level imports of SGDClassifier, RFECV, and the
    # project's vectorize_graphs helper.
    def feature_importance(self, pos_graphs, neg_graphs):
        graphs = pos_graphs + neg_graphs
        y = [1] * len(pos_graphs) + [-1] * len(neg_graphs)
        x = vectorize_graphs(graphs,
                             encoding_func=self.encoding_func,
                             feature_size=self.feature_size)
        estimator = SGDClassifier(penalty='elasticnet', tol=1e-3)
        fs = RFECV(estimator, step=.1, cv=3)
        fs.fit(x, y)
        self.estimator = fs.estimator_
        # Map the selected features' coefficients back to the original
        # feature space; eliminated features get importance 0.
        importances = fs.inverse_transform(fs.estimator_.coef_).reshape(-1)
        intercept = fs.estimator_.intercept_[0]
        importance_dict = dict(enumerate(importances))
        return importance_dict, intercept
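
The returned dictionary and intercept together define a linear scoring function over the original feature space. A sketch of how a caller might recombine them (the `linear_score` helper and `x_vec` argument are illustrative names, not part of the original):

import numpy as np

def linear_score(x_vec, importance_dict, intercept):
    # x_vec is assumed to be a 1-D dense feature vector in the same
    # (inverse-transformed) space as the importances.
    w = np.zeros(len(importance_dict))
    for idx, coef in importance_dict.items():
        w[idx] = coef
    return float(np.dot(w, x_vec) + intercept)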
Example #3
    # Assumed module-level imports for this snippet: copy, numpy as np,
    # StandardScaler, StratifiedKFold, RFECV.
    def fit(self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None):
        # Store the original feature list and standardize the data.
        # Note: despite the `X_minmax` name, StandardScaler applies z-score
        # standardization, not min-max scaling.
        list_temp = self.feature_list
        scaler = StandardScaler()
        X_minmax = scaler.fit_transform(X)
        self.X_minmax = copy.deepcopy(X_minmax)
        self.scores = []

        # Stratified 5-fold splitter, reused for the outer CV loop and for
        # RFECV's internal CV.
        kfold = StratifiedKFold(n_splits=5, shuffle=True)

        for outer in range(self.outer_loop):
            print("\n--------This is outer loop {}---------\n".format(outer + 1))
            # Run the outer loop from here
            for i, (train_o, test_o) in enumerate(kfold.split(X_minmax, y)):
                self.loop_indices.append((train_o, test_o))
                print("This is set {}".format(i + 1))
                X_train_o = X_minmax[train_o]
                y_train_o = y[train_o]
                X_test_o = X_minmax[test_o]
                y_test_o = y[test_o]
                X_train_transformed = copy.deepcopy(X_train_o)
                X_test_transformed = copy.deepcopy(X_test_o)

                # Run the inner loop from here
                for inner in range(self.inner_loop):
                    # If there are many features (>=110), keep at least 100;
                    # otherwise aim for 10 fewer than are currently present.
                    n_feat = min(100, X_train_transformed.shape[1] - 10)

                    # If few features remain (<20), still keep at least 10 so
                    # the loop can continue.
                    n_feat = max(10, n_feat)
                    list_temp_prev = list_temp
                    print("\n\t--------This is inner loop {}---------\n".format(inner + 1))
                    rfecv = RFECV(estimator=self.clf, step=1, min_features_to_select=n_feat, cv=kfold, scoring='accuracy')
                    # rfecv = xgb.XGBClassifier()

                    # Transform the datasets at each loop to keep track of reduced features
                    # rfecv.fit(X_train_transformed, y_train_o)
                    # X_train_transformed = rfecv.transform(X_train_transformed)
                    X_train_transformed = rfecv.fit_transform(X_train_transformed, y_train_o)
                    self.models.append(rfecv)
                    X_test_transformed = rfecv.transform(X_test_transformed)
                    # Shrink the full matrix as well, so subsequent outer-loop
                    # folds index into the reduced feature set.
                    X_minmax = rfecv.transform(X_minmax)
                    features = rfecv.n_features_
                    print("\tShape of transformed train dataset is: {}".format(X_train_transformed.shape))
                    print("\tOptimal no. of features are: {}".format(features))
                    ranking = rfecv.ranking_

                    # Update the feature list here
                    list_temp = self.updateFeatures(list_temp_prev, ranking)

                # Score after the inner loop: the test data has already been
                # reduced to the selected features, and rfecv.score re-applies
                # the selection itself, so invert the last transform first.
                X_temp = rfecv.inverse_transform(X_test_transformed)
                score = rfecv.score(X_temp, y_test_o)
                self.scores.append(score)
                print("Shape of transformed train dataset is: {}".format(X_train_transformed.shape))
                print("Shape of ranks is: {}\n\n".format(ranking.shape))

        # Report the average score after finishing the outer loop.
        print("After outer loop CV, mean score is: {}".format(np.mean(self.scores)))
        self.list = list_temp_prev
        self.ranking = ranking
        print(X_train_transformed.shape)
        print(X_test_transformed.shape)
        self.X_transformed = np.vstack((X_train_transformed, X_test_transformed))

        return self
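
The `updateFeatures` helper called above is not shown in this example. A hypothetical reconstruction of what it plausibly does, given how it is called (RFE assigns rank 1 to every kept feature, so the selected names are those ranked 1); this body is an assumption, not the original implementation:

    def updateFeatures(self, feature_list, ranking):
        # Hypothetical sketch: keep the names of features whose RFE rank is 1,
        # preserving their original order.
        return [name for name, rank in zip(feature_list, ranking) if rank == 1]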
    f"acc_train = {acc_train:.3f}; f1score_train = {f1score_train}\nacc_validation = {acc_validation:.8f}; f1score_validaton = {f1score_validation}"
)

#%% ============================ Final test ============================
# Best to use an external test set
pred_test_label = model.predict(feature_test_)
pred_test_prob = model.decision_function(feature_test_)
acc_test = metrics.accuracy_score(label_test, pred_test_label)
f1score_test = metrics.f1_score(label_test, pred_test_label)
print(f"acc_test = {acc_test:.8f}; f1score_test = {f1score_test}\n")

#%% ============================ Visualize the results ============================
# Get the model weights
wei = model.coef_
wei = (wei - wei.mean()) / wei.std()   # z-score the weights
wei = selector.inverse_transform(wei)  # undo feature selection
wei = pca.inverse_transform(wei)       # undo PCA, back to edge space
weight = np.zeros(mask.shape)
weight[mask] = wei[0]                  # fill the masked (triangular) entries
weight = weight + weight.T             # symmetrize the connectivity matrix

# Keep only the top 0.2% of weights (by absolute value)
threshold = 99.8
topperc = np.percentile(np.abs(weight), threshold)
weight[np.abs(weight) < topperc] = 0

# Get the MNI coordinates
coords_file = r"F:\workshop\demo_data\BNA_subregions.xlsx"
coords_info = pd.read_excel(coords_file)

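
The script breaks off after loading the atlas coordinates. As a sketch of a plausible continuation (not part of the original): with MNI coordinates per region, the thresholded weight matrix can be drawn as a connectome, e.g. with nilearn. The 'x', 'y', 'z' column names below are assumptions about BNA_subregions.xlsx:

from nilearn import plotting

# Assumption: the spreadsheet has 'x', 'y', 'z' columns of MNI coordinates,
# one row per region, ordered like the rows of `weight`.
coords = coords_info[['x', 'y', 'z']].values

# Draw the surviving (top 0.2%) edges on a glass brain.
plotting.plot_connectome(weight, coords, edge_threshold=topperc, node_size=10)
plotting.show()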