Example No. 1
        def get_cascaded_sel_idx(high_th_year,
                                 low_th_year,
                                 feature_list,
                                 set_feature,
                                 sel_feature_num,
                                 div_ratio=4):
            high_risk_th = high_th_year * 365
            low_risk_th = low_th_year * 365
            high_risk_group, low_risk_group = helper.get_risk_group(
                x, c, s, high_risk_th, low_risk_th)
            #trn_x, trn_y, val_x, val_y = get_train_val(high_risk_group, low_risk_group)
            trn_x, trn_y = helper.get_train(
                high_risk_group,
                low_risk_group,
                is_categori_y=False,
                seed=self.random_seed)  #without validation set
            if len(set_feature):
                trn_x = trn_x[:, set_feature]
                #val_x = val_x[:,set_feature]
            feature_num = trn_x.shape[1]

            if sel_feature_num == 0:
                # by default, keep feature_num / div_ratio features
                sel_gene_num = int(
                    max(sel_feature_num, feature_num / div_ratio))
            else:
                sel_gene_num = sel_feature_num

            sort_idx = fisher_score.fisher_score(trn_x, trn_y, mode='index')
            sel_idx = sort_idx[:sel_gene_num]

            return sel_idx
Example No. 2
def fisherProc(X,y):
	# obtain the score of each feature on the training set
	score = fisher_score.fisher_score(X, y)

	# rank features in descending order according to score
	idx = fisher_score.feature_ranking(score)
	return idx
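The two-call pattern above — fisher_score to score each feature, then feature_ranking to order them — is the core of nearly every example on this page. A minimal self-contained sketch of that pattern, assuming scikit-feature is installed and using synthetic data (note that some scikit-feature versions instead accept mode='index' and return the ranked indices directly, as in Examples 1, 6 and 9):

import numpy as np
from skfeature.function.similarity_based import fisher_score

# synthetic two-class data: 100 samples, 5 features
rng = np.random.RandomState(0)
y = np.repeat([0, 1], 50)
X = rng.randn(100, 5)
X[:, 0] += 3 * y  # make feature 0 strongly discriminative

score = fisher_score.fisher_score(X, y)    # one score per feature
idx = fisher_score.feature_ranking(score)  # feature indices, best first
print(idx[0])  # expected: 0, the discriminative feature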
Example No. 3
def fisher_score_ranking(num_features=None):
    # NOTE: For BigArt. Fisher Score ranking of all features. These features
    # are passed on to LGBM in best model.

    n = 10
    # Five vars: (15, 3.5)
    figsize = (15, 7)
    show = False

    path_to_figure = '../feature_importances/fisher_score_ranking.pdf'
    path_to_scores = './../../data_source/results/feature_importance/fisher_feat_ranks.npy'
    path_to_features = '../../data_source/to_analysis/compressed_features/all_features_orig_images_icc_dropped.csv'
    path_to_target = '../../data_source/to_analysis/target_dfs.csv'
    path_to_clinical = '../../data_source/to_analysis/clinical_params.csv'

    X = pd.read_csv(path_to_features, index_col=0)
    y = pd.read_csv(path_to_target, index_col=0)
    y = np.squeeze(y.values)

    clinical = pd.read_csv(path_to_clinical, index_col=0)

    scaler = StandardScaler()
    X_std = scaler.fit_transform(X)

    scores = fisher_score(X_std, y)
    np.save(path_to_scores, scores)

    ranks = extract_ranks(scores, X.columns, clinical)
    ranks.sort_values('ranks', ascending=False, inplace=True)

    plt.figure(figsize=figsize)
    plt.xlabel('Fisher Score')
    plot_feature_ranking(ranks, n=n, show=show, path_to_figure=path_to_figure)
Example No. 4
    def fisher_feature_reduction(self, down, up):

        # the importance of each attribute is measured and listed
        score = fisher_score.fisher_score(self.class_train.values[down:up, :-1], self.class_train.iloc[down:up, -1])

        # attributes and their importance values are accumulated into a dictionary
        self.to_dict(score)

        return score
Example No. 5
def fisher(data):
    rank = []
    for i in range(6):  # data holds six datasets; the last column of each is the label
        X = data[i][:, :-1]
        Y = data[i][:, -1]
        score = fisher_score.fisher_score(X, Y)
        idx1 = fisher_score.feature_ranking(score)
        idx = samp(idx1.tolist())
        rank.append(idx)
    R = rankaggregate(rank)
    return R
Example No. 6
def fisher():
    before = datetime.datetime.now()
    result = fisher_score.fisher_score(data, labels, mode="index")
    after = datetime.datetime.now()
    print("Fisher")
    result = result[:treshold]
    print(len(result))
    print("cas: " + str(after - before))
    print('\n')
    if len(result) < len(header):
        transform_and_save(result, "Fisher")
Example No. 7
def seleciona_caracteristicas(vetor_caracteristicas, classes):
    caracteristicas_selecionadas = []
    rank_considerado = []  # initialized so the return below cannot hit an undefined name
    limiar_consideracao = 0

    score = fisher_score.fisher_score(vetor_caracteristicas, classes)
    rank = fisher_score.feature_ranking(score)
    features_consideradas = conta_features_limiar(score, limiar_consideracao)
    if features_consideradas > 1:
        rank_considerado = rank[:features_consideradas]
        caracteristicas_selecionadas = vetor_caracteristicas[:, rank_considerado]

    return caracteristicas_selecionadas, rank_considerado
Example No. 9
def get_sel_idx(high_th_year, low_th_year, feature_list,
                sel_feature_num):
    high_risk_th = high_th_year * 365
    low_risk_th = low_th_year * 365
    high_risk_group, low_risk_group = helper.get_risk_group(
        x, c, s, high_risk_th, low_risk_th)
    trn_x, trn_y = helper.get_train(
        high_risk_group,
        low_risk_group,
        is_categori_y=False,
        seed=self.random_seed)  # without a validation set
    sort_idx = fisher_score.fisher_score(trn_x, trn_y, mode='index')
    # sort_idx = f_score.f_score(trn_x, trn_y, mode='index')
    return sort_idx[:sel_feature_num]
Example No. 10
    def get_fisher_scores(self, max_dim):
        """ Получить меру Фишера и качество распознавания на основе AUC ROC.

        Выполняется отбор признаков для размерностей пространства признаков от 1 до max_dim. Для каждой размерности
        выполняется перекрестная проверка (cross-validation) и вычисляется интегральное значение меры Фишера и
        среднее по всем подвыборкам значение меры AUC ROC.

        Args:

            max_dim(int): число признаков до которого следует производить отбор.

        Returns:

            fisher_summary_scores: - вычисленные суммарные значения меры Фишера.
            auc_roc_scores: - вычисленные значения площади под кривой ROC.

        """

        x_train = scale(self.features)  # normalize features
        y_train = self.targets  # target ids
        # Fisher score estimation
        f_score = fisher_score.fisher_score(
            x_train, y_train)  # calculate Fisher score value
        ranked_f_score = fisher_score.feature_ranking(f_score)  # rank features
        print('Sequence of selected coefficients:')
        print(*list(self.feature_header[ranked_f_score[0:max_dim]]), sep=', ')
        fisher_summary_scores = list(
            it.accumulate(
                f_score[ranked_f_score[0:max_dim]]))  # cumulative Fisher scores
        # Cross validation
        k_fold = KFold(n_splits=5,
                       shuffle=True)  # setup cross-validation pattern
        ar_scorer = make_scorer(roc_auc_score)  # make scorer
        clf = SGDRegressor(max_iter=100, tol=1e-3, random_state=241
                           )  # stochastic gradient descent regression as a clf
        auc_roc_scores = []  # list for AUC ROC values
        for i in range(1,
                       max_dim + 1):  # iterate by number of features selected
            features = x_train[:, ranked_f_score[0:i]]  # select features
            t = y_train
            vect_auc_roc_score = cross_val_score(clf,
                                                 features,
                                                 t,
                                                 scoring=ar_scorer,
                                                 cv=k_fold)  # train
            auc_roc_scores.append(np.mean(vect_auc_roc_score)
                                  )  # add mean (over CV-subsets) AUC ROC value

        return fisher_summary_scores, auc_roc_scores
Example No. 11
def run_fold(trial, P, X, y, method, dataset, parttype):
    print('Obtaining features for %s %s %s fold: %2d' % (parttype, method, dataset, trial))
    n_samples, n_features = X.shape
    train = P[:,trial] == 1
    trnX = X[train]
    trnY = y[train]

    start_time = time.time()
    if method == 'fisher': 
        score = fisher_score.fisher_score(trnX,trnY)
        features = fisher_score.feature_ranking(score)
    elif method == 'chi2':
        score = chi_square.chi_square(trnX,trnY)
        features = chi_square.feature_ranking(score)
    elif method == 'relieff':
        score = reliefF.reliefF(trnX,trnY)
        features = reliefF.feature_ranking(score)
    elif method == 'jmi':
        features = JMI.jmi(trnX,trnY,  n_selected_features=n_features)
    elif method == 'mrmr':
        features = MRMR.mrmr(trnX,trnY,n_selected_features=n_features)
    elif method == 'infogain':
        features = MIM.mim(trnX,trnY,n_selected_features=n_features)
    elif method == 'svmrfe':
        features = svmrfe(trnX,trnY)
    elif method == 'hdmr':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':3,'M':1000,'b':'L'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    elif method == 'hdmrhaar':
        sobol_set_all = scipy.io.loadmat('sobol_set.mat')
        sobol_set     = sobol_set_all['sobol_set']
        sobol_set     = sobol_set.astype(float)
        params = {'sobol_set':sobol_set,'k':1,'p':255,'M':1000,'b':'H'}
        models  = hdmrlearn(trnX,trnY,params)
        features,w = hdmrselect(X,models)
    else:
        print(method + ' does not exist')

    cputime = time.time() - start_time
    print(features)
    print('cputime %f' % cputime)
    return {'features': features, 'cputime': cputime}
Example No. 12
def get_fisher_score(data, label, k=30):
    score = fisher_score.fisher_score(data, label)
    #print(score)
    ranking = fisher_score.feature_ranking(score)
    #print(idx)
    
    
    dfscores = pd.DataFrame(score)
    dfcolumns = pd.DataFrame(data.columns)
    #df_rank =pd.DataFrame(idx)
    
    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
    featureScores.columns = ['Feature','Score']  #naming the dataframe columns
    #print(featureScores.nlargest(k,'Score'))  # print the k best features
    result = featureScores.nlargest(k,'Score')
    
    return result, ranking
Example No. 13
def weight():
#    x_train, datamat, y_train,labelmat = cross_validation.train_test_split(comtest.iloc[0:len(comtest),1:comtest.shape[1]-1],comtest.iloc[0:len(comtest),-1], test_size = 0.2,random_state = j)
#    datamat=np.array(datamat,dtype=float)
#    labelmat=np.array(labelmat,dtype=int)
    datamat = np.array(comtest.iloc[0:len(comtest), 1:comtest.shape[1] - 1], dtype=float)  # extract the case data and their labels
    labelmat = np.array(comtest.iloc[0:len(comtest), -1], dtype=int)
    datamat = preprocess(datamat)
    for i in range(len(labelmat)):
        if labelmat[i] == 0:
            labelmat[i] = -1  # AdaBoost can only distinguish the labels -1 and +1

    Relief = reliefF.reliefF(datamat, labelmat)  # feature weights under ReliefF
    print('Relief, validation run %s' % 1)
    Fisher = fisher_score.fisher_score(datamat, labelmat)  # feature weights under Fisher score
    print('Fisher, validation run %s' % 1)
    gini = gini_index.gini_index(datamat, labelmat)  # feature weights under the Gini index
    gini = -gini
    print('gini, validation run %s' % 1)
    print("done")
    return Relief, Fisher, gini
Example No. 14
def naiveBayes(processed_train_features, processed_valid_features,
               train_labels, valid_labels, processed_test_features,
               test_labels):
    model1 = GaussianNB()
    model1.fit(processed_train_features, train_labels)
    naive_bayes_predict_train = model1.predict(processed_train_features)
    naive_bayes_predict_valid = model1.predict(processed_valid_features)
    #print("Naive Bayes Training accuracy ",accuracy_score(train_labels, naive_bayes_predict_train))
    print("Naive Bayes Valid accuracy ",
          accuracy_score(valid_labels, naive_bayes_predict_valid))
    naive_bayes_predict_train_before_fisher = model1.predict(
        processed_test_features)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_train_before_fisher))
    XFisher = processed_test_features.to_numpy()
    score = fs.fisher_score(XFisher, test_labels)
    ranked_features = fs.feature_ranking(score)
    topFeatures = ranked_features[:50]
    print(topFeatures)
    print(score.shape)
    print(XFisher.shape)
    intersection_cols = topFeatures
    colnamelist = []
    for i in topFeatures:
        colname = processed_train_features.columns[i]
        colnamelist.append(colname)
    test = processed_test_features.copy()
    valid_for_bayes = processed_valid_features.copy()
    size = 188
    test.drop(test.columns.difference(colnamelist), axis=1, inplace=True)
    valid_for_bayes.drop(valid_for_bayes.columns.difference(colnamelist),
                         axis=1,
                         inplace=True)
    model = GaussianNB()
    model.fit(test, test_labels)
    naive_bayes_predict_train_after_fisher = model.predict(test)
    print("Naive Bayes Testing accuracy ",
          accuracy_score(test_labels, naive_bayes_predict_train_after_fisher))
    naive_bayes_predict_valid_after_fisher = model.predict(valid_for_bayes)
    print("Naive Bayes Validation accuracy",
          accuracy_score(valid_labels, naive_bayes_predict_valid_after_fisher))
Example No. 15
def main():
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"]  # data
    X = X.astype(float)
    y = mat["Y"]  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features

    # split data into 10 folds (legacy sklearn.cross_validation API;
    # modern versions use model_selection.KFold(n_splits=10).split(X))
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)

    # perform evaluation on classification task
    num_fea = 100  # number of selected features
    clf = svm.LinearSVC()  # linear SVM

    correct = 0
    for train, test in ss:
        # obtain the score of each feature on the training set
        score = fisher_score.fisher_score(X[train], y[train])

        # rank features in descending order according to score
        idx = fisher_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])

        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 10 folds
    print "Accuracy:", float(correct) / 10
Example No. 17
    def rank_features_using_fisherscore(cls,
                                        data_frame,
                                        target_key,
                                        cols_to_ignore=None):
        keys = list(data_frame.keys())
        target_col_idx = keys.index(target_key)

        # Removing the target column from keys
        del keys[target_col_idx]

        # Remove all columns that are asked to be ignored
        if cols_to_ignore is not None:
            for col in cols_to_ignore:
                idx = keys.index(col)
                del keys[idx]

        Y = data_frame.loc[:, target_key].values
        X = data_frame.loc[:, keys].values

        score = fisher_score.fisher_score(X, Y)
        rank = fisher_score.feature_ranking(score)
        ranked_features = [keys[i] for i in rank]
        return score, ranked_features, keys
Example No. 18
def fischer_score_featureSelection(x, y):
    score = fisher_score.fisher_score(x, y)
    rank = score_to_rank(score)
    return rank
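score_to_rank here is a helper local to that project. A plausible stand-in (hypothetical, not part of skfeature) that mirrors fisher_score.feature_ranking:

import numpy as np

def score_to_rank(score):
    # hypothetical stand-in: feature indices ordered by descending score
    return np.argsort(score)[::-1]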
Example No. 19
def fisher_score_FS(X_train, y_train):
    score = fisher_score.fisher_score(X_train, y_train)
    idx = fisher_score.feature_ranking(score)
    return (idx, score)
Example No. 20
    scores4 = np.append(
        scores4, abs(pearsonscore[0])
    )  #absolute value because -1 or +1 represent perfect correlation

g1 = lambda e: e[1]
g10 = lambda e: e[1][0]
# double sort: R4[i] is the rank of feature i when scores are sorted descending
R4, _ = zip(*sorted(enumerate(sorted(enumerate(-scores4), key=g1)), key=g10))

#print scores4
formatted_scores4 = ['%.2f' % elem for elem in scores4]
print(formatted_scores4)
print(R4)

# ------------------------ Fisher Score ------------------------
print("Fisher Score:")
scores5 = fisher_score.fisher_score(X, y)

g1 = lambda e: e[1]
g10 = lambda e: e[1][0]
R5, _ = zip(*sorted(enumerate(sorted(enumerate(-scores5), key=g1)), key=g10))

#print scores5
formatted_scores5 = ['%.2f' % elem for elem in scores5]
print(formatted_scores5)
print(R5)

# ------------------------ Relief-F ------------------------
print("Relief-F:")
scores6 = reliefF.reliefF(X, y)

g1 = lambda e: e[1]
Example No. 21
    def fit(self, X, y):

        idx = []

        if self.tp == 'ITB':

            if self.name == 'MRMR':
                idx = MRMR.mrmr(X,
                                y,
                                n_selected_features=self.params['num_feats'])

        elif self.tp == 'filter':

            if self.name == 'Relief':
                score = reliefF.reliefF(X, y, k=self.params['k'])
                idx = reliefF.feature_ranking(score)

            if self.name == 'Fisher':
                # obtain the score of each feature on the training set
                score = fisher_score.fisher_score(X, y)

                # rank features in descending order according to score
                idx = fisher_score.feature_ranking(score)

            if self.name == 'MI':
                idx = np.argsort(
                    mutual_info_classif(
                        X, y, n_neighbors=self.params['n_neighbors']))[::-1]

        elif self.tp == 'wrapper':

            model_fit = self.model.fit(X, y)
            model = SelectFromModel(model_fit, prefit=True)
            idx = model.get_support(indices=True)
        elif self.tp == 'SLB':

            # one-hot encode the target
            y = construct_label_matrix(y)

            if self.name == 'SMBA':
                scba = fs.SCBA(data=X,
                               alpha=self.params['alpha'],
                               norm_type=self.params['norm_type'],
                               verbose=self.params['verbose'],
                               thr=self.params['thr'],
                               max_iter=self.params['max_iter'],
                               affine=self.params['affine'],
                               normalize=self.params['normalize'],
                               step=self.params['step'],
                               PCA=self.params['PCA'],
                               GPU=self.params['GPU'],
                               device=self.params['device'])

                nrmInd, sInd, repInd, _ = scba.admm()
                if self.params['type_indices'] == 'nrmInd':
                    idx = nrmInd
                elif self.params['type_indices'] == 'repInd':
                    idx = repInd
                else:
                    idx = sInd

            if self.name == 'RFS':
                W = RFS.rfs(X, y, gamma=self.params['gamma'])
                idx = feature_ranking(W)

            if self.name == 'll_l21':
                # obtain the feature weight matrix
                W, _, _ = ll_l21.proximal_gradient_descent(X,
                                                           y,
                                                           z=self.params['z'],
                                                           verbose=False)
                # rank features in descending order of the norms of their weight vectors
                idx = feature_ranking(W)
            if self.name == 'ls_l21':
                # obtain the feature weight matrix
                W, _, _ = ls_l21.proximal_gradient_descent(X,
                                                           y,
                                                           z=self.params['z'],
                                                           verbose=False)

                # rank features in descending order of the norms of their weight vectors
                idx = feature_ranking(W)

            if self.name == 'LASSO':

                LASSO = Lasso(alpha=self.params['alpha'], positive=True)

                y_pred_lasso = LASSO.fit(X, y)

                if y_pred_lasso.coef_.ndim == 1:
                    coeff = y_pred_lasso.coef_
                else:
                    coeff = np.asarray(y_pred_lasso.coef_[0, :])

                idx = np.argsort(-coeff)

            if self.name == 'EN':  # elastic net (l1_ratio=1, i.e. a pure L1 penalty)

                enet = ElasticNet(alpha=self.params['alpha'],
                                  l1_ratio=1,
                                  positive=True)
                y_pred_enet = enet.fit(X, y)

                if y_pred_enet.coef_.ndim == 1:
                    coeff = y_pred_enet.coef_
                else:
                    coeff = np.asarray(y_pred_enet.coef_[0, :])

                idx = np.argsort(-coeff)

        return idx
Example No. 22
def fisher_score_selection(X, y):
    # Wrapping skfeature Fisher score.
    scores = fisher_score(X, y)
    return np.argsort(scores, 0)[::-1]
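The descending argsort above reproduces what the library's own helper computes; a quick equivalence check, assuming a scikit-feature version that still ships feature_ranking:

import numpy as np
from skfeature.function.similarity_based import fisher_score as fs

scores = np.array([0.2, 1.5, 0.7])
print(np.argsort(scores, 0)[::-1])  # [1 2 0]
print(fs.feature_ranking(scores))   # [1 2 0], the same ordering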
Example No. 23
                 tprs_upper,
                 color='grey',
                 alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC in Cervical Cancer Data Before FS')
plt.legend(loc="lower right")
plt.show()

#feature selection (note: only the Fisher scores from the last CV fold are kept)
for train, test in cv.split(X, y):
    score = fisher_score.fisher_score(X[train], y[train])
#  probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
# Compute ROC curve and area the curve

print(len(score))
idx = fisher_score.feature_ranking(score)
#print(idx)
num_fea = 6

#Have to explain why the machine pick up those and do the classification again
#X1 = ad[['NEK6','SLC2A4','SLC2A5','SUV_C34', 'SUVreduction']]
#data.iloc[[0,3,6,24], [0,5,6]]
X = pd.DataFrame(X)
X1 = X.iloc[:, [
    idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6], idx[7], idx[8],
    idx[9], idx[10], idx[11]
]]
Example No. 24
mean_pos=np.mean(positive_feaure,axis=0)  # mean of each feature over the positive class
mean_neg=np.mean(negtive_feature,axis=0)  # mean of each feature over the negative class
std_pos=np.std(positive_feaure,ddof=1,axis=0)  # std of each feature over the positive class
std_neg=np.std(negtive_feature,ddof=1,axis=0)  # std of each feature over the negative class
F_up=np.square(mean_pos-mean_feature)+np.square(mean_neg-mean_feature)
F_down=np.square(std_pos)+np.square(std_neg)
F_score=F_up/F_down
"""
#------------calculate the FS score with scikit-feature package--------------#
from skfeature.function.similarity_based import fisher_score
from skfeature.function.information_theoretical_based import MRMR
from skfeature.function.similarity_based import reliefF
from skfeature.function.statistical_based import gini_index

Relief = reliefF.reliefF(datamat, labelmat)
Fisher= fisher_score.fisher_score(datamat, labelmat)
# mRMR,J,M,=MRMR.mrmr(datamat,labelmat,n_selected_features=80)
# mRMR=-mRMR
gini= gini_index.gini_index(datamat,labelmat)
gini=-gini
FSscore=np.column_stack((Relief,Fisher,gini))  # stack the three score vectors

FSscore=ann.preprocess(FSscore)
FinalScore=np.sum(FSscore,axis=1)
FS=np.column_stack((FSscore,FinalScore))
FS_nor=ann.preprocess(FS)  # normalize the combined score in the last column
FS=pd.DataFrame(FS_nor,columns=["Relief", "Fisher","gini","FinalScore"],index=featurenames)
# FS.to_csv("F:\Githubcode\AdaBoost\myown\FSscore.csv")


sorteigen=FS.sort_values(by='FinalScore',ascending=False,axis=0)
Example No. 25
            "metric": "euclidean",
            "neighbor_mode": "knn",
            "weight_mode": "heat_kernel",
            "k": 5,
            't': 1
        }
        W = construct_W.construct_W(X_train, **kwargs_W)
        score = lap_score.lap_score(X_train, W=W)
        idx = lap_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # fisher_score
        score = fisher_score.fisher_score(X_train, y_train)
        idx = fisher_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # reliefF
        score = reliefF.reliefF(X_train, y_train)
        idx = reliefF.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # chi_square
Example No. 26
    probas_ = clf.predict_proba(X[test])
    # Compute ROC curve and area the curve
    fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
    tprs.append(interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    acc = clf.score(X[test], y[test])
    accs.append(acc)
    i += 1

print(sum(aucs) / float(len(aucs)))
print(sum(accs) / float(len(accs)))

#Fisher score
score = fisher_score.fisher_score(X, y)
#print(len(score))
idx = fisher_score.feature_ranking(score)
#print(idx)
num_fea = 6
X_resampled = pd.DataFrame(X_resampled)
X1 = X_resampled.iloc[:, [
    idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6], idx[7], idx[8],
    idx[9], idx[10], idx[11]
]]

#X1 = X.iloc[:, [idx[0], idx[1], idx[2], idx[3], idx[4]]]
X1 = pd.DataFrame(X1)
#print("Selected features {}".format(X1.columns.values))

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1,
Example No. 27
def fun_classify(inputFile, groupsSel, FeatSelect, Nfeats, scaleFeats=1):
    """
    AllStatsMean, AllStatsSTD = fun_classify(inputFile, groupsSel, FeatSelect, Nfeats)
    inputFile: the .csv file containt feature tables
    groups: The selected groups to classify. Full set is ["S","F","Z","N","O"],
    but ["S","F","Z"] are of most interest for the article (ictal, inter-ictal and normal EEG)
    FeatSelect: feature selection method: PCA, RFE, fisher or none
    Nfeats: number of selected features
    Returns:
    AllStatsMean: mean performance values
    AllStatsSTD: standard deviation of performance values  
    """
    #reads input features
    dfFeats = pd.read_csv(inputFile, sep=',', header=0)

    #only selected groups
    dfFeats = dfFeats[dfFeats["Group"].isin(groupsSel)]
    if "decTaime" in dfFeats:
        x = dfFeats.iloc[:, 2:]  #ignores decomposition method execution time
    else:
        x = dfFeats.iloc[:, 1:]
    y = dfFeats.iloc[:, 0].values
    if scaleFeats:  #scale feats?
        x = StandardScaler().fit_transform(x)
    #Feature selection
    if x.shape[1] > Nfeats:
        #RFE
        if FeatSelect == "RFE":
            rfeModel = SVC(kernel="linear",
                           C=0.025,
                           probability=True,
                           gamma='scale')
            rfeSelect = RFE(rfeModel, n_features_to_select=Nfeats)
            rfe_fit = rfeSelect.fit(x, y)
            x = x[:, rfe_fit.support_]

        if FeatSelect == "PCA":
            pca = PCA(n_components=Nfeats)
            x = pca.fit_transform(x)

        if FeatSelect == "fisher":
            fisherScore = fisher_score.fisher_score(x, y)
            idx = fisher_score.feature_ranking(fisherScore)
            x = x[:, idx[:Nfeats]]

    names = ["KNN", "Linear SVM", "RBF SVM", "GPC", "MLP"]

    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025, probability=True, gamma='scale'),
        SVC(probability=True, gamma='scale'),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        MLPClassifier(alpha=1, max_iter=200)
    ]

    #initialize performance variable
    AllStats = {}
    AllStatsMean = {}
    AllStatsSTD = {}

    for name in names:
        AllStats[name] = {
            "Accuracy": np.zeros([realizations, K_folds]),
            "SensitivityMean": np.zeros([realizations, K_folds]),
            "SpecificityMean": np.zeros([realizations, K_folds]),
            "AUC_Mean": np.zeros([realizations, K_folds]),
            "SensitivityIctal": np.zeros([realizations, K_folds]),
            "SpecificityIctal": np.zeros([realizations, K_folds]),
            "AUC_Ictal": np.zeros([realizations, K_folds]),
            "TTtimes": np.zeros([realizations, K_folds])
        }
        AllStatsMean[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }
        AllStatsSTD[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }
        #for each realization
    for i in range(realizations):
        skf = StratifiedKFold(n_splits=K_folds,
                              shuffle=True)  #5-fold validation

        for tupTemp, ki in zip(skf.split(x, y), range(K_folds)):
            train_idx, test_idx = tupTemp[0], tupTemp[1]
            X_train, X_test = x[train_idx], x[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            for name, clf in zip(names, classifiers):  #for each classifier
                tic = time.time(
                )  #check training/testing time of each classifier
                #Fit model and predict
                modelFit = clf.fit(X_train, y_train)
                yPredicted = modelFit.predict(X_test)
                probsTest = modelFit.predict_proba(X_test)
                toc = time.time()
                # AUC -  #ictal class as positive
                if len(np.unique(y)) > 2:
                    AUCs = roc_auc_score(
                        LabelBinarizer().fit_transform(y_test),
                        probsTest,
                        average=None)
                else:
                    AUCs = roc_auc_score(y_test, probsTest[:, 1], average=None)
                #Sensitivity and Specificity
                cMatrix = confusion_matrix(y_test, yPredicted)
                FP = cMatrix.sum(axis=0) - np.diag(cMatrix)
                FN = cMatrix.sum(axis=1) - np.diag(cMatrix)
                TP = np.diag(cMatrix)
                TN = cMatrix.sum() - (FP + FN + TP)
                # Sensitivity
                TPR = TP / (TP + FN)
                # Specificity or true negative rate
                TNR = TN / (TN + FP)
                #fill performance variable
                AllStats[name]["Accuracy"][i, ki] = accuracy_score(
                    y_test, yPredicted)
                AllStats[name]["SensitivityMean"][i, ki] = np.mean(TPR)
                AllStats[name]["SpecificityMean"][i, ki] = np.mean(TNR)
                AllStats[name]["SensitivityIctal"][i, ki] = TPR[0]
                AllStats[name]["SpecificityIctal"][i, ki] = TNR[0]
                AllStats[name]["AUC_Mean"][i, ki] = np.mean(AUCs)
                AllStats[name]["TTtimes"][i, ki] = toc - tic
                if len(np.unique(y)) > 2:
                    AllStats[name]["AUC_Ictal"][i, ki] = AUCs[0]
    AllStatsDF = [0] * len(names)
    for idx, name in enumerate(names):
        for istat in AllStats[name].keys():
            AllStats[name][istat] = np.mean(AllStats[name][istat], axis=1)
            AllStatsMean[name][istat] = np.mean(AllStats[name][istat])
            AllStatsSTD[name][istat] = np.std(AllStats[name][istat])
        AllStatsDF[idx] = pd.DataFrame.from_dict(AllStats[name])
        AllStatsDF[idx]["Nmodes"] = Nmodes
        AllStatsDF[idx]["Classifier"] = name

    return pd.DataFrame.from_dict(AllStatsMean), pd.DataFrame.from_dict(
        AllStatsSTD), pd.concat(AllStatsDF)
Example No. 28
    #ReliefF
    score_rel = reliefF.reliefF(X_train, y_train)
    idx_rel = reliefF.feature_ranking(score_rel)
    #Laplacian score
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "k": 7,
        't': 1,
        'reliefF': True
    }
    W = construct_W.construct_W(X_train, **kwargs_W)
    score_lap = lap_score.lap_score(X_train, W=W)
    idx_lap = lap_score.feature_ranking(score_lap)
    #Fisher
    score_fish = fisher_score.fisher_score(X_train, y_train)
    print(score_fish)
    idx_fish = fisher_score.feature_ranking(score_fish)
    ###################################### Feature Integration
    idxM = idx_rel[:threshold]
    idxN = idx_lap[:threshold]
    idxO = idx_fish[:threshold]

    if combination_method == 1:
        #AND
        idx_and = reduce(np.intersect1d, (idxO, idxM, idxN))
        idx = idx_and
        print("number of selectes features (bins) = ", idx.shape[0])

    if combination_method == 2:
        #OR
Example No. 29
# labels = train_df['TARGET'].values
# data = train_df[feats].as_matrix()  # only use training data for feature selection
# klass = DiscreteMrmr
# num_features = 50
# targets = labels.astype(bool)
# variables = data.astype(float)
# nrow, ncol = variables.shape
# selector = klass(num_features, klass.MID, THRESHOLD)
#    
# # b = time.time()
# ui = None
# maxrel, mrmr = selector._mrmr_selection(num_features, klass.MID, variables, targets, threshold=THRESHOLD, ui=ui)

cut_X = pd.qcut(train_X, 20, labels=False, retbins=True)
from skfeature.function.similarity_based import fisher_score
score = fisher_score.fisher_score(train_X, train_Y)

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

bestFeat = SelectKBest()
bestFeat.fit(train_X, train_Y)
feat_scr = zip(feats, bestFeat.scores_)
feat_scr = [f for f in feat_scr if not np.isnan(f[1])]
sorted_feats = sorted(feat_scr, key=lambda k: k[1], reverse=True)

# estimator = SVR(kernel="linear")
# selector = RFE(estimator, 5, step=1)
# selector.fit(train_X, train_Y)  # slow
Example No. 30
def Fisher_Score(self):
    score = fisher_score.fisher_score(X_train, y_train)
    idx = fisher_score.feature_ranking(score)
Example No. 31
#print("Features before feature selection: {}".format(X.columns.values))

#Get classes
y_data = ad['Label']
y = pd.DataFrame(y_data)
y = y.values.ravel()

#Load the resampled data saved earlier as .npy
X_resampled = np.load('cervical_x.npy')
y_resampled = np.load('Cervical_y.npy')

X_resampled = pd.DataFrame(X_resampled)
X_resampled.columns = X.columns.values
cv = StratifiedKFold(n_splits=10)
for train, test in cv.split(X_resampled, y_resampled):
    score = fisher_score.fisher_score(X_resampled.iloc[train],
                                      y_resampled[train])

#print(score)
idx = fisher_score.feature_ranking(score)

#X1 = X_resampled.iloc[:, [idx[0], idx[1], idx[2], idx[3], idx[4], idx[5], idx[6], idx[7], idx[8], idx[9], idx[10], idx[11]]]

X1 = X_resampled.iloc[:, idx[0:11]]

#print(X_resampled.columns.values)

X_resampled = X1
print("Selected Features in Fisher{}".format(X_resampled.columns))
#Cross Validation

#Decision Tree
Example No. 32
def my_fisher_score(X, y):
    # deep-copy X so the caller's array is never modified; flatten y in case it is a column vector
    return fisher_score(copy.deepcopy(X), y.flatten())