コード例 #1
0
    def rfecv_kfold(X_train, y_train, sk_model, out_dir, scoring="accuracy"):
        """Run RFECV (cross-validated recursive feature elimination) and save results.

        Fits ``RFECV`` around ``sk_model`` with a shuffled 3-fold ``KFold``,
        writes the search curve to ``{out_dir}/plot_rfecv.png``, prints the
        selected and rejected column names, and stores the selected names
        (plus a trailing ``"y"`` entry) in ``{out_dir}/rfecv_select_cols.csv``.

        Args:
            X_train: Training features; must expose ``.columns`` because the
                support mask is applied to it.
            y_train: Training target passed to ``RFECV.fit``.
            sk_model: Estimator handed to ``RFECV``.
            out_dir: Directory prefix for both output files.
            scoring: Scoring argument forwarded to ``RFECV``.
        """
        def _plot_rfecv(selector):
            """Plot the CV score per evaluated feature count and save it."""
            # ``grid_scores_`` no longer exists in recent scikit-learn; prefer
            # ``cv_results_`` when the fitted selector provides it.
            cv_results = getattr(selector, "cv_results_", None)
            if cv_results is not None:
                scores = cv_results["mean_test_score"]
            else:
                scores = selector.grid_scores_

            # Use a dedicated figure so repeated calls do not draw on top of
            # whatever figure happens to be current, and release it afterwards.
            fig, ax = plt.subplots()
            ax.set_xlabel("Number of features selected")
            ax.set_ylabel(
                "Cross validation score (nb of correct classifications)")
            ax.plot(range(1, len(scores) + 1), scores)
            fig.savefig(f"{out_dir}/plot_rfecv.png")
            plt.close(fig)

        # RFECV = cross-validation + recursive feature elimination (start from
        # all features and remove them step by step). Original author's
        # warning: this can run out of memory on large data.
        selector = RFECV(sk_model,
                         cv=KFold(3, shuffle=True),
                         scoring=scoring,
                         n_jobs=-1)
        selector.fit(X_train, y_train)

        # Plot the search history.
        _plot_rfecv(selector)

        support = selector.get_support()

        # Selected features.
        select_cols = X_train.columns[support].to_list()
        print("\nselect_cols:\n", select_cols, len(select_cols))

        # Rejected features.
        print("not select_cols:\n", X_train.columns[~support].to_list())

        # Persist the selection; "y" is appended after printing so the saved
        # list carries one extra entry.
        select_cols.append("y")
        pd.DataFrame({
            "select_cols": select_cols
        }).to_csv(f"{out_dir}/rfecv_select_cols.csv", index=False)
コード例 #2
0
def select_features_univariate(X, y, method='Decision_Tree'):
    """Reduce a high-dimensional feature matrix with one of several selectors.

    Parameters
    ----------
    X : ndarray
        Repetitions by features.
    y : ndarray
        Vector of labels, one per repetition.
    method : string
        Selection strategy, matched case-insensitively:
        {'decision_tree', 'decision_tree_RFECV', 'mutual_information',
        'univariate_select'}.

    Returns
    -------
    X_transformed : ndarray
        Repetitions by features (reduced).
    weights : ndarray
        Continuous importances ('decision_tree', 'mutual_information') or a
        boolean support mask ('decision_tree_RFECV', 'univariate_select').

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies.
    """
    # The default is spelled 'Decision_Tree' while the branches use lower
    # case; compare case-insensitively so the default selects a branch.
    key = method.lower()

    if key == 'decision_tree_rfecv':
        trans = RFECV(DecisionTreeClassifier())
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()
    elif key == 'decision_tree':
        clf = DecisionTreeClassifier()
        clf.fit(X, y)
        importances = clf.feature_importances_
        # Keep features whose importance is above the average importance.
        X_transformed = X[:, importances > importances.mean(0)]
        weights = importances
    elif key == 'mutual_information':
        mutual_info = mutual_info_classif(X, y)
        # Keep features above the average mutual information.
        X_transformed = X[:, mutual_info > mutual_info.mean(0)]
        weights = mutual_info  # continuous
    elif key == 'univariate_select':
        # Score each feature by its mean value and keep the top 50 percent.
        trans = GenericUnivariateSelect(
            score_func=lambda data, _labels: data.mean(axis=0),
            mode='percentile',
            param=50)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  # binary
    else:
        raise ValueError(
            "Unknown feature selection method: {!r}".format(method))

    return X_transformed, weights
コード例 #3
0
def feature_selection(X, y, estimator, cv=5, n_jobs=2):
    """Run accuracy-scored RFECV and report what it selected.

    Args:
        X: Feature matrix passed to ``RFECV.fit``.
        y: Target passed to ``RFECV.fit``.
        estimator: Estimator wrapped by ``RFECV``.
        cv: Cross-validation argument forwarded to ``RFECV``.
        n_jobs: Parallelism argument forwarded to ``RFECV``.

    Returns:
        A tuple ``(features, mask, scores)``: the integer indices of the
        selected features, the boolean support mask, and the
        cross-validation scores of the evaluated feature subsets.
    """
    rfecv = RFECV(estimator=estimator,
                  step=1,
                  cv=cv,
                  scoring='accuracy',
                  n_jobs=n_jobs,
                  verbose=0)
    rfecv.fit(X, y)

    # ``grid_scores_`` is gone from recent scikit-learn releases; read the
    # mean test score from ``cv_results_`` when it is available.
    cv_results = getattr(rfecv, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = rfecv.grid_scores_

    return rfecv.get_support(True), rfecv.get_support(), scores
コード例 #4
0
def RFECV_filter(df: DataFrame,
                 y: Series,
                 col_list: List,
                 estimator: Any,
                 keep: float = 0.5,
                 step: int = 1,
                 cv: int = 5) -> List:
    """
    Recursive feature elimination with cross-validation (ROC-AUC scored).

    :param df: frame holding the candidate columns
    :param y: target passed to the selector
    :param col_list: names of the candidate columns in ``df``
    :param estimator: learner wrapped by ``RFECV``
    :param keep: minimum number of features to keep; a float below 1 is
        read as a fraction of ``len(col_list)`` (rounded up), an integer
        as an absolute count
    :param step: step size of each elimination round
    :param cv: number of cross-validation folds
    :return: list of the selected column names
    :raises ValueError: if ``keep`` is a float greater than or equal to 1
    """
    if isinstance(keep, float):
        if keep >= 1:
            raise ValueError('参数keep大于等于1时, 请输入整数')
        # np.ceil returns a numpy float; min_features_to_select needs an int.
        keep = int(np.ceil(len(col_list) * keep))

    selector = RFECV(estimator,
                     min_features_to_select=keep,
                     step=step,
                     cv=cv,
                     scoring='roc_auc',
                     n_jobs=-1)
    selector = selector.fit(df[col_list], y)
    mask = selector.get_support()

    return np.array(col_list)[mask].tolist()
コード例 #5
0
def find_best_features(df_train, y_train):
    """Flag highly correlated columns and return a cached RFECV support mask.

    The Pearson and Spearman correlation matrices are read from pickles in
    the working directory (rather than recomputed from ``df_train``) and
    averaged. Walking the rows in order, every column whose absolute
    averaged correlation with a still-kept row exceeds 0.90 is marked as
    dropped.

    Neither ``df_train`` nor ``y_train`` is read by the current code: the
    fitted selector is loaded from ``rfecv.pkl`` instead of being fitted.

    Returns:
        A tuple ``(res_cols, support)``: a list of booleans (``False`` for
        dropped columns) and the loaded selector's support mask.
    """
    rfr = RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=16)

    # NOTE: joblib.load unpickles its input; only use trusted files here.
    pearson = joblib.load('vals_pearson.pkl')
    spearman = joblib.load('vals_spearman.pkl')
    vals = (pearson + spearman) / 2

    n_rows = vals.shape[0]
    res_cols = [True] * n_rows
    for row in range(n_rows):
        # A row that was already dropped does not eliminate other columns.
        if not res_cols[row]:
            continue
        for col in range(vals.shape[1]):
            if row != col and abs(vals[row, col]) > 0.90:
                res_cols[col] = False

    # This selector is configured but never fitted; it is replaced by the
    # pickled one on the next line.
    rfecv = RFECV(
        rfr,
        step=10,  # Float step gives error on the end
        cv=5,
        scoring=rmse_scorer,
        verbose=2)
    rfecv = joblib.load('rfecv.pkl')

    return (res_cols, rfecv.get_support())
コード例 #6
0
def feature_selection(X, Y, outcome, method, imp_method, data_dir, verbose=0):
    """Reduce ``X`` with RFE, ElasticNet-based selection, or PCA, with caching.

    For ``'RFE'`` and ``'ElasticNet'`` the support mask is cached in
    ``feature_subset_{outcome}_{method}_{imp_method}.h5`` under ``data_dir``
    and reused when present. For ``'PCA'`` the fitted model is cached in
    ``pca_comp_{outcome}_{imp_method}.pkl``.

    Args:
        X: Feature matrix; indexed as ``X[:, subset]`` when a cached subset
            is loaded.
        Y: Target; an ``np.int8`` dtype selects the classification estimator
            for RFE, anything else the regression one.
        outcome: Label used in the cache file names.
        method: One of ``'RFE'``, ``'PCA'``, ``'ElasticNet'``.
        imp_method: Label used in the cache file names.
        data_dir: Directory holding the cache files.
        verbose: Verbosity; also forwarded to ``RFECV``.

    Returns:
        The reduced feature matrix.

    Raises:
        ValueError: If ``method`` is not supported.
    """
    if method not in ('RFE', 'PCA', 'ElasticNet'):
        raise ValueError("{} not supported.".format(method))

    is_classf = Y.dtype == np.int8
    feature_subset_path = os.path.join(
        data_dir, 'feature_subset_{}_{}_{}.h5'.format(outcome, method,
                                                      imp_method))

    # Only a freshly fitted selector is written to the cache at the end.
    selector = None

    if os.path.exists(feature_subset_path):
        if verbose:
            print("Feature subset already exists. Loading {}...".format(
                feature_subset_path))
        with h5py.File(feature_subset_path, 'r') as hf:
            subset = hf[method][:]
        X_refined = X[:, subset]
    elif method == 'RFE':
        base_estimator = LinearSVC() if is_classf else LinearSVR()
        selector = RFECV(base_estimator,
                         step=0.1,
                         cv=5,
                         n_jobs=-1,
                         verbose=verbose)
        X_refined = selector.fit_transform(X, Y)
    elif method == 'ElasticNet':
        selector = SelectFromModel(ElasticNetCV(cv=10, n_jobs=-1))
        X_refined = selector.fit_transform(X, Y)
    else:
        pca_path = os.path.join(
            data_dir, 'pca_comp_{}_{}.pkl'.format(outcome, imp_method))
        if os.path.exists(pca_path):
            print("PCA components already exist. Loading {}...".format(
                pca_path))
            # NOTE: joblib.load unpickles its input; only use trusted files.
            pca = joblib.load(pca_path)
            X_refined = pca.transform(X)
        else:
            var_thr = 0.99
            pca = PCA()
            pca.fit(X)
            cumulative = pca.explained_variance_ratio_.cumsum()
            # argmax gives the zero-based index of the first component at
            # which the cumulative variance exceeds the threshold, so the
            # number of components needed is that index plus one.
            n_components = int(np.argmax(cumulative > var_thr)) + 1
            if verbose:
                print("Number of selected features:", n_components)
            pca = PCA(n_components=n_components)
            X_refined = pca.fit_transform(X)
            joblib.dump(pca, pca_path)

    if selector is not None:
        with h5py.File(feature_subset_path, 'w') as hf:
            hf.create_dataset(method, data=selector.get_support())

    return X_refined
コード例 #7
0
def perform_feature_reduction(x, y):
    """
    Performs feature reduction on x using y as the target.

    For now, it uses linear SVR as estimator, and removes feature by feature.
    The feature names before and after, the ranking and the cross-validation
    scores are written through ``log``.

    :param x: feature values; must expose ``.columns`` with string names
        (they are joined with ``', '`` for logging)
    :param y: labels
    :return: x restricted to the selected columns
    """

    estimator = SVR(kernel="linear")
    selector = RFECV(estimator, step=1, cv=N_CV_FEATURE_REDUCTION)

    log("Features before reduction (total of {}): {}".format(
        len(x.columns.values), ', '.join(x.columns.values)))
    selector.fit(x, y)

    # Index through the column labels so the column names are kept.
    x = x[x.columns[selector.get_support(indices=True)]]

    # ``grid_scores_`` is gone from recent scikit-learn releases; read the
    # mean test score from ``cv_results_`` when it is available.
    cv_results = getattr(selector, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = selector.grid_scores_

    log("Features after reduction (total of {}): {}".format(
        len(x.columns.values), ', '.join(x.columns.values)))
    log("Feature ranking: {}".format(', '.join(
        str(e) for e in selector.ranking_)))
    log("Feature grid scores: {}".format(', '.join(
        str(e) for e in scores)))

    return x
コード例 #8
0
def sele_fea(X,y): # X is the data; y is the age
    """Select features with RFECV around a linear SVR and plot the CV curve.

    Features are removed one per iteration (``step=1``) and evaluated with
    2-fold cross-validation. The score curve is displayed with ``plt.show()``.

    Returns:
        A tuple ``(sel_fea, fea_num, sel_index)``: ``X`` reduced to the
        selected features, the number of selected features, and their
        integer indices.
    """
    estimator = SVR(kernel="linear")
    # step: number of features removed at each iteration
    # cv: number of cross-validation folds
    selector = RFECV(estimator, step=1, cv=2)
    selector = selector.fit(X, y)

    sel_fea = selector.transform(X)  # only the selected features
    fea_num = selector.n_features_
    sel_index = selector.get_support(True)
    print("Optimal number of features : %d" % selector.n_features_)

    # ``grid_scores_`` is gone from recent scikit-learn releases; read the
    # mean test score from ``cv_results_`` when it is available.
    cv_results = getattr(selector, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = selector.grid_scores_

    # Plot number of features vs. cross-validation scores.
    plt.figure()
    plt.xlabel("Number of features selected (K)")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
    return (sel_fea, fea_num, sel_index)
コード例 #9
0
ファイル: filter_1.py プロジェクト: ashinwz/lmmd_experiments
def rfe_filter(feature_filter, finger_name, finger_feature):
    """Run ROC-AUC-scored RFECV with a linear SVC and save the score curve.

    The selector is fitted on ``finger_feature`` against ``label``, which is
    neither a parameter nor a local and is therefore taken from an enclosing
    scope. The score curve is saved as a JPEG in a hard-coded Windows
    directory, and the process working directory is changed twice along the
    way (it ends in the second hard-coded directory).

    NOTE(review): this block is Python 2 only (print statement, ``unicode``,
    non-raw backslash paths).

    :param feature_filter: value stringified into the saved file name
    :param finger_name: string used in the saved file name
    :param finger_feature: feature data passed to the selector
    :return: ``finger_feature`` indexed by the selected feature indices
    """
    from sklearn.svm import SVC
    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc,
                  step=1,
                  cv=StratifiedKFold(5),
                  scoring='roc_auc')
    rfecv.fit(finger_feature, label)
    # Integer indices of the features kept by the selector.
    rfecv_get = rfecv.get_support(indices=True)
    # NOTE(review): plain ``[...]`` indexing is used here; whether this picks
    # columns or rows depends on the type of finger_feature -- confirm.
    finger_three = finger_feature[rfecv_get]
    print "          ", finger_three.shape
    print("Optimal number of features : %d" % rfecv.n_features_)
    # Plot the cross-validation score against the number of features.
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    path_2 = unicode("C:\Users\Administrator\Desktop\BBB_database_2\指纹数据\特征图片",
                     "utf-8")
    # The figure is saved relative to the working directory, so switch to it.
    os.chdir(path_2)
    save_name = str(feature_filter) + "_" + str(
        rfecv.n_features_) + "_" + finger_name + "_" + "rfe.jpg"
    # If a file with that name already exists, fall back to a "pca" variant.
    if save_name in os.listdir(path_2):
        save_name = str(feature_filter) + "_" + str(
            rfecv.n_features_
        ) + "_" + finger_name + "_" + "pca" + "_" + "rfe.jpg"
    plt.savefig(save_name)
    path = unicode("C:\Users\Administrator\Desktop\BBB_database_2\指纹数据\多个指纹",
                   "utf-8")
    # Leave the process in the second hard-coded directory.
    os.chdir(path)
    return finger_three
コード例 #10
0
def find_best_features(df_train, y_train):
    """Mark strongly correlated columns and return a cached RFECV mask.

    Cached Pearson and Spearman correlation matrices are loaded from the
    working directory and averaged. Rows are visited in order; any column
    whose absolute averaged correlation with a still-kept row is above 0.90
    is marked as dropped.

    The arguments are not read by the current code, because the fitted
    selector comes from ``rfecv.pkl`` rather than from a fresh fit.

    Returns:
        ``(res_cols, support)``: a list of keep/drop booleans and the
        support mask of the loaded selector.
    """
    rfr = RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=16)

    # NOTE: joblib.load unpickles its input; only use trusted files here.
    corr_pearson = joblib.load("vals_pearson.pkl")
    corr_spearman = joblib.load("vals_spearman.pkl")
    vals = (corr_pearson + corr_spearman) / 2

    res_cols = [True] * vals.shape[0]
    for i, still_kept in enumerate(res_cols):
        # ``still_kept`` is read when row i is reached, so drops made by
        # earlier rows are already reflected.
        if not still_kept:
            continue
        for j in range(vals.shape[1]):
            if i == j:
                continue
            if abs(vals[i, j]) > 0.90:
                res_cols[j] = False

    # Configured but never fitted: replaced by the pickled selector below.
    rfecv = RFECV(rfr, step=10, cv=5, scoring=rmse_scorer, verbose=2)  # Float step gives error on the end
    rfecv = joblib.load("rfecv.pkl")

    return (res_cols, rfecv.get_support())
コード例 #11
0
class DFRFECV(BaseEstimator, TransformerMixin):
    """DataFrame-oriented wrapper around ``RFECV``.

    Fits the selector on a subset of columns and keeps a per-feature summary
    in ``stat_df`` (columns ``feature``, ``ranking``, ``grid_score``,
    ``support``). ``transform`` returns a copy of the supported columns.
    """

    def __init__(self, columns=None, **kwargs):
        # columns: column names to consider; None means "all columns seen
        # in fit". Remaining keyword arguments go straight to RFECV.
        self.columns = columns
        self.selector = RFECV(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y):
        """Fit the selector on the configured columns of ``X``."""
        self.columns = X.columns if self.columns is None else self.columns
        # Keep the frame's own column order.
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.selector.fit(X[self.transform_cols], y)

        # ``grid_scores_`` is gone from recent scikit-learn releases; read
        # the mean test score from ``cv_results_`` when it is available.
        cv_results = getattr(self.selector, "cv_results_", None)
        if cv_results is not None:
            grid_scores = cv_results["mean_test_score"]
        else:
            grid_scores = self.selector.grid_scores_

        # NOTE(review): the score vector has one entry per evaluated feature
        # count, not per feature, so it only lines up with the other columns
        # when those lengths happen to match -- confirm intended meaning.
        self.stat_df = pd.DataFrame({
            'feature': X[self.transform_cols].columns,
            'ranking': self.selector.ranking_,
            'grid_score': grid_scores,
            'support': self.selector.get_support()
        })

        return self

    def transform(self, X):
        """Return a copy of ``X`` restricted to the supported features."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        features = self.stat_df[self.stat_df['support']]['feature'].values
        new_X = X[features].copy()

        return new_X

    def fit_transform(self, X, y):
        """Fit on ``X``/``y`` and return the transformed ``X``."""
        return self.fit(X, y).transform(X)
コード例 #12
0
def score_param(param, model, X, y, cv):
    """Score one parameter set after RFECV feature selection.

    ``model`` is a callable that builds an estimator from ``**param``. A
    first instance drives the feature selection; a second, fresh instance is
    cross-validated on the reduced matrix.

    Returns:
        ``(mean_cv_score, support_mask)``.
    """
    # Feature selection under these parameters.
    selector = RFECV(model(**param), step=1, cv=cv)
    X_sel = selector.fit(X, y).transform(X)

    # The score for these parameters is the mean CV score on X_sel.
    fold_scores = cross_val_score(model(**param), X_sel, y, cv=cv)
    return np.mean(fold_scores), selector.get_support()
コード例 #13
0
class RecursiveFeatureEliminationSelector(Transformer):
    """Transformer that keeps the features chosen by ``RFECV``.

    ``param`` picks the base model: ``'lr'`` (logistic regression) or
    ``'rf'`` (extra-trees classifier).
    """
    type = 23

    def __init__(self, param='lr', min_features=1):
        super().__init__("rfe_selector")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.params = param
        self.min_features = min_features
        self.optional_params = ['lr', 'rf']

    def operate(self, input_datanode: DataNode, target_fields=None):
        """Select among ``target_fields`` and pass the other columns through.

        The selector is fitted the first time (while ``self.model`` is None)
        and reused afterwards. The output places the selected target columns
        first, followed by all non-target columns.

        Raises:
            ValueError: If ``self.params`` names an unknown base model.
        """
        from sklearn.feature_selection import RFECV

        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
        X_new = X[:, target_fields]

        n_fields = len(feature_types)
        targeted = set(target_fields)
        irrevalent_fields = [
            idx for idx in range(n_fields) if idx not in targeted
        ]

        # Never go below 5% of the total number of fields.
        self.min_features = max(self.min_features, n_fields // 20)
        if self.model is None:
            if self.params == 'lr':
                from sklearn.linear_model import LogisticRegression
                base_model = LogisticRegression(solver='lbfgs')
            elif self.params == 'rf':
                from sklearn.ensemble import ExtraTreesClassifier
                base_model = ExtraTreesClassifier(n_estimators=100)
            else:
                raise ValueError('Invalid base model!')

            self.model = RFECV(base_model,
                               cv=3,
                               min_features_to_select=self.min_features)
            self.model.fit(X_new, y)

        _X = self.model.transform(X_new)
        is_selected = self.model.get_support()

        irrevalent_types = [feature_types[idx] for idx in irrevalent_fields]
        # The support mask is positional over the columns of X_new, i.e. over
        # target_fields in order, so pair each field id with its own mask
        # entry instead of indexing the mask by the original field id.
        selected_types = [
            feature_types[idx]
            for idx, keep in zip(target_fields, is_selected) if keep
        ]
        selected_types.extend(irrevalent_types)

        new_X = np.hstack((_X, X[:, irrevalent_fields]))
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y), new_feature_types,
                                   input_datanode.task_type)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        self.target_fields = target_fields.copy()

        return output_datanode
コード例 #14
0
def recursive_feature_selection(X, Y):
    """Return the columns of ``X`` kept by accuracy-scored RFECV.

    Uses a linear-kernel SVC, removes one feature per step and evaluates
    with ``StratifiedKFold(5)``.
    """
    selector = RFECV(estimator=SVC(kernel="linear"),
                     step=1,
                     cv=StratifiedKFold(5),
                     scoring='accuracy')
    selector.fit(X, Y)
    return X[:, selector.get_support()]
コード例 #15
0
    def selectFeatures(self, select_model):
        """Run RFECV on ``self.train`` with ``select_model``.

        The label column ``self.label`` is the target; every other column of
        ``self.train`` is a candidate feature. ``self.step`` and ``self.cv``
        configure the selector.

        Returns:
            ``(select_X, select_columns)``: the reduced feature matrix and
            the labels of the selected columns.
        """
        selector = RFECV(estimator=select_model, step=self.step, cv=self.cv)
        target = self.train[self.label]
        features = self.train.drop(self.label, axis=1)
        select_X = selector.fit_transform(features, target)
        kept_positions = selector.get_support(True)
        return select_X, features.columns[kept_positions]
コード例 #16
0
def SelectRFE_DTCV(dataf, targetf):
    """Select features of ``dataf`` with RFECV around a decision tree (cv=3).

    Returns:
        A new DataFrame built from the reduced value matrix, with the
        selected column names of ``dataf`` as its columns.
    """
    selector = RFECV(DecisionTreeClassifier(), cv=3)
    reduced = selector.fit_transform(dataf.values, targetf.values.ravel())
    # Names of the retained features, in selector order.
    kept_names = [
        dataf.columns.values[position]
        for position in selector.get_support(True)
    ]
    return pd.DataFrame(reduced, columns=kept_names)
コード例 #17
0
ファイル: Classifier.py プロジェクト: jessab/ML
def selectBestFeaturesRFECV(samples, classifications,
                            featureNames, classifierClass):
    """Select features with RFECV around ``classifierClass.getEstimator()``.

    ``samples`` is normalised through a CSR matrix and densified before
    fitting.

    Returns:
        A two-element list: the reduced sample matrix and the names of the
        retained features.
    """
    selector = RFECV(classifierClass.getEstimator())
    as_sparse = samples if sprs.issparse(samples) else sprs.csr_matrix(samples)
    reduced = selector.fit_transform(as_sparse.toarray(), classifications)
    keptNames = [
        featureNames[position]
        for position, kept in enumerate(selector.get_support())
        if kept
    ]
    return [reduced, keptNames]
コード例 #18
0
ファイル: actual.py プロジェクト: kenluck2001/AnswerClassify
def selectFeatures (clf, X, Y):
    """Select columns of ``X`` with accuracy-scored RFECV around ``clf``.

    One feature is removed per step. The "accuracy" scoring is proportional
    to the number of correct classifications.

    NOTE(review): ``StratifiedKFold(Y, 5)`` passes the labels to the
    constructor -- confirm which scikit-learn API the file imports.

    Returns:
        ``(X_selected, indices)`` where ``indices`` comes from
        ``find(mask, True)``.
    """
    folds = StratifiedKFold(Y, 5)
    selector = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy')
    selector.fit(X, Y)
    indices = find(selector.get_support(), True)
    return X[:, indices], indices
コード例 #19
0
ファイル: featuresSelection.py プロジェクト: Johayon/BGD-Work
def featureSelection(X,y):
    """Return the column labels of ``X`` kept by RFECV around a random forest.

    A small subclass exposes the forest's ``feature_importances_`` as
    ``coef_`` so the selector can rank features through ``coef_``.
    """
    class RandomForestClassifierWithCoef(RandomForestClassifier):
        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_
            # Estimator ``fit`` methods are expected to return the estimator.
            return self

    randfor = RandomForestClassifierWithCoef(n_estimators=35)
    rfecv = RFECV(estimator=randfor, step=1, cv=5,
                  scoring='accuracy', verbose=2)
    rfecv.fit(X, y)
    return X.columns[rfecv.get_support()]
コード例 #20
0
 def recursive_feature_elimination_withCV(self, estimator, y_train=None, feats=None,n_fold=5, step=1, scoring='accuracy'):
     """Return the names of the features in ``feats`` kept by RFECV.

     Works on a copy of ``self.data``. When ``y_train`` is None the target is
     the ``'label'`` column of that copy. Cross-validation uses a fixed-seed
     ``ShuffleSplit`` with a 70/20 train/test split, intentionally leaving
     10% of the rows out of each split.

     NOTE(review): ``feats`` defaults to None but is used directly as a
     column selection -- callers presumably always pass it; confirm.
     """
     frame = self.data.copy()
     splitter = ShuffleSplit(n_splits=n_fold, test_size=.2, train_size=.7,
                             random_state=42)
     selector = RFECV(estimator, step=step, scoring=scoring, cv=splitter)
     target = frame['label'] if y_train is None else y_train
     selector.fit(frame[feats], target)
     return frame[feats].columns.values[selector.get_support()]
コード例 #21
0
def feature_selection(df, sample):
    """Run RFECV on the hotel data and plot random-forest importances.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing a ``hotel_cluster`` target column.
    sample : int
        Flag; when falsy, the frame is shuffled and cut to its first
        500 rows before fitting.

    Returns
    -------
    None
    """
    if not sample:
        # Shuffle, then keep a 500-row sample.
        df = df.sample(frac=1).head(500)

    y = df['hotel_cluster']
    X = df.drop(columns=['hotel_cluster'])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, random_state=99)

    # This fitted forest supplies the importances plotted at the end.
    estimator = RandomForestClassifier(random_state=99, max_depth=10)
    print("Fitting RF Classifier for Feature Selection ... ")
    estimator.fit(X_train, y_train)

    selector = RFECV(estimator, cv=10, step=.50)
    print("Fitting feature selector ...")
    selector = selector.fit(X_train, y_train)

    print("FIT!")
    mask = selector.get_support()  # boolean per feature
    features = [
        feature for keep, feature in zip(mask, X_train.columns) if keep
    ]

    print("Num feat: {}".format(selector.n_features_))
    print("Features: {}".format(features))

    n_features = X_train.shape[1]
    plt.barh(range(n_features),
             estimator.feature_importances_,
             align='center')
    plt.yticks(np.arange(n_features), X_train.columns.values)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.show()
コード例 #22
0
ファイル: app.py プロジェクト: DataNinjas-code/prudential
def get_features():
    """Handle the feature-count form: run RFECV and save plot and ranking.

    On a POST with a numeric ``important_features`` value, the features
    whose stored ranking is at most that number are fed to RFECV, the score
    curve is saved to ``./static/features.png``, the resulting ranks are
    written to ``features.csv`` and the client is redirected to ``/model``.
    A non-numeric value flashes a message and redirects to ``/feature``.
    For any other request method nothing is returned.
    """
    df_train = pd.read_csv("train.csv")
    importance_features_sorted = pd.read_csv("feature_ranking.csv")
    importance_features_sorted = importance_features_sorted.rename(
        columns={"Unnamed: 0": "features"})

    if request.method != 'POST':
        return None

    if not request.form['important_features'].isnumeric():
        flash("Please enter a digit for the number of features to select!")
        return redirect("/feature")

    number_of_features = int(request.form['important_features'])
    # Keyword form: the positional ``axis`` argument is not accepted by
    # recent pandas releases.
    X = df_train.drop(columns='labels')
    target = df_train['labels']

    estimator = LogisticRegression(penalty='l1',
                                   solver='saga',
                                   C=2,
                                   multi_class='multinomial',
                                   n_jobs=-1,
                                   random_state=42)
    rfecv = RFECV(estimator=estimator,
                  step=1,
                  cv=StratifiedShuffleSplit(1,
                                            test_size=.2,
                                            random_state=42),
                  scoring='accuracy')
    select_features_by_model = importance_features_sorted[
        importance_features_sorted['ranking'] <=
        number_of_features]['features'].tolist()
    rfecv.fit(X[select_features_by_model], target)

    # ``grid_scores_`` is gone from recent scikit-learn releases; read the
    # mean test score from ``cv_results_`` when it is available.
    cv_results = getattr(rfecv, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = rfecv.grid_scores_

    fig = plt.figure(figsize=(16, 9))
    plt.title('Recursive Feature Elimination with Cross-Validation',
              fontsize=18,
              fontweight='bold',
              pad=20)
    plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
    plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
    plt.plot(range(1, len(scores) + 1),
             scores,
             color='#303F9F',
             linewidth=3)
    plt.savefig('./static/features.png')
    # Release the figure so repeated requests do not accumulate figures.
    plt.close(fig)

    rfecv_df = pd.DataFrame({'col': select_features_by_model})
    rfecv_df['rank'] = np.nan
    # Selected features get ranks 0..n-1 in support order.
    for index, support in enumerate(rfecv.get_support(indices=True)):
        rfecv_df.loc[support, 'rank'] = index
    # Eliminated features (ranking_ >= 2) are placed after the selected ones.
    for index, rank in enumerate(rfecv.ranking_ - 2):
        if rank >= 0:
            rfecv_df.loc[index, 'rank'] = rfecv.n_features_ + rank
    rfecv_df.to_csv('features.csv')
    return redirect("/model")
コード例 #23
0
def RFE_score(model, X, y):
    """Fit F1-scored RFECV on a stratified split and score the held-out part.

    One third of the data (fixed seed, stratified on ``y``) is held out; the
    selector is fitted on the remainder with 3-fold cross-validation.

    Returns:
        ``(selected_indices, f1)``: integer indices of the selected features
        and the F1 score of the selector's predictions on the held-out data.
    """
    split = train_test_split(X,
                             y,
                             test_size=0.33,
                             random_state=1024,
                             stratify=y)
    X_train, X_test, y_train, y_test = split

    selector = RFECV(model, cv=3, scoring='f1')
    selector.fit(X_train, y_train)
    held_out_f1 = f1_score(y_test, selector.predict(X_test))
    return selector.get_support(indices=True), held_out_f1
コード例 #24
0
def SelectFeatures(featuresStructuresArray, labels):
    """Return the field names kept by RFECV around a logistic regression.

    Args:
        featuresStructuresArray: Structured array; its ``dtype.names`` are
            the feature names, and it is converted to a regular array with
            ``castStructuredArrayToRegular`` before fitting.
        labels: Target passed to the selector.

    Returns:
        Array of the selected feature names.
    """
    # Keyword arguments: scikit-learn estimators no longer accept these
    # parameters positionally.
    estimator = LogisticRegression(penalty='l2', dual=False)

    featureNames = featuresStructuresArray.dtype.names
    featureData = castStructuredArrayToRegular(featuresStructuresArray)

    featuresSelector = RFECV(estimator, cv=8)
    featuresSelector.fit(featureData, labels)
    selectedMask = featuresSelector.get_support()

    return np.array(featureNames)[selectedMask]
コード例 #25
0
def selectFeatures(clf, X, Y):
    """Reduce ``X`` to the columns chosen by accuracy-scored RFECV.

    ``clf`` is the wrapped estimator and one feature is dropped per step.
    The "accuracy" scoring is proportional to the number of correct
    classifications.

    NOTE(review): ``StratifiedKFold(Y, 5)`` passes the labels to the
    constructor -- confirm which scikit-learn API the file imports.

    Returns:
        ``(X_selected, indices)`` with ``indices`` produced by
        ``find(mask, True)``.
    """
    rfecv = RFECV(estimator=clf,
                  step=1,
                  cv=StratifiedKFold(Y, 5),
                  scoring='accuracy')
    rfecv.fit(X, Y)
    support_mask = rfecv.get_support()
    indices = find(support_mask, True)
    reduced = X[:, indices]
    return reduced, indices
コード例 #26
0
def SelectFeatures(featuresStructuresArray, labels):
    """Select structured-array fields with RFECV and return their names.

    Args:
        featuresStructuresArray: Structured array whose ``dtype.names`` are
            the feature names; ``castStructuredArrayToRegular`` turns it
            into the matrix the selector is fitted on.
        labels: Target passed to the selector.

    Returns:
        Array of the selected feature names.
    """
    # Keyword arguments: scikit-learn estimators no longer accept these
    # parameters positionally.
    estimator = LogisticRegression(penalty='l2', dual=False)

    featureNames = featuresStructuresArray.dtype.names
    featureData = castStructuredArrayToRegular(featuresStructuresArray)

    featuresSelector = RFECV(estimator, cv=8)
    featuresSelector.fit(featureData, labels)
    selectedMask = featuresSelector.get_support()

    return np.array(featureNames)[selectedMask]
コード例 #27
0
def selF(j, X, y, flist):
    """Return the names in ``flist`` kept by RFECV around linear regression.

    ``flist`` is indexed by feature position, so it needs at least one name
    per column of ``X``. ``j`` is not used by the current code.
    """
    selector = RFECV(LinearRegression(), step=1, cv=5)
    selector.fit(X, y.ravel())
    support = selector.get_support()

    my_feat = [flist[pos] for pos, kept in enumerate(support) if kept]
    print("Number of features after feature selection is", len(my_feat))
    return my_feat
コード例 #28
0
def RFECV_selector(train_x, train_y, k=10):
    """Fit accuracy-scored RFECV around a ``LinearSVC`` and print the result.

    Args:
        train_x: Training features.
        train_y: Training labels.
        k: Not used by the current code; kept for interface compatibility.

    Returns:
        ``(selection, importance)``: the fitted selector and its
        cross-validation scores per evaluated feature subset.
    """
    from sklearn.svm import LinearSVC
    from sklearn.feature_selection import RFECV
    svc = LinearSVC()
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    selection = RFECV(estimator=svc, step=1, scoring='accuracy')
    selection.fit(train_x, train_y)

    # ``grid_scores_`` is gone from recent scikit-learn releases; read the
    # mean test score from ``cv_results_`` when it is available.
    cv_results = getattr(selection, "cv_results_", None)
    if cv_results is not None:
        importance = cv_results["mean_test_score"]
    else:
        importance = selection.grid_scores_

    # Single-argument print calls behave the same on Python 2 and 3.
    print(
        '----------------------------feature importance -------------------------'
    )
    print(importance)
    print(
        '----------------------------- selected feature -------------------------'
    )
    print(selection.get_support(indices=True))
    return selection, importance
コード例 #29
0
def features_selection_method(name, params, X_train, y_train, problem_size):
    """Return the indices of the features kept by the named selection method.

    Args:
        name: One of ``"variance_threshold"``, ``"kbest"``, ``"linearSVC"``,
            ``"tree"``, ``"rfecv"``.
        params: Single method parameter, converted per method: variance
            threshold (float), fraction of ``problem_size`` to keep (float),
            ``C`` (float), number of trees (int), number of CV folds (int).
        X_train: Training features.
        y_train: Training labels (unused by ``"variance_threshold"``).
        problem_size: Reference size that the ``"kbest"`` fraction is
            applied to.

    Returns:
        Integer indices of the selected features, or an empty list when
        ``name`` matches no method.
    """
    indices = []

    if name == "variance_threshold":
        sel = VarianceThreshold(threshold=float(params))
        sel.fit(X_train)
        indices = sel.get_support(indices=True)

    elif name == "kbest":
        # params is a fraction of problem_size.
        k_param = int(float(params) * problem_size)
        # Keep the fitted selector: fit_transform would return the reduced
        # array, which has no get_support().
        model = SelectKBest(chi2, k=k_param).fit(X_train, y_train)
        indices = model.get_support(indices=True)

    elif name == "linearSVC":
        lsvc = LinearSVC(C=float(params), penalty="l1",
                         dual=False).fit(X_train, y_train)
        model = SelectFromModel(lsvc, prefit=True)
        indices = model.get_support(indices=True)

    elif name == "tree":
        clf = ExtraTreesClassifier(n_estimators=int(params))
        clf = clf.fit(X_train, y_train)
        model = SelectFromModel(clf, prefit=True)
        indices = model.get_support(indices=True)

    elif name == "rfecv":
        # Recursive elimination around a linear SVC, scored by ROC AUC.
        rfecv = RFECV(estimator=SVC(kernel="linear"),
                      step=1,
                      cv=StratifiedKFold(int(params)),
                      scoring='roc_auc')
        rfecv.fit(X_train, y_train)
        indices = rfecv.get_support(indices=True)

    return indices
コード例 #30
0
ファイル: main.py プロジェクト: BorisBorshevsky/ML-Elections
def select_features_with_rfe(data_X, data_Y, feature_names):
    """Return the names of the features kept by accuracy-scored RFECV.

    Uses a linear-kernel SVC (``C=1``), one feature removed per step and
    3-fold cross-validation. Each chosen feature is printed.

    Args:
        data_X: Feature matrix.
        data_Y: Labels.
        feature_names: Sequence of string names, one per column of
            ``data_X`` (names are concatenated into the printed message).

    Returns:
        List of the selected feature names.
    """
    result = []

    svc = SVC(kernel="linear", C=1)
    rfecv = RFECV(estimator=svc, step=1, cv=3, scoring='accuracy')
    rfecv.fit(data_X, data_Y)

    print("RFE - Optimal number of features : %d" % rfecv.n_features_)

    for idx, val in enumerate(rfecv.get_support()):
        if val:
            # Single-argument print call: same output on Python 2 and 3.
            print("RFE - Choosing feature: " + feature_names[idx])
            result.append(feature_names[idx])
    return result
コード例 #31
0
ファイル: rfecv.py プロジェクト: nexusme/data_process_tools
def rfecv(df, columns, target_col):
    """Keep the RFECV-selected columns of ``df`` plus the target column.

    Args:
        df: Source DataFrame.
        columns: List of candidate feature column names.
        target_col: Name of the target column.

    Returns:
        A new DataFrame (default integer index) holding the selected feature
        columns followed by ``target_col``.
    """
    X = df[columns]
    y = df[target_col]
    estimator = SVR(kernel="linear")
    # NOTE(review): the number of CV folds is tied to the number of
    # candidate columns -- confirm this is intended.
    selector = RFECV(estimator, step=1, cv=len(columns))
    selector = selector.fit(X, y)
    data = selector.transform(X)

    # Names of the kept columns.
    saved_columns = [
        name for name, kept in zip(columns, selector.get_support()) if kept
    ]

    result = pd.DataFrame(data, columns=saved_columns)
    # Assign by position: ``result`` has a fresh integer index, so assigning
    # the Series itself would align on index labels and produce NaN whenever
    # ``df`` does not use the default index.
    result[target_col] = y.values
    return result
コード例 #32
0
def recursive_feature_elimination_cv(input_data,
                                     feature_names,
                                     step=0.1,
                                     cv=3,
                                     estimator=SVC(kernel='linear')):
    """
    Recursively eliminates features from x_train and x_test with cross
    validation, using scikit-learn's RFECV; see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    If feature_names is given it is also returned, with the features that
    were removed from x_train and x_test removed from feature_names as well.

    3-D inputs are passed through ``flatten`` before selection and through
    ``make3D`` afterwards (both helpers are defined elsewhere).

    Args:
        input_data (tuple):     x_train, y_train, x_test, y_test
        feature_names:          The names of all features before feature
                                selection or None. Indexed with a boolean
                                mask, so presumably a numpy array.
        estimator (object):     Passed to RFECV, see documentation
        step (int or float):    Passed to RFECV, see documentation
        cv (int):               Passed to RFECV, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, args
        where ``args`` is a dict echoing ``step``, ``cv`` and ``estimator``.
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    is_3d = len(x_train.shape) == 3
    if is_3d:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    # Pass step/cv by keyword: newer scikit-learn releases place
    # ``min_features_to_select`` right after ``step`` and make these
    # parameters keyword-only, so positional passing silently mis-binds
    # ``cv`` or raises.
    feature_selector = RFECV(estimator, step=step, cv=cv)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)

    if is_3d:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    args = {'step': step, 'cv': cv, 'estimator': estimator}

    return output_data, feature_names, args
コード例 #33
0
def variable_selection_model_fitting(train, test, model, columns):
    """Pick features for ``model`` with cross-validated RFE.

    Args:
        train: Training frame; ``train.values`` is split by ``split_x_y``.
        test: Test frame; it is split the same way but the result is not
            used by this function.
        model: Estimator handed to ``RFECV``.
        columns: Feature names, paired positionally with the RFECV support
            mask (presumably one per column of the training features --
            confirm against the caller).

    Returns:
        list: The entries of ``columns`` whose feature was selected.
    """
    train_x, train_y = split_x_y(train.values)
    # Result unused; the call is kept so the function touches ``test``
    # exactly as before.
    split_x_y(test.values)

    # NOTE(review): ``StratifiedKFold(train_y, 10)`` is the legacy
    # ``sklearn.cross_validation`` call form -- confirm which module the
    # file imports it from.
    rfecv = RFECV(estimator=model,
                  step=1,
                  cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x, train_y)

    # Single-argument print() calls work on both Python 2 and Python 3.
    print(rfecv.n_features_)
    rfe_features = [
        col for col, selected in zip(columns, rfecv.get_support()) if selected
    ]
    print(rfe_features)

    return rfe_features
コード例 #34
0
def svc_rfe_cv(dataset, label, min_features_to_select=100, step=1, cv=10):
    """
    Perform recursive feature elimination with a linear support vector
    classifier and cross validation (10 folds by default).

    Args:
    dataset - training data; indexed via ``dataset.columns``, so a pandas
              DataFrame is expected
    label - training data labels
    min_features_to_select - lower bound on the number of kept features
                             (passed to RFECV)
    step - features removed per iteration (passed to RFECV)
    cv - cross-validation setting (passed to RFECV)

    Returns:
    ``dataset`` restricted to the columns RFECV selected (the data itself,
    not just a list of column names)
    """
    estimator = SVC(kernel="linear")
    selector = RFECV(estimator,
                     min_features_to_select=min_features_to_select,
                     step=step,
                     cv=cv)
    selector = selector.fit(dataset, label)
    selected_columns = dataset.columns[selector.get_support()]

    return dataset[selected_columns]
コード例 #35
0
def variable_selection_model_fitting(train, test, model, columns):
    """Run RFECV around ``model`` and return the names of kept features.

    Args:
        train: Training frame; ``train.values`` is split by ``split_x_y``.
        test: Test frame; split the same way, but the result is unused here.
        model: Estimator handed to ``RFECV``.
        columns: Feature names, zipped positionally with the RFECV support
            mask (presumably one per training feature column -- confirm
            against the caller).

    Returns:
        list: The entries of ``columns`` whose feature was selected.
    """
    train_x, train_y = split_x_y(train.values)
    # The split of ``test`` is never read; the call is retained so ``test``
    # is handled exactly as before.
    split_x_y(test.values)

    # NOTE(review): ``StratifiedKFold(train_y, 10)`` follows the legacy
    # ``sklearn.cross_validation`` signature -- confirm the import in use.
    rfecv = RFECV(estimator=model,
                  step=1,
                  cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x, train_y)

    # print() with one argument is valid on both Python 2 and Python 3,
    # unlike the original print statements.
    print(rfecv.n_features_)
    rfe_features = []
    for col, selected in zip(columns, rfecv.get_support()):
        if selected:
            rfe_features.append(col)
    print(rfe_features)

    return rfe_features
コード例 #36
0
ファイル: kdc.py プロジェクト: qianFX/final_project
    def recursive_feature_elimination(self, x: np.ndarray, y: np.ndarray, clf=None) -> np.ndarray:
        """Fit RFECV around ``clf``, plot its CV curve and return reduced ``x``.

        One feature is eliminated per step and subsets are scored by
        accuracy. The selected entries of ``self.features`` are printed and a
        plot of score against number of features is shown.

        :param x: feature matrix to fit on and then reduce
        :param y: labels; also handed to ``StratifiedKFold``
        :param clf: estimator used inside ``RFECV``
        :return: ``x`` transformed to the selected features only
        """
        folds = StratifiedKFold(y)
        eliminator = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy', verbose=True)
        print("begin eliminate")
        eliminator.fit(x, y)

        print("Optimal number of features : %d" % eliminator.n_features_)

        # Cross-validation score for each candidate feature count.
        plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        cv_scores = eliminator.grid_scores_
        feature_counts = range(1, len(cv_scores) + 1)
        plt.plot(feature_counts, cv_scores)
        plt.show()

        kept_mask = eliminator.get_support()
        selected_features = self.features[kept_mask]
        print(selected_features)
        return eliminator.transform(x)
コード例 #37
0
ファイル: RFECV9.py プロジェクト: juanelenter/DNAi
def train_test(X_train, Y_train, X_test, Y_test, cv_params, custom_grid=False):
    """Fit an RFECV-wrapped Ridge model and report train/test metrics.

    A parameter grid is built (or loaded via ``load_grid``) and printed, but
    it is not used for fitting: the randomized search it was meant for is
    disabled and a fixed ``Ridge(alpha=90000)`` inside ``RFECV`` (step 500,
    5-fold CV) is fitted instead. ``cv_params`` is not read.

    Returns:
        tuple: ``(best_grid_params, best_model_params, train_predictions,
        test_predictions, metrics, {})`` where ``best_grid_params`` is the
        constant ``{'alpha': 30000}``, ``best_model_params`` is the
        selector's ``get_params()`` and ``metrics`` holds Pearson r and MSE
        for both splits.
    """
    if not custom_grid:
        # Default grid: 500 alpha values from 30000 down to 20000.
        random_grid = {'alpha': np.linspace(30000, 20000, 500)}
    else:
        random_grid = load_grid(custom_grid)
    print_grid(random_grid)

    # Feature elimination with a fixed-alpha Ridge; a RandomizedSearchCV over
    # ``random_grid`` was planned here but is not active.
    ridge_random = RFECV(Ridge(alpha=90000), step=500, cv=5, verbose=10)
    ridge_random.fit(X_train, Y_train)

    best_grid_params = {'alpha': 30000}
    support_mask = ridge_random.get_support()  # computed as before; not returned
    best_model_params = ridge_random.get_params()
    train_predictions = ridge_random.predict(X_train)
    test_predictions = ridge_random.predict(X_test)

    # metrics
    metrics = {
        "r_train": pearsonr(Y_train, train_predictions),
        "r_test": pearsonr(Y_test, test_predictions),
        "mse_train": mse(Y_train, train_predictions),
        "mse_test": mse(Y_test, test_predictions),
    }
    print(f"pearsonr train: {metrics['r_train']}")
    print(f"pearsonr test: {metrics['r_test']}")
    print(f"mse train: {metrics['mse_train']}")
    print(f"mse test: {metrics['mse_test']}")
    print(best_model_params)
    return (best_grid_params, best_model_params, train_predictions,
            test_predictions, metrics, {})
コード例 #38
0
ファイル: fe_rfecv.py プロジェクト: mb16/Kaggle
def _drop_unselected_columns(frame, support, always_keep="var11"):
    """Drop, in place, the columns of ``frame`` whose ``support`` entry is False.

    ``support`` is read by column position; ``always_keep`` is never dropped.
    """
    to_drop = []
    for index, col in enumerate(frame.columns):
        print("Column: " + col)
        if not support[index] and col != always_keep:
            print("Dropping")
            to_drop.append(col)
    frame.drop(to_drop, axis=1, inplace=True)


def run_feature_select(SEED):
    """Select features with Ridge-based RFECV and write the reduced CSVs.

    Reads the pre-shuffled train/target/test CSVs from ``../data``, fits
    ``RFECV(Ridge(), step=20, cv=5)`` on the training frame, writes the
    per-column ranking to ``../featureanalysis`` and the column-reduced
    train and test frames to ``../models``. Column ``var11`` is always kept.

    Args:
        SEED: Not read by this function.
    """
    numFeatures = 80  # only used as a tag in the output file names

    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../data/pre_shuffled_train.csv')
    test = pd.read_csv('../data/pre_shuffled_test.csv')

    estimator = Ridge()
    selector = RFECV(estimator, step=20, cv=5, scoring=None) # NOT tested, must pass scoring function here.
    selector.fit(trainBase, trainBaseTarget)
    # Boolean mask over the training columns: True where the feature is kept.
    cols = selector.get_support(indices=False)

    print(selector.grid_scores_)
    print(selector.n_features_)

    p = np.vstack([trainBase.columns,selector.ranking_])
    submission = pd.DataFrame(p.T, columns = None)
    submission.to_csv("../featureanalysis/RFECV_" + str(numFeatures) + ".csv")

    gc.collect()
    # The original indexed the RFECV object itself (``selector[index]``),
    # which raises TypeError; the support mask is what must be consulted.
    _drop_unselected_columns(trainBase, cols)
    gc.collect()
    trainBase.to_csv("../models/RFECV_" + str(numFeatures) +  "_train.csv", index = False)

    gc.collect()
    # assumes ``test`` has the same columns in the same order as the training
    # frame, since the mask is applied by position -- TODO confirm
    _drop_unselected_columns(test, cols)
    gc.collect()
    test.to_csv("../models/RFECV_" + str(numFeatures) + "_test.csv", index = False)
    gc.collect()
コード例 #39
0
ファイル: PipeTasks.py プロジェクト: Sandy4321/ProFET
        'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
        svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
        # svc = LogisticRegression(class_weight='auto')#,C=1)
        if FeatSelection_RFECV==True:
            rfecv = RFECV(estimator=svc, step=0.1,
                         cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.33),
                         scoring='f1',verbose=0)
            # " scoring='roc_auc','recall','f1'..."
        else:
            rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.1)
        rfecv.fit(X, y)
        if FeatSelection_RFECV==True:
            print("RFEcv selected %d number of Optimal features : " % (rfecv.n_features_))
        print("RFE (%d Features) scorer : \n" % (rfecv.n_features_),rfecv.score(X, y) )
        print("RFE selected feature names:")
        featureNames=featureNames[rfecv.get_support()]
        rfe_featnames = featureNames[rfecv.get_support()]
        print (rfe_featnames)
        X_RFE = rfecv.fit_transform(X, y)
        print(X_RFE.shape,"X_RFE \n")

        'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
        GetRFEPerf=False


    print("\n X: \n")
    ModelParam_GridSearch(X,y,cv=4)

    if GetRFEPerf==True:
        print("\n X-RFE: \n")
        ModelParam_GridSearch(X_RFE,y,cv=4)
コード例 #40
0
ファイル: poi_id.py プロジェクト: grace-pehl/enron
def FeatureSelection(data_dict, features_list):
    """Select features with RFECV (linear SVC, F1 scoring) and plot the CV curve.

    Args:
        data_dict: Dataset dictionary handed to ``featureFormat``.
        features_list: Feature names; the first entry is the label ``'poi'``
            and the rest are the candidate features.

    Returns:
        tuple: ``(selected_features, F1_score)`` where ``selected_features``
        starts with ``'poi'`` followed by the kept feature names, and
        ``F1_score`` is the cross-validated score at the optimal number of
        features, rounded to 3 decimals.
    """
    # Convert dictionary to numpy array, converts NaN to 0.0
    data = featureFormat(data_dict, features_list,
                         sort_keys = True, remove_all_zeroes = False)
    # Separate into labels = 'poi' and features = rest of features_list
    labels, features = targetFeatureSplit(data)

    from sklearn.feature_selection import RFECV
    # Recursive Feature Elimination with Cross Validation
    from sklearn.svm import SVC
    # Support Vector Classifier to estimate fit coefficients for each feature
    from sklearn.cross_validation import StratifiedShuffleSplit
    # cross validation maintain roughly equal number of POIs in each split

    ### Create Estimator
    # which will update the coefficients with each iteration
    # class weight is set to auto because of unbalanced data classes
    # weight will be inversely proportional to class size
    svc = SVC(kernel='linear', class_weight='auto', random_state=42)
    ############## Scale features ######################
    # SVC algorithm requires use scaled features
    # missing values are coded 0.0, so MinMax will preserve those zero values
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    ### Select cross-validation method
    # StratifiedShuffleSplit keeps roughly the same number of POIs in each split
    sss = StratifiedShuffleSplit(labels, 100, test_size=0.3, random_state=42)
    ### Select evaluation metric
    # Evaluate model using f1 = 2 * (precision * recall) / (precision + recall)
    # Model should be able to predict POIs, which are a small percentage of cases
    metric = 'f1'
    # run the feature eliminator
    rfecv = RFECV(estimator=svc, cv=sss, scoring=metric, step=1)
    rfecv = rfecv.fit(features, labels)

    # view results
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score using F1 (precision&recall)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
#    plt.savefig('featureSelection.png', transparent=True)
    plt.show()
    print("Optimal number of features is %d" % rfecv.n_features_)
    print('Features selected by recursive feature elimination with cross validation:')
    # With step=1, grid_scores_[k - 1] is the score for k features (the plot
    # above uses the same 1-based x axis). Indexing with n_features_ itself
    # read the score for one feature too many and raised IndexError when
    # every feature was kept.
    F1_score = round(rfecv.grid_scores_[rfecv.n_features_ - 1], 3)
    print('F1 score from optimal features: %r' % F1_score)
    selection = rfecv.get_support()
    selected_features = ['poi']
    rejected_features = []
    # features_list[0] is 'poi' (the label), so feature i is at position i + 1
    for position, kept in enumerate(selection, start=1):
        if kept:
            selected_features.append(features_list[position])
        else:
            rejected_features.append(features_list[position])
    print(selected_features[1:])
    print('Features eliminated:')
    print(rejected_features)
    return selected_features, F1_score
コード例 #41
0
#  0.77320439  0.77538867  0.75253823  0.76103865  0.77505282  0.75834188
#  0.757514    0.76883208  0.77124053  0.7578164   0.76844945  0.76673323
#  0.76369039]


## let's plot out the results
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (ROC_AUC)")
plt.plot(range(1, len(rfe_cv.grid_scores_) + 1), rfe_cv.grid_scores_)
plt.show()
# notice you could have just as well have included the 4 most important
# features and received similar accuracy.

# you can pull out the features used this way:
features_used = explanatory_df.columns[rfe_cv.get_support()]
# print() with one argument works on both Python 2 and Python 3.
print(features_used)
# Index([u'atbats', u'totalruns', u'shutouts', u'teamID_Nothing'], dtype='object')

# you can extract the final selected model object this way:
final_estimator_used = rfe_cv.estimator_

# you can also combine RFE with grid search to find the tuning
# parameters and features that optimize model accuracy metrics.
# do this by passing the RFECV object to GridSearchCV.
from sklearn.grid_search import GridSearchCV

# doing this for a small range so I can show you the answer in a reasonable amount of time.
depth_range = range(4, 6)
# notice that in param_grid, I need to prefix estimator__ to my parameters.
param_grid = dict(estimator__max_depth=depth_range)
コード例 #42
0
# print() with a single argument works on both Python 2 and Python 3.
print("Optimal number of features: {0} of {1} considered".format(
    rfe_cv.n_features_, len(df_mod.columns)))

# print the mean of the cross-validation scores over all feature counts
# NOTE(review): the original comment described printing the per-count scores
# (later entries = more features considered); only their mean is printed.
print(rfe_cv.grid_scores_.mean())

# let's plot out the results
plt.figure()
plt.xlabel('Number of Features selected')
plt.ylabel('Cross Validation score (ROC_AUC)')
plt.plot(range(1, len(rfe_cv.grid_scores_)+1),rfe_cv.grid_scores_)
plt.show()

# NOTE(review): the feature count above uses df_mod.columns but the mask is
# applied to df.columns -- confirm both frames share the same columns.
features_used = df.columns[rfe_cv.get_support()]
print(features_used)

# you can extract the final selected model object this way
final_estimator_used = rfe_cv.estimator_

# perform grid search to find the optimal number of trees

trees_range = range(10,550,10)
param_grid = dict(n_estimators = trees_range)


grid_rf = GridSearchCV(rf, param_grid, cv=10, scoring = 'roc_auc', verbose = 3)
grid_rf.fit(df_mod, response_series)
# check out the scores of the grid search
# (the attribute is ``grid_scores_``; ``grid_rf_scores_`` does not exist and
# raised AttributeError)
grid_rf_mean_scores = [result[1] for result in grid_rf.grid_scores_]
コード例 #43
0
ファイル: modeling.py プロジェクト: mcrowson/predict-kiva
                        mean_score, scores.std() / 2, params)

                log.info(clf.best_estimator_)
            else:
                clf.fit(train_x, train_y)


            if model['name'] == 'Logistic Regression Classifier':
                # Recurive feature selection with 10-fold cross validation
                rfecv = RFECV(estimator=clf, step=1, cv=10,
                              scoring='roc_auc')

                rfecv.fit(train_x, train_y)
                clf_tmp = rfecv.estimator_

                mask = rfecv.get_support()
                log.debug('Logistic Regression Feature Estimates')
                for i in xrange(len(train_x.columns[mask])):
                    log.debug(': '.join([train_x.columns[mask][i], str(clf_tmp.coef_[0][i])]))

                log.debug("Optimal number of features : %d" % rfecv.n_features_)

                # Plot number of features VS. cross-validation scores
                plt.figure()
                plt.title("Optimal number of features: %d" % rfecv.n_features_)
                plt.xlabel("Number of features selected")
                plt.ylabel("Cross validation score (nb of correct classifications)")
                plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
                plt.savefig('./figs/results/%s_feature_selection.png' % model['name'])
                clf = rfecv
                pickle.dump(clf, open('pickled_objects/%s_classifier' % model['name'], "wb"))
コード例 #44
0
def main(args):
    """Run the feature-selection walkthrough on ``trainingSetFeatures.csv``.

    Changes into ``args.train_dir`` (a hard-coded default is used when it is
    ``None``), loads the feature table, trims the features with ``SelectFwe``
    and ``SelectKBest``, cross-validates a random forest on the trimmed
    matrix, then repeats the f1 evaluation on the subset chosen by ``RFECV``
    and plots that subset's feature importances.

    Args:
        args: Object with a ``train_dir`` attribute; the attribute is set in
            place when it is ``None``.
    """
    if args.train_dir is None:
        # Machine-specific fallback directory.
        args.train_dir =  r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    # Display and tick-label settings.
    for option, value in (('display.max_columns', 10), ('display.max_rows', 4)):
        pandas.set_option(option, value)
    for ticks, size in (('ytick', 7), ('xtick', 4)):
        mpl.rc(ticks, labelsize=size)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    non_feature_cols = ['classname','Id','proteinname']
    feature_cols = numpy.array(
        [col for col in df.columns if col not in non_feature_cols])

    X = df[feature_cols].values
    y = LabelEncoder().fit_transform(df.classname.values)

    # Initial feature selection trimming
    print(X.shape)

    fwe_filter = SelectFwe(alpha=0.01).fit(X,y)
    X = fwe_filter.transform(X)
    print("F-test -> ",X.shape)
    feature_cols = feature_cols[fwe_filter.get_support()]
    # (An L1-penalised LinearSVC selection step used to follow here; it is
    # disabled.)

    kbest_filter = SelectKBest(k=255).fit(X,y)
    X = kbest_filter.transform(X)
    feature_cols = feature_cols[kbest_filter.get_support()]

    # Search space kept for reference; nothing below reads it.
    param_dist = {"max_depth": [6,9, None],
                  "max_features": ['auto',0.4],
                  "min_samples_leaf": [1,2,3],
                  "bootstrap": [True, False],
                  'min_samples_split':[2,3],
                  "criterion": [ "gini"],
                  "n_estimators":[100],
                  "n_jobs":[-1]}

    # NOTE(review): min_samples_split=1 is rejected by some scikit-learn
    # versions -- confirm against the version this runs on.
    rf = RandomForestClassifierWithCoef(max_depth= 7, min_samples_split= 1, min_samples_leaf= 2, n_estimators= 50,  n_jobs= 2, max_features= "auto")

    def new_split():
        # A fresh splitter per evaluation, exactly as the inline calls did.
        return cross_validation.StratifiedShuffleSplit(y,n_iter=8,test_size=0.2)

    # WARNING! F1 score as implemented by default in binary classification
    # (two classes) gives the score for 1 class.
    scores = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=new_split())
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))
    # Instead of scores_f1, we could also use precision, sensitivity,
    # MCC (if binary), etc.
    scores_f1 = cross_validation.cross_val_score(rf,X,y,n_jobs=-1,cv=new_split(),scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # Cross-validated RFE on top of the random forest (alternative scorings:
    # average_precision, recall).
    rfeSelect = RFECV(estimator=rf,step=20, cv=2,scoring='f1')
    X_RFE = rfeSelect.fit_transform(X,y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # f1 on the reduced matrix, as a percentage of f1 on the full matrix.
    reduced_f1 = cross_validation.cross_val_score(rf,X_RFE,y,n_jobs=-1,cv=new_split(),scoring='f1').mean()
    RFE_ScoreRatio = 100*reduced_f1/scores_f1.mean()
    print("Even with just",X_RFE.shape[1]," features, we have %f performance! (f1 score ratio)" %(RFE_ScoreRatio))

    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
コード例 #45
0
ファイル: OutPutRes.py プロジェクト: MichaelDoron/ProFET
def GetAllPerf (filePaths=None):
    """Evaluate every training-set feature file and save a summary table.

    For each file: load the data, trim features with ``SelectKBest`` and
    ``SelectFwe``, score a most-frequent dummy baseline, pick the best
    estimator for f1 and for accuracy with ``ModelParam_GridSearch``, then
    cross-validate both. One row per file is collected in a DataFrame that
    is written to ``OutputData.tsv`` (comma-separated despite the name).

    Args:
        filePaths: Paths of the feature files to evaluate. When ``None``,
            every ``trainingSetFeatures.csv`` found under ``./test_seq`` by
            ``find_files`` is used.
    """
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    print("FilePaths: \n",filePaths)
    fileNames=fileNameFromPaths (filePaths)
    print("FileNames:",fileNames)

    # resDict holds the results for each file/class, for saving to the
    # output file.
    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])

    # Custom F1 scorer for the grid search. make_scorer is exported from
    # sklearn.metrics; the original ``sklearn.metrics.score`` module does not
    # exist and raised ImportError.
    from sklearn.metrics import make_scorer
    f1_average = "micro"  # alternatives: weighted, macro, None
    f1_scorer = make_scorer(metrics.f1_score,
                 greater_is_better=True, average=f1_average)

    # NOTE(review): results are stored with chained indexing
    # (``resDict[col][row] = value``), kept as in the original; pandas does
    # not guarantee that such writes reach the parent frame in every version.
    for i, filePath in enumerate(filePaths):
        # normpath guards against backslash problems in Windows file names
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i])

        print("fileName: %s" %(fileName))

        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName]=len(lb_encoder.classes_)

        # Temporary pre-filter to save computation time; barely filters
        # compared to the model itself. Set to None to disable.
        KFilt=350

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)

        # Optional selection steps, all switched off. X itself is not
        # replaced by X_L1, but featureNames is filtered -- feature names
        # need updating if these are enabled.
        FeatSelection_SVM=False
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg:
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM:
            svc_L1= LinearSVC(C=30, penalty="l2", dual=False,class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print ("L1 SVM Transformed X:",X_L1.shape)

        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if FeatSelection_RFE or FeatSelection_RFECV:
            # RFE: keep the best features, see
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')

            if FeatSelection_RFECV:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),rfecv.score(X, y) )
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = featureNames[rfecv.get_support()]
            print("RFE selected feature names:",rfe_featnames)
            # Reuse the fitted selector so X_RFE matches the names printed
            # above; refitting was redundant work.
            X_RFE = rfecv.transform(X)
            print("X_RFE",X_RFE.shape)

            # 'TopRFE-Features' is not among the initial columns, so a chained
            # write raised KeyError; .loc creates the column on first use.
            resDict.loc[fileName, 'TopRFE-Features'] = str(rfe_featnames)

        # Set GetRFEPerf to True if performance of the reduced set is wanted
        # (only meaningful when one of the RFE switches above is on, since
        # X_RFE is otherwise undefined).
        GetRFEPerf=False

        print()

        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))

        # A scorer is called as scorer(estimator, X, y), so applying f1_scorer
        # to (y, y_pred) failed; call the metric directly with the scorer's
        # averaging mode instead.
        # NOTE(review): the column is named "weighted" but the scorer is
        # configured with average="micro" -- confirm which is intended.
        dummy_freq_f1_weighted = '{:.3}'.format(
            metrics.f1_score(y, y_dummyPred, average=f1_average))
        # Unweighted mean of the per-class f1 scores.
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        print()

        resDict['dummy_freq:Accuracy'][fileName]=dummy_freq_acc
        resDict['dummy_freq:f1'][fileName]=dummy_freq_f1_mean

        resDict['dummy_freq_f1_weighted'][fileName]=dummy_freq_f1_weighted

        # The best model is selected separately for f1 and for accuracy. The
        # custom scorer is a workaround for the default binary f1, which
        # scores only the positive class.
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)
        # The label previously repeated "(f1)" for the accuracy estimator.
        print("bestEst (acc):",bestEst_acc)

        if GetRFEPerf:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        # Cross-validate each of the two estimators on its own metric.
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName]=round(scores_acc.mean(),4)
        resDict['Accuracy_SD'][fileName]=round(scores_acc.std(),4)
        resDict['f1'][fileName]=round(scores_f1.mean(),4)
        resDict['f1_SD'][fileName]=round(scores_f1.std(),4)
        resDict['Array-f1-Scores'][fileName]=(scores_f1)
        resDict['Array-Acc-Scores'][fileName]=(scores_acc)
        resDict['bestML-f1'][fileName]=(str(bestEst_f1))
        resDict['bestML-Acc'][fileName]=(str(bestEst_acc))

        print()

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')