Code example #1
 def _select_features(self,
                      X,
                      y,
                      best,
                      save_path,
                      method='rfecv',
                      min_features=1):
     if best:
         if method == 'rfecv':
             print('Select best')
             rfecv = RFECV(estimator=LinearRegression(fit_intercept=self.fit_intercept),
                           min_features_to_select=min_features,
                           cv=KFold(3),
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)
             rfecv.fit(X, y)
             self.bf_support_ = rfecv.support_
             self.bf_n_features_ = rfecv.n_features_
             # Results
             print("Optimal number of features : %d" % rfecv.n_features_)
             # Plot number of features VS. cross-validation scores
             plt.figure(figsize=(8, 5))
             plt.xlabel("Number of features selected")
             plt.ylabel("Cross validation score (neg mean squared error)")
             # cv_results_['mean_test_score'] replaced grid_scores_, which was removed in scikit-learn 1.2
             scores = rfecv.cv_results_['mean_test_score']
             plt.plot(range(1, len(scores) + 1), scores)
             if save_path is not None:
                 plt.savefig(save_path + 'rfecv_feature_selection.png')
             plt.show()
         if method == 'chi2':
             pass
     else:
         self.bf_support_ = [True] * X.shape[1]
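A self-contained sketch of the same pattern on synthetic data (the dataset and figure size are illustrative; cv_results_['mean_test_score'] is the replacement for grid_scores_, which scikit-learn removed in 1.2):

import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# synthetic stand-in for the class's X and y
X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                       noise=10.0, random_state=0)

rfecv = RFECV(estimator=LinearRegression(),
              min_features_to_select=1,
              cv=KFold(3),
              scoring='neg_mean_squared_error',
              n_jobs=-1)
rfecv.fit(X, y)
print("Optimal number of features : %d" % rfecv.n_features_)

# plot number of features vs. cross-validation score
scores = rfecv.cv_results_['mean_test_score']
plt.figure(figsize=(8, 5))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (neg mean squared error)")
plt.plot(range(1, len(scores) + 1), scores)
plt.show()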
Code example #2
def feature_selection(X, Y, outcome, method, imp_method, data_dir, verbose=0):
    if method not in ['RFE', 'PCA', 'ElasticNet']:
        raise ValueError("{} not supported.".format(method))

    is_classf = Y.dtype == np.int8
    feature_subset_path = os.path.join(
        data_dir, 'feature_subset_{}_{}_{}.h5'.format(outcome, method,
                                                      imp_method))
    if os.path.exists(feature_subset_path):
        if verbose:
            print("Feature subset already exists. Loading {}...".format(
                feature_subset_path))
        with h5py.File(feature_subset_path, 'r') as hf:
            subset = hf[method][:]
        X_refined = X[:, subset]
        selector = None
    else:
        if method == 'RFE':
            if is_classf:
                selector = RFECV(LinearSVC(),
                                 step=0.1,
                                 cv=5,
                                 n_jobs=-1,
                                 verbose=verbose)
            else:
                selector = RFECV(LinearSVR(),
                                 step=0.1,
                                 cv=5,
                                 n_jobs=-1,
                                 verbose=verbose)
            X_refined = selector.fit_transform(X, Y)
        elif method == 'ElasticNet':
            selector = SelectFromModel(ElasticNetCV(cv=10, n_jobs=-1))
            X_refined = selector.fit_transform(X, Y)
        else:
            selector = None
            pca_path = os.path.join(
                data_dir, 'pca_comp_{}_{}.pkl'.format(outcome, imp_method))
            if os.path.exists(pca_path):
                print("PCA components already exist. Loading {}...".format(
                    pca_path))
                pca = joblib.load(pca_path)
                X_refined = pca.transform(X)
            else:
                var_thr = 0.99
                pca = PCA()
                x_pca = pca.fit_transform(X)
                # argmax gives the first index whose cumulative ratio exceeds
                # var_thr, so that index + 1 components are required
                n_components = int(np.argmax(
                    pca.explained_variance_ratio_.cumsum() > var_thr)) + 1
                if verbose:
                    print("Number of selected features:", n_components)
                pca = PCA(n_components=n_components)
                X_refined = pca.fit_transform(X)
                joblib.dump(pca, pca_path)

    if selector:
        with h5py.File(feature_subset_path, 'w') as hf:
            hf.create_dataset(method, data=selector.get_support())

    return X_refined
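The caching idea above reduces to a short sketch, assuming only h5py and scikit-learn (the file name and dataset key are illustrative):

import os

import h5py
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=200, n_features=15, random_state=0)
cache = 'feature_subset_demo.h5'  # hypothetical cache file

if os.path.exists(cache):
    # reuse the boolean support mask computed on a previous run
    with h5py.File(cache, 'r') as hf:
        subset = hf['RFE'][:].astype(bool)
    X_refined = X[:, subset]
else:
    selector = RFECV(LinearSVC(dual=False), step=0.1, cv=5, n_jobs=-1)
    X_refined = selector.fit_transform(X, y)
    with h5py.File(cache, 'w') as hf:
        hf.create_dataset('RFE', data=selector.get_support())

print(X_refined.shape)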
Code example #3
def select_features_univariate(X, y, method='decision_tree'):
    """ with high dimensional datasets it aids classifier performance to select
    features of interest
    This function rejects features below a certain (univariate) threshold.


    Parameters
    ----------
    X : ndarray
            repetitions by features
    y     : ndarray
            vector of labels of each repetition
    method : string
            function used for data reduction
            {'decision_tree','decision_tree_RFECV','mutual_information',...
            'univariate_select'}
    Returns
    --------
    dictionary:
        X_transformed : ndarray
                repetitions by features (reduced)
        weights: ndarray or Boolean
                relative importance features or binary (important or not)

        """
    # based on the method we choose the clf to fit and transform the data
    if method == 'decision_tree_RFECV':
        clf = DecisionTreeClassifier()
        trans = RFECV(clf)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()
    elif method == 'decision_tree':
        clf = DecisionTreeClassifier()
        clf.fit(X, y)
        # choose features with an importance that is more than avg.
        selected_features = np.where(
            clf.feature_importances_ > clf.feature_importances_.mean(0), 1, 0)
        X_transformed = X[:, selected_features == 1]
        weights = clf.feature_importances_
    elif method == 'mutual_information':
        mutual_info = mutual_info_classif(X, y)
        # choose features above the avg mutual information threshold.
        selected_features = np.where(mutual_info > mutual_info.mean(0), 1, 0)
        X_transformed = X[:, selected_features == 1]
        weights = mutual_info  #continuous
    elif method == 'univariate_select':
        # select features with more univariate activity than avg.
        trans = GenericUnivariateSelect(score_func=lambda X, y: X.mean(axis=0),
                                        mode='percentile',
                                        param=50)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  # binary
    else:
        raise ValueError("Unknown method: {}".format(method))

    return X_transformed, weights
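An illustrative call, reusing select_features_univariate as defined above on the iris data (it assumes the snippet's own imports, e.g. numpy, DecisionTreeClassifier and mutual_info_classif, are in scope):

from sklearn.datasets import load_iris

iris = load_iris()
X_reduced, weights = select_features_univariate(iris.data, iris.target,
                                                method='decision_tree')
print(X_reduced.shape)      # features with above-average importance kept
print(weights.round(3))     # per-feature importances from the tree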
Code example #4
    def figs_of_RFE(self, model=None):
        # Plot how the score changes as the number of features grows.
        # Backward elimination: start from the full feature set and
        # iteratively remove the worst remaining feature.
        # `estimator` is the base model.

        selector = RFECV(estimator=model, scoring=self.score)
        selector.fit(self.train_X, self.train_y)

        model_name = str(model).split('(')[0]
        plt.figure()
        plt.title('RFECV of {}'.format(model_name))
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        scores = selector.cv_results_['mean_test_score']  # grid_scores_ was removed in scikit-learn 1.2
        plt.plot(range(1, len(scores) + 1), scores)
        plt.grid()
        plt.show()
Code example #5
def transformer(train, test, train_y):
    """Scales and applies PCA to the input data.

    Parameters
    ----------
    train : DataFrame
        Features.
    test : DataFrame
        Features.
    train_y : numpy array
        Target.

    Returns
    -------
    train, test : numpy arrays
        Transformed Features.
    """
    # drop rows with missing values, keeping the labels aligned with train
    mask = train.notna().all(axis=1)
    train, train_y = train[mask], train_y[mask.values]
    test = test.dropna()
    scaler = RobustScaler().fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)
    clf = DecisionTreeClassifier()
    rfecv = RFECV(clf, cv=5)
    train = rfecv.fit_transform(train, train_y)
    test = rfecv.transform(test)
    return train, test
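dropna() on train silently drops rows without touching train_y, so callers have to keep the two aligned themselves; a small sketch of doing that before the scaling step (names illustrative):

import numpy as np
import pandas as pd

train = pd.DataFrame({'a': [1.0, np.nan, 3.0, 4.0],
                      'b': [0.5, 0.6, 0.7, 0.8]})
train_y = np.array([0, 1, 0, 1])

mask = train.notna().all(axis=1)              # rows without missing values
train, train_y = train[mask], train_y[mask.values]
print(train.shape, train_y.shape)             # rows and labels stay aligned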
Code example #6
    def RFEtrain(self, data):

        # # 1st
        # search_model = self._grid_search(data, self.label)
        # model = RFECV(estimator=search_model, step=1, cv=KFold(len(data)), scoring='accuracy', n_jobs=-1)
        # X = model.fit_transform(data, self.label)
        # plot_feature_selected(model, self.fsSavepath)
        # self.trainproc(X, search_model)

        # # 2nd
        # model = RFECV(estimator=self.svm, step=1, cv=KFold(len(data)), scoring='accuracy', n_jobs=-1)
        # X = model.fit_transform(data, self.label)
        # plot_feature_selected(model, self.fsSavepath)
        # self.trainproc(X, self.svm)

        # 3rd
        model = RFECV(estimator=self.svm,
                      step=1,
                      cv=KFold(len(data)),
                      scoring='accuracy',
                      n_jobs=-1)
        X = model.fit_transform(data, self.label)
        search_model = self._grid_search(X, self.label)
        plot_feature_selected(model, self.fsSavepath)
        self.trainproc(X, search_model)
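The "3rd" ordering above (RFECV first, grid search on the reduced matrix afterwards) can be sketched with stock scikit-learn pieces; the estimator and parameter grid here are illustrative, not the class's own:

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC

X, y = make_classification(n_samples=100, n_features=10, random_state=0)
svm = SVC(kernel='linear')

selector = RFECV(estimator=svm, step=1, cv=KFold(5),
                 scoring='accuracy', n_jobs=-1)
X_sel = selector.fit_transform(X, y)                   # select features first
search = GridSearchCV(svm, {'C': [0.1, 1, 10]}, cv=5)  # then tune on X_sel
search.fit(X_sel, y)
print(selector.n_features_, search.best_params_)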
Code example #7
def plot_RFE(X, y):
    from sklearn.svm import LinearSVC
    from sklearn.model_selection import StratifiedKFold  # sklearn.cross_validation was removed in scikit-learn 0.20
    from sklearn.feature_selection import RFECV
    import matplotlib.pyplot as pl

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear", class_weight="balanced", cache_size=1200, shrinking=True)
    svc = LinearSVC(penalty='l1', loss='squared_hinge', dual=False,
                    class_weight='balanced', multi_class='ovr')
    # SGD = SGDClassifier(penalty='elasticnet', class_weight='balanced', n_jobs=-1, l1_ratio=0.15)
    # rfecv = RFECV(estimator=svc, step=0.1, cv=StratifiedKFold(5), scoring='roc_auc')
    rfecv = RFECV(estimator=svc, step=0.2, cv=StratifiedKFold(2), scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)

    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)
    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (nb of misclassifications)")
    scores = rfecv.cv_results_['mean_test_score']  # grid_scores_ was removed in scikit-learn 1.2
    pl.plot(range(1, len(scores) + 1), scores)
    pl.show()
    print('RFE Opt.shapes features CV score:')
    CV_multi_stats(X_RFE, y, svc)
    return (X_RFE, rfecv)
Code example #8
def featureSelectAndClassifyRFECV(X_train, X_test, y_train, y_test):

    scaler = MinMaxScaler()
    #scaler = StandardScaler()
    #scaler = RobustScaler()
    X_train_minmax = scaler.fit_transform(X_train)
    X_test_minmax = scaler.transform(X_test)

    #svc =svm.LinearSVC()
    rf = RandomForestClassifier(n_estimators=50, max_depth=20)

    rfecv = RFECV(estimator=rf,
                  step=1,
                  min_features_to_select=5,
                  cv=StratifiedKFold(5),
                  scoring='accuracy')

    X_train_transformed = rfecv.fit_transform(X_train_minmax, y_train)
    #X_train_transformed = rfecv.fit_transform(X_train, y_train)
    X_test_transformed = rfecv.transform(X_test_minmax)
    #X_test_transformed = rfecv.transform(X_test)
    score = rfecv.score(X_test_minmax, y_test)
    #score = rfecv.score(X_test, y_test)

    print('Optimal no. of features are ' + str(rfecv.n_features_))
    print('Score for test set is ' + str(score))
    print(rfecv.ranking_.shape)
    print(X_train_transformed.shape)
    print(X_test_transformed.shape)

    plt.figure()
    plt.xlabel('no. of features')
    plt.ylabel('cv score')
    scores = rfecv.cv_results_['mean_test_score']  # grid_scores_ was removed in scikit-learn 1.2
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
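Scaling and RFECV can also be chained in a Pipeline, so a single object handles the training and test splits consistently; a hedged sketch on synthetic data, not the function above:

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X, y = make_classification(n_samples=300, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,
                                                    random_state=0)

pipe = Pipeline([
    ('scale', MinMaxScaler()),
    ('select', RFECV(RandomForestClassifier(n_estimators=50, max_depth=20),
                     step=1, min_features_to_select=5,
                     cv=StratifiedKFold(5), scoring='accuracy')),
])
pipe.fit(X_train, y_train)
print('Optimal no. of features:', pipe.named_steps['select'].n_features_)
print('Score for test set:', pipe.score(X_test, y_test))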
Code example #9
File: wine.py  Project: rupakc/UCI-Data-Analysis
def recursiveFeatureSelectorCV(classifier_model, train_data, train_labels,
                               test_data, number_of_features):
    # NB: RFECV picks the optimal feature count itself; its second argument is
    # `step`, so number_of_features acts as the elimination step size here,
    # not as the number of features to keep.
    rfe = RFECV(classifier_model, step=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)

    return transformed_train_data, transformed_test_data
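RFECV chooses the number of features itself; when a caller wants exactly number_of_features columns kept, plain RFE with n_features_to_select is the matching tool. A minimal sketch:

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=150, n_features=12, random_state=0)
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=5)
X_reduced = rfe.fit_transform(X, y)
print(X_reduced.shape)  # (150, 5)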
Code example #12
    def selectFeatures(self, select_model):
        selector = RFECV(estimator=select_model, step=self.step, cv=self.cv)
        y = self.train[self.label]
        X = self.train.drop(self.label, axis=1)
        select_X = selector.fit_transform(X, y)
        select_features_index = selector.get_support(True)
        select_columns = X.columns[select_features_index]

        return select_X, select_columns
Code example #13
def SelectRFE_DTCV(dataf, targetf):
    estimator = DecisionTreeClassifier()
    selector = RFECV(estimator, cv=3)
    data_new = selector.fit_transform(dataf.values, targetf.values.ravel())
    outcome = selector.get_support(True)
    new_features = []  # The list of your K best features
    for ind in outcome:
        new_features.append(dataf.columns.values[ind])
    return pd.DataFrame(data_new, columns=new_features)
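An illustrative call of SelectRFE_DTCV on the iris data loaded as DataFrames (it assumes the snippet's imports, pandas and DecisionTreeClassifier among them, are in scope):

from sklearn.datasets import load_iris

iris = load_iris(as_frame=True)
reduced = SelectRFE_DTCV(iris.data, iris.target.to_frame())
print(reduced.columns.tolist())  # names of the features RFECV kept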
Code example #14
 def _feature_selection(self, data_matrix, target):
     try:
         # perform recursive feature elimination
         feature_selector = RFECV(self.estimator, step=self.step, cv=self.cv)
         data_matrix_out = feature_selector.fit_transform(data_matrix, target)
         self.feature_selectors.append(feature_selector)
         return data_matrix_out
     except Exception as e:
         logger.debug(e)
         return data_matrix
Code example #15
File: embedding.py  Project: gianlucacorrado/EDeN
def feature_selection(data_matrix, target):
    from sklearn.feature_selection import RFECV, SelectFromModel
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, shuffle=True, penalty='elasticnet')
    # drop low-weight features under the elastic-net penalty
    # (calling fit_transform on the classifier itself was removed in
    # scikit-learn 0.19; SelectFromModel is the current equivalent)
    data_matrix = SelectFromModel(estimator).fit_transform(data_matrix, target)
    # perform recursive feature elimination
    selector = RFECV(estimator, step=0.1, cv=10)
    data_matrix = selector.fit_transform(data_matrix, target)
    return data_matrix
Code example #17
File: Classifier.py  Project: jessab/ML
def selectBestFeaturesRFECV(samples, classifications,
                            featureNames, classifierClass):
    fs = RFECV(classifierClass.getEstimator())
    if sprs.issparse(samples):
        samples = samples.toarray()  # RFECV is fit on a dense array here
    samples = fs.fit_transform(samples, classifications)
    sup = fs.get_support()
    
    featureNames = [featureNames[i] for (i,s) in enumerate(sup) if s]
    return [samples,featureNames]
Code example #18
def pre_process(data_fname=None,
                target_fname=None,
                correlation_transformation=None,
                normalization=None,
                feature_selection=None,
                min_threshold=None,
                max_threshold=None,
                random_state=1):
    """Process data."""
    # load data
    data_matrix, gene_names, instance_names = _loaddata_matrix(data_fname)

    # prepare target
    y_orig, target_names = _load_target(target_fname)
    y_sel = _select_targets(y_orig,
                            min_threshold=min_threshold,
                            max_threshold=max_threshold)
    logger.info('original num classes: %d' % len(set(y_orig)))
    logger.info('selected %d classes with more than %d instances' %
                (len(y_sel), min_threshold))
    data_matrix, y_orig_sel = _filter_dataset(data_matrix, y_orig, y_sel)
    rows, cols = data_matrix.shape
    logger.info('num instances:%d  num features:%d' % (rows, cols))
    lenc = LabelEncoder()
    y = lenc.fit_transform(y_orig_sel)
    y = np.array(y)
    target_dict = dict()
    for i, c in enumerate(lenc.classes_):
        target_dict[i] = target_names[c]

    # normalization
    if normalization:
        logger.info('Normalization')
        data_matrix = normalize(data_matrix)

    # feature selection
    if feature_selection:
        estimator = SGDClassifier(random_state=random_state)
        cv = StratifiedKFold(n_splits=5,
                             shuffle=True,
                             random_state=random_state)
        selector = RFECV(estimator, step=20, cv=cv)
        data_matrix = selector.fit_transform(data_matrix, y)
        logger.info('Feature selection')
        rows, cols = data_matrix.shape
        logger.info('num instances:%d  num features:%d' % (rows, cols))

    # prepare data matrix
    if correlation_transformation:
        data_matrix = np.corrcoef(data_matrix)
        logger.info('Correlation coefficient transformation')
        rows, cols = data_matrix.shape
        logger.info('num instances:%d  num features:%d' % (rows, cols))
    return data_matrix, y, target_dict
Code example #19
class RFECVFeatureSelection:
    def __init__(self, estimator):
        self._rfecv = RFECV(estimator=estimator,
                            cv=StratifiedKFold(5),
                            scoring='recall')

    def execute(self, dataset):
        print('===== Feature selection - RFECV =====')
        dataset['features'] = self._rfecv.fit_transform(
            dataset['features'].toarray(), dataset['categories'])
        print(dataset['features'].shape)
        return dataset
Code example #20
 def _feature_selection(self, data_matrix, target):
     try:
         # perform recursive feature elimination
         step = max(int(data_matrix.shape[1] * self.step), 1)
         feature_selector = RFECV(self.estimator, step=step, cv=self.cv)
         data_matrix_out = feature_selector.fit_transform(
             data_matrix, target)
         self.feature_selectors.append(feature_selector)
         return data_matrix_out
     except Exception as e:
         logger.debug(e)
         return data_matrix
Code example #21
File: mlp_cqq.py  Project: ashinwz/lmmd_experiments
def rfe_filter(feature_filter, finger_feature):
    from sklearn.svm import SVC
    global label
    label = data["n_np"].replace({"p": 1, "n": 0})
    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(5), scoring='roc_auc')
    finger_three = rfecv.fit_transform(finger_feature, label)
    # rfecv_get = rfecv.get_support(indices=True)
    # finger_three = finger_feature[rfecv_get]
    print("          ", finger_three.shape)
    print("Optimal number of features : %d" % rfecv.n_features_)
    return finger_three
Code example #22
def trainModel(model, db, indexes, tests, goal):
    X = db.copy().drop(goal, axis=1)
    X = X.loc[indexes, :]
    y = db.copy()[goal]
    y = y[indexes]

    high_score = 0
    score_list = []
    topTRF = 0
    topFeatsRF = 0
    topFeatsPosRF = 0
    topFeatsRankRF = 0
    featValsRF = 0
    topModel = RandomForestRegressor(n_estimators=100)
    topX_train = 0
    topX_test = 0
    topy_train = 0
    topy_test = 0

    for t in tests:
        print(t)
        #Variable to store the optimum features

        for n in range(1, len(X.columns)):
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=t,
                                                                random_state=0)
            model = RandomForestRegressor(n_estimators=100)
            # NB: RFECV's second argument is `step`, so `n` sets the
            # elimination step size; RFECV still chooses the feature count.
            rfe = RFECV(model, step=n, cv=10)
            X_train_rfe = rfe.fit_transform(X_train, y_train)
            X_test_rfe = rfe.transform(X_test)
            model.fit(X_train_rfe, y_train)
            score = model.score(X_test_rfe, y_test)
            score_list.append(score)

            if (score > high_score and rfe.n_features_ > 9
                    and rfe.n_features_ < 40):
                topTRF = t
                high_score = score
                nof = rfe.n_features_
                topFeatsPosRF = rfe.support_
                topFeatsRF = X.columns[topFeatsPosRF]
                topFeatsRankRF = rfe.ranking_
                featValsRF = model.feature_importances_
                topModel.fit(X_train_rfe, y_train)
                topX_train = X_train
                topX_test = X_test
                topy_train = y_train
                topy_test = y_test
                print("Score with %d features: %f" % (nof, high_score))

    return topModel, topFeatsRF, topFeatsPosRF, topX_train, topX_test, topy_train, topy_test
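Since RFECV's second argument is step (the loop above passes n as a step size while RFECV still picks the feature count on its own), a search over exact feature counts is normally written with RFE instead; a trimmed, hedged sketch:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=15, noise=5.0, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)

best_score, best_n = -float('inf'), None
for n in range(1, X.shape[1] + 1):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    rfe = RFE(model, n_features_to_select=n)   # keep exactly n features
    X_train_rfe = rfe.fit_transform(X_train, y_train)
    model.fit(X_train_rfe, y_train)
    score = model.score(rfe.transform(X_test), y_test)
    if score > best_score:
        best_score, best_n = score, n
print(best_n, best_score)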
Code example #23
def rfecv_fc(X, y, estimator):
    print('RFECV FEATURE SELECTION:')
    selector = RFECV(estimator, step=1, cv=5)
    og_X = pd.DataFrame(X)
    X = selector.fit_transform(og_X, y)
    print('Optimal number of features :', selector.n_features_)
    print('Best features index :', og_X.columns[selector.support_])
    print('Best features:')
    for x in og_X.columns[selector.support_]:
        print(tmp[x])  # `tmp` is defined elsewhere in the module
    ml_alg(X, y)
    return og_X.columns[selector.support_].tolist()
Code example #24
 def apply(self, X_mat, y_train):
     rfe_settings = self.rfe_settings
     kwargs = rfe_settings['kwargs']
     #estimator_class_name = kwargs['estimator']
     #current_mod = importlib.import_module('feature.selection')
     #estimator_class = getattr(current_mod,estimator_class_name)
     #estimator = estimator_class()
     estimator = SVC(kernel='linear')
     remaining_kwargs_keys = filter(lambda x: x not in ['estimator'],
                                    kwargs.keys())
     remaining_kwargs = {k: kwargs[k] for k in remaining_kwargs_keys}
     selector = RFECV(estimator, **remaining_kwargs)
     X_filt = selector.fit_transform(X_mat, y_train)
     return pd.DataFrame(X_filt)
Code example #25
def feature_selection(train_data, train_target, test_data, unknown_data):
    """ Selects features based on cross validation with Lasso 
        This method determined the above removed columns
        Not calling it everytime, because it takes ages to run
    """
    lasso = Lasso()
    selector = RFECV(lasso, cv=3)

    train = selector.fit_transform(train_data, train_target)
    test = selector.transform(test_data)
    unknown = selector.transform(unknown_data)

    print(selector.support_)  # mask of used and deleted columns
    return (train, test, unknown)
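A runnable miniature of the same Lasso-driven selection, with synthetic arrays standing in for the real train/test/unknown sets:

from sklearn.datasets import make_regression
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Lasso

X, y = make_regression(n_samples=120, n_features=10, n_informative=4,
                       noise=1.0, random_state=0)
X_test, X_unknown = X[:20], X[20:40]  # stand-ins for the other datasets

selector = RFECV(Lasso(alpha=0.1), cv=3)
train = selector.fit_transform(X, y)
test = selector.transform(X_test)
unknown = selector.transform(X_unknown)
print(selector.support_)  # mask of kept and dropped columns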
Code example #26
def RFECV_DT(df, test_size=0.3, cv=5, min_features_to_select=7, max_depth=4):
    X = df.drop(['class'], axis=1)
    y = df['class']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=1, stratify=y)
    dt_rfecv = DecisionTreeClassifier(max_depth=max_depth)
    rfecv = RFECV(dt_rfecv, min_features_to_select=min_features_to_select,
                  cv=StratifiedKFold(cv))
    X_train_rfecv = rfecv.fit_transform(X_train, y_train)
    X_test_rfecv = rfecv.transform(X_test)

    columns = X.columns
    rank = pd.DataFrame({'feature': columns, 'rank': list(rfecv.ranking_)})
    rank = rank.sort_values(by=['rank'], ascending=True)
    top_rank = df[rank['feature'][rank['rank']==1]]
    top_rank = pd.concat([top_rank, df['class']], axis=1)
    return top_rank
Code example #27
def recursive_feature_elimination_cv(input_data,
                                     feature_names,
                                     step=0.1,
                                     cv=3,
                                     estimator=SVC(kernel='linear')):
    """
    Recursively elinates features from x_train and x_test with cross
    validation, uses scikit-learn's RFECV see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
    If feature_names is given it is also returned with any features from
    x_train and x_test also removed from feature_names.

    Args:
        input_data (tuple):     x_train, y_train, x_test, y_test
        feature_names:          The names of all features before feature
                                selection or None.
        estimator (object):     Passed to RFECV, see documentation
        step (int or float):    Passed to RFECV, see documentation
        cv (int):               Passed to RFECV, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)
    feature_selector = RFECV(estimator, step=step, cv=cv)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)
    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)

    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    args = {'step': step, 'cv': cv, 'estimator': estimator}

    return output_data, feature_names, args
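A usage sketch for recursive_feature_elimination_cv with synthetic data and made-up feature names (it reuses the function above together with its default linear SVC):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=25, random_state=0)
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=0)
names = np.array(['f%d' % i for i in range(X.shape[1])])

output_data, kept_names, args = recursive_feature_elimination_cv(
    (x_train, y_train, x_test, y_test), feature_names=names)
print(output_data[0].shape, len(kept_names), args)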
Code example #28
    def select_by_RFECV(self, model=None):
        # Plot how the score changes as the number of features grows.
        # Backward elimination: start from the full feature set and
        # iteratively remove the worst remaining feature.
        # `estimator` is the base model.

        selector = RFECV(estimator=model)
        selector.fit(self.train_X, self.train_y)
        # scores = list(map(lambda x: round(x, 4), selector.cv_results_['mean_test_score']))
        # print("Score at each feature count: {}".format(scores))

        model_name = str(model).split('(')[0]
        plt.figure()
        plt.title('RFECV of {}'.format(model_name))
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score (nb of correct classifications)")
        scores = selector.cv_results_['mean_test_score']  # grid_scores_ was removed in scikit-learn 1.2
        plt.plot(range(1, len(scores) + 1), scores)
        plt.grid()
        plt.show()
Code example #29
def performFS(texto,clfFS):
  skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)  # the old StratifiedKFold(y, n_folds=...) API was removed in scikit-learn 0.20
  #print "FS tuned by " + texto
  
  fs = HmbFS(clfFS, skf, 1.5,1.5)
  X_withHmbFS = fs.fit_transform(X,y)
  #print "Done0"

  fs = HmbFS(clfFS, skf, 1.0,4.0)
  X_withHmbFSCV = fs.fit_transform(X,y)
  #print "Done1"

  #competencia = RFECV(clfFS, step=0.1, cv=skf, scoring='accuracy')
  stepSize=int(np.ceil(len(X[0])/10.0))
  competencia = RFECV(clfFS, step=stepSize, cv=skf, scoring='accuracy', verbose=0)
  X_withRFECV = competencia.fit_transform(X,y)
  #print "Done2"

  return X_withHmbFS,X_withHmbFSCV,X_withRFECV
Code example #30
File: main.py  Project: xdjwolf/BrainNet-ML-ToolBox
def feature_selection(train_x, train_y, test_x):

    """
    The method uses Recursive Feature Elimination Feature method to choose subset of features.
    It is a wrapper method of feature selection techniques.
    The main purpose is to reduce the dimension of the samples to avoid curse of dimensionality.

    Parameters
    ----------
    train_x: features of training data
    train_y: labels of training data
    test_x: features of testing data
    """

    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc, step=1, cv=ShuffleSplit(n_splits=10, test_size=0.25, random_state=0),
                  n_jobs=-1, scoring='accuracy')

    reduced_train_x = rfecv.fit_transform(train_x, train_y)
    reduced_test_x = rfecv.transform(test_x)
    return reduced_train_x, reduced_test_x
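A quick illustrative call of the feature_selection helper above (it assumes SVC, RFECV and ShuffleSplit are imported in the module, as the snippet implies):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, n_features=30, random_state=0)
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25,
                                                    random_state=0)
reduced_train_x, reduced_test_x = feature_selection(train_x, train_y, test_x)
print(reduced_train_x.shape, reduced_test_x.shape)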
Code example #31
def test_refcv():
    # load the data
    iris = load_iris()
    x, y = iris.data, iris.target
    # feature selection
    estimator = LinearSVC()
    selector = RFECV(estimator, cv=5)
    x_t = selector.fit_transform(x, y)
    # split into training and test sets
    x_train, x_test, y_train, y_test = model_selection.train_test_split(
        x, y, test_size=0.25, random_state=0, stratify=y)
    x_train_t, x_test_t, y_train_t, y_test_t = model_selection.train_test_split(
        x_t, y, test_size=0.25, random_state=0, stratify=y)
    # train and evaluate
    clf = LinearSVC()
    clf_t = LinearSVC()
    clf.fit(x_train, y_train)
    clf_t.fit(x_train_t, y_train_t)
    print(clf.score(x_test, y_test))
    print(clf_t.score(x_test_t, y_test_t))

Code example #33
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

df1 = pd.read_csv('EventDetectionData.csv')
scores = []
for i in range(150, 200):
    score = []
    X_train, X_test, y_train, y_test = train_test_split(
        df1.iloc[:, 1:i], df1['target'], test_size=0.3,
        random_state=69)  # 70% training and 30% test

    log = LogisticRegression(solver='liblinear')  # liblinear supports both penalties ('l1', 'l2') searched below

    rfecv = RFECV(estimator=log, step=1, cv=5, scoring='roc_auc')

    X_train_new = rfecv.fit_transform(X_train, y_train)
    X_test_new = rfecv.transform(X_test)

    j = rfecv.n_features_

    C_range = 10.**np.arange(-5, 1)

    penalty_options = ['l1', 'l2']

    param_grid = dict(C=C_range, penalty=penalty_options)

    grid = GridSearchCV(log, param_grid, cv=5, scoring='roc_auc')

    grid.fit(X_train_new, y_train)

    y_train_pred = grid.predict(X_train_new)
Code example #34
            Caracteristicas = Datos['CaracteristicasD']

        etiqueta = Datos['etiquetas']
        etiquetas = etiqueta.reshape((etiqueta.shape[0]))

        Resultados = np.zeros(shape=(4, 5))
        ResultadosSTD = np.zeros(shape=(4, 5))
        ResultadosCompletos = np.zeros(shape=(4, 5, grupos))
        start_time = time.time()

        for vent in range(4):
            desc = 1
            Caracteristicas_vent = Caracteristicas['vent'][0, desc][
                0, vent]  #[0,desc,0,vent]
            Car_dff = pd.DataFrame(Caracteristicas_vent)
            Car_dfS = selector_rfecv.fit_transform(Car_dff, etiquetas)
            Car_df = lda.fit_transform(Car_dfS, etiquetas)

            ############ KNN
            clasificador_knn = KNeighborsClassifier(n_neighbors=10,
                                                    weights="uniform")
            accuracy_knn = cross_val_score(clasificador_knn,
                                           X=Car_df,
                                           y=etiquetas,
                                           scoring='accuracy',
                                           cv=grupos,
                                           n_jobs=-1)
            ResultadosCompletos[vent, 0, :] = accuracy_knn
            Resultados[vent, 0] = accuracy_knn.mean()
            ResultadosSTD[vent, 0] = accuracy_knn.std()
Code example #35
File: feature_action.py  Project: iihcy/Rob
class MyApp(QtGui.QMainWindow, Ui_MainWindow):
    
    def __init__(self):
        self.x_data = list()
        self.y_data = list()
        QtGui.QMainWindow.__init__(self)
        Ui_MainWindow.__init__(self)
        self.setupUi(self)
        self.rfwlv_action.clicked.connect(self.rfwlv)
        self.ufs_action.clicked.connect(self.ufs)
        self.rfe_action.clicked.connect(self.rfe)
        # add the standardization radio buttons to group bg01
        self.bg01 = QtGui.QButtonGroup()
        self.bg01.addButton(self.s_radio_1,1)
        self.bg01.addButton(self.s_radio_2,2)
        # s_radio_1 is selected by default
        self.s_radio_1.setChecked(True)
        # add the dataset-split radio buttons to group bg02
        self.bg02 = QtGui.QButtonGroup()
        self.bg02.addButton(self.d_radio_1,1)
        self.bg02.addButton(self.d_radio_2,2)
        # d_radio_1 is selected by default
        self.d_radio_1.setChecked(True)
        
    def rfwlv(self):
        self.bz()    # standardize
        self.stt()   # split the dataset
        self.dtc01()
    
    def ufs(self):
        self.bz()    # standardize
        self.stt()   # split the dataset
        self.dtc03()
    
    def rfe(self):
        self.bz()    # standardize
        self.stt()   # split the dataset
        self.dtc04()
    
    # data standardization
    def bz(self):
        if self.bg01.checkedId() == 1:
            self.x = preprocessing.scale(self.x_data)
        else:
            min_max_scaler = preprocessing.MinMaxScaler()
            self.x = min_max_scaler.fit_transform(self.x_data)
    
    # split the training and test data
    def stt(self):
        # split both the features and the labels, producing four
        # datasets: x_train, x_test, y_train, y_test
        self.x_train = list()
        self.x_test = list()
        self.y_train = list()
        self.y_test = list()
        if self.bg02.checkedId() == 1:
            strte = self.tt_box.itemText(self.tt_box.currentIndex())
            s01 = str(strte).split(':')
            if len(s01) == 2:
                xnum = math.ceil((int(s01[0])*1.0/10)*len(self.x_data))
                
                for i in range(len(self.x_data)):
                    if i <= xnum:
                        self.x_train.append(self.x_data[i])
                        self.y_train.append(self.y_data[i])
                    else:
                        self.x_test.append(self.x_data[i])
                        self.y_test.append(self.y_data[i])
        else:
            ts01 = int(self.train.text())
            ts02 = int(self.test.text())
            for i in range(ts01+ts02):
                if i < ts01:
                    self.x_train.append(self.x_data[i])
                    self.y_train.append(self.y_data[i])
                else:
                    self.x_test.append(self.x_data[i])
                    self.y_test.append(self.y_data[i])
    
    '''
    Main functions
    '''
    def dtc01(self):
        # convert y to 1-D: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
            
        # collect the distinct labels
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print (self.labels)
        
        # VarianceThreshold implementation
        # read the parameters
        if not self.th_edit.text().strip():
            self.threshold = 0.0
        else:
            self.threshold = float(self.th_edit.text())

        # define the model
        self.clf = VarianceThreshold(threshold=self.threshold)
        self.clf.fit_transform(self.x_train)
        self.f_c = self.clf.get_support()
        '''
        Configure the dtable widget to display the training-set results
        '''
        # display the VarianceThreshold results
        self.rfwlv_dtable.setRowCount(2)
        self.rfwlv_dtable.setColumnCount(len(self.x_train[0]))
        mlan = "是否保留该特征(T/F)"
        self.rfwlv_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
        self.rfwlv_dtable.setItem(0,0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
        for j in range(len(self.f_c)):
            self.rfwlv_dtable.setItem(1,j, QtGui.QTableWidgetItem(str(self.f_c[j])))
    
    
    def dtc03(self):
        # convert y to 1-D: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
            
        # collect the distinct labels
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print (self.labels)

        # SelectKBest implementation
        # read the parameters
        if not self.kedit.text().strip():
            self.k = 10
        else:
            self.k = int(self.kedit.text())
        
        if not self.pedit.text().strip():
            self.param = 1e-05
        else:
            self.param = float(self.pedit.text())
        
        self.mode = self.mo_box.itemText(self.mo_box.currentIndex())
        
        # define the model
        if self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectKBest':
            self.clf = SelectKBest(score_func= f_classif,  k=self.k) 
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        elif self.sp_box.itemText(self.sp_box.currentIndex()) == 'SelectPercentile':
            self.clf = SelectPercentile(score_func= f_classif,  percentile= self.k) 
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        else:
            self.clf = GenericUnivariateSelect(score_func= f_classif, mode= self.mode, param=self.param) 
            self.clf.fit_transform(self.x_train,self.y01_train)
            self.f_c = self.clf.get_support()
        '''
        Configure the dtable widget to display the training-set results
        '''
        # display the univariate selection results
        self.ufs_dtable.setRowCount(2)
        self.ufs_dtable.setColumnCount(len(self.x_train[0]))
        mlan = "是否保留该特征(T/F)"
        self.ufs_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
        self.ufs_dtable.setItem(0,0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
        for j in range(len(self.f_c)):
            self.ufs_dtable.setItem(1,j, QtGui.QTableWidgetItem(str(self.f_c[j])))
        
    def dtc04(self):
        # convert y to 1-D: self.y_train, self.y_test
        self.y01_train = list()
        self.y01_test = list()
        for a in range(len(self.y_train)):
            self.y01_train.append(self.y_train[a][0])
        for b in range(len(self.y_test)):
            self.y01_test.append(self.y_test[b][0])
            
        # collect the distinct labels
        self.labels = list()
        for c in range(len(self.y_test)):
            if self.labels.count(self.y_test[c][0]) == 0:
                self.labels.append(self.y_test[c][0])
        print (self.labels)
        
        # RFECV implementation
        # read the parameters
        if not self.stepedit.text().strip():
            self.step = 1
        else:
            self.step = int(self.stepedit.text())
        
        if not self.cvedit.text().strip():
            self.cv = 5
        else:
            self.cv = int(self.cvedit.text())
        # define the model
        estimator = SVR(kernel="linear")
        self.clf = RFECV(estimator, step=self.step, cv=self.cv)
        self.clf.fit(self.x_train,self.y01_train)
        
        self.f_c = self.clf.get_support()
        '''
        Configure the dtable widget to display the training-set results
        '''
        # display the RFE results
        self.rfe_dtable.setRowCount(2)
        self.rfe_dtable.setColumnCount(len(self.x_train[0]))
        mlan = "是否保留该特征(T/F)"
        self.rfe_dtable.setSpan(0, 0, 1, len(self.x_train[0]))
        self.rfe_dtable.setItem(0,0, QtGui.QTableWidgetItem(mlan.decode('utf-8')))
        for j in range(len(self.f_c)):
            self.rfe_dtable.setItem(1,j, QtGui.QTableWidgetItem(str(self.f_c[j])))
        
    # save the model
    def out_model(self):
        self.filepath=str(QtGui.QFileDialog.getSaveFileName(self,"文件保存","F:/","Model Files (*.model)"))
        joblib.dump(self.clf, self.filepath.decode('GB2312'))
Code example #36
def main(args):
    if args.train_dir is None:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        #args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        #        args.train_dir =  r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir =  r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    feature_cols = [
        col for col in df.columns
        if col not in ['classname', 'Id', 'proteinname']
    ]
    feature_cols = numpy.array(feature_cols)

    X = df[feature_cols].values
    y = df.classname.values

    le = LabelEncoder()
    y = le.fit_transform(y)

    "Initial feature selection trimming"
    print(X.shape)

    Fwe = SelectFwe(alpha=0.01).fit(X, y)
    X = Fwe.transform(X)
    print("F-test -> ", X.shape)
    feature_cols = feature_cols[Fwe.get_support()]
    '''
    FeatSelection_SVM = True
    if FeatSelection_SVM == True:
        svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y)
        X = svc_L1.transform(X, y)
        print ("L1 SVM Transformed X:",X_L1.shape)
        feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
    '''

    k = SelectKBest(k=255).fit(X, y)
    X = k.transform(X)
    feature_cols = feature_cols[k.get_support()]

    param_dist = {
        "max_depth": [6, 9, None],
        "max_features": ['auto', 0.4],
        "min_samples_leaf": [1, 2, 3],
        "bootstrap": [True, False],
        'min_samples_split': [2, 3],
        "criterion": ["gini"],
        "n_estimators": [100],
        "n_jobs": [-1]
    }

    rf = RandomForestClassifierWithCoef(max_depth=7,
                                        min_samples_split=1,
                                        min_samples_leaf=2,
                                        n_estimators=50,
                                        n_jobs=2,
                                        max_features="auto")

    "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class."

    scores = cross_validation.cross_val_score(
        rf,
        X,
        y,
        n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" %
          (scores.mean(), scores.std() * 2))
    "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'."
    scores_f1 = cross_validation.cross_val_score(
        rf,
        X,
        y,
        n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2),
        scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" %
          (scores_f1.mean(), scores_f1.std() * 2))

    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf, step=20, cv=2,
                      scoring='f1')  #average_precision , recall
    X_RFE = rfeSelect.fit_transform(X, y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    RFE_ScoreRatio = 100 * (cross_validation.cross_val_score(
        rf,
        X_RFE,
        y,
        n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2),
        scoring='f1').mean()) / scores_f1.mean()
    print(
        "Even with just", X_RFE.shape[1],
        " features, we have %f performance! (f1 score ratio)" %
        (RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
Code example #37
def preProcess(theFileName):
    df = pd.read_csv(str(theFileName))
    if 'Unnamed: 0' in df.columns:
        df = df.drop('Unnamed: 0', axis = 1)
    labBin = sklearn.preprocessing.LabelBinarizer()
    df['y'] = labBin.fit_transform(df['y'])
    dp = pd.get_dummies(df)
    X = dp.drop('y', axis = 1) 
    y = dp[['y']]

    # get the features
    theFeatures = X.columns

    # convert the dataframes to arrays
    X = X.values
    y = y.values
    y.shape = np.shape(y)[0]

    yOrig = y[:] # need this later for plotting feature impacts

    # and carry out feature scaling
    X = StandardScaler().fit_transform(X)

    #=======================================================================

    # apply random undersampling if labels are imbalanced
    labelSkewness = 100*np.sum(y)*1./np.shape(y)[0]
    if np.min([labelSkewness, 100-labelSkewness]) < (100./3.):
        rus = RandomUnderSampler(verbose=0)
        X, y = rus.fit_resample(X, y)  # fit_sample was renamed fit_resample in imbalanced-learn 0.4

    #=======================================================================

    # select optimal number of features
    thisModel = LogisticRegression(penalty='l1', C=1, solver='liblinear')  # liblinear supports the l1 penalty
    rfecv = RFECV(estimator=thisModel, step=1, cv=StratifiedKFold(n_splits=3), scoring='f1')
    Xt = rfecv.fit_transform(X, y)

    optimalNumberOfFeatures = rfecv.n_features_
    introReport = ['Optimal Number of Attributes: ' + str(optimalNumberOfFeatures), 'The following attributes are the most influential to the outcome']

    #=======================================================================

    # plot number of selected features VS cross-validation scores
    plt.figure(figsize=(12, 8))

    plt.xlabel("Number of Attributes", fontsize=20)
    plt.ylabel("Score", fontsize=20)
    plt.title("Attribute Selection", fontsize=25)
    scores = rfecv.cv_results_['mean_test_score']  # grid_scores_ was removed in scikit-learn 1.2
    plt.plot(range(1, len(scores) + 1), scores)

    imgOne = 'static/thePlot.jpg'
    plt.savefig('flask_files/'+imgOne, dpi=300)
    
    #=======================================================================

    # get the feature feature importance rankings
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X,y)
    theImportances = list(model.feature_importances_)
    sortedImportances = sorted(theImportances,reverse = True)

    # ...and print the selected features along with their weights and ranks
    tableOne = []
    for ii in range(1,optimalNumberOfFeatures+1):
        tableOne.append(dict(Feature = str(theFeatures[theImportances.index(sortedImportances[ii-1])]), Weight = str(sortedImportances[ii-1]), Rank = str(ii)))

    #=======================================================================

    # plot histogram of the most important feature
    thisFeature = 0
    allThoseFeatures = dp[theFeatures[theImportances.index(sortedImportances[thisFeature])]]

    plt.figure(figsize=(12, 8))
    
    combinedOutcomes = plt.hist(allThoseFeatures, bins=10)

#    plt.hist(allThoseFeatures, bins=10)
    plt.xlabel('Attribute: ' + theFeatures[theImportances.index(sortedImportances[0])], fontsize=20)
    plt.ylabel('Count', fontsize=20)
    plt.title('Impact of the Most Influential Attribute', fontsize=25)

    imgTwo = 'static/theHist.jpg'
    plt.savefig('flask_files/'+imgTwo, dpi=300)

    #=======================================================================

    # plot impact of the most important feature
    positiv = allThoseFeatures[yOrig==1]
    negativ = allThoseFeatures[yOrig==0]

    plt.figure(figsize=(12, 8))
    
    negA = plt.hist(negativ,bins=combinedOutcomes[1])
    posA = plt.hist(positiv,bins=combinedOutcomes[1])
#    yUpperLimit = np.max([negA[0], posA[0]])*1.01

#    plt.subplot(1,2,1)
#    plt.hist(negativ,bins=combinedOutcomes[1])
#    plt.ylim(ymax = yUpperLimit*1.01, ymin = 0)
#    plt.xlabel(theFeatures[theImportances.index(sortedImportances[thisFeature])], fontsize=16)
#    plt.ylabel('Count', fontsize=16)
#    plt.title('Negative', fontsize=20)
#
#    plt.subplot(1,2,2)
#    plt.hist(positiv,bins=combinedOutcomes[1])
#    plt.ylim(ymax = yUpperLimit, ymin = 0)
#    plt.xlabel(theFeatures[theImportances.index(sortedImportances[thisFeature])], fontsize=16)
#    plt.title('Positive',fontsize=20)
#
#    imgThree = 'static/theNegPosHist.jpg'
#    plt.savefig('flask_files/'+imgThree, dpi=300)

    #=======================================================================
    
    a = posA[0]
    b = negA[0]
    c = combinedOutcomes[0]

    posImpact = np.divide(a,c)
    negImpact = np.divide(b,c)

    midPoints=[]
    for i in range(1,len(combinedOutcomes[1])):
        midPoints.append((combinedOutcomes[1][i] + combinedOutcomes[1][i-1])/2.)

    for i in range(len(posImpact)):
        if np.isnan(posImpact[i]):
            posImpact[i]=0
        if np.isnan(negImpact[i]):
            negImpact[i]=0

    plt.figure(figsize=(12, 8))
    # plt.hold() was removed from matplotlib; axes overlay by default
    plt.plot(midPoints, posImpact, '.', markersize=20, label='Positive')
    plt.plot(midPoints, negImpact, 'r.', markersize=20, label='Negative')
    plt.legend(prop={'size':20})
    plt.xlabel(theFeatures[theImportances.index(sortedImportances[thisFeature])], fontsize=16)
    plt.ylabel('Relative Impact', fontsize=20)
    plt.grid()

    imgThree = 'static/theNegPosHist.jpg'
    plt.savefig('flask_files/'+imgThree, dpi=300)

    #=======================================================================

    # generate plots for report (this is save to an "html" file)

    # NB: bokeh.charts was removed from bokeh (moved to the abandoned bkcharts
    # package); this import only works with old bokeh releases.
    from bokeh.charts import Histogram, output_file, show, save, gridplot
    from bokeh.plotting import figure

    plotList=[]

    for i in range(optimalNumberOfFeatures):
        thisFeatureIs = theFeatures[theImportances.index(sortedImportances[i])]
        allThoseFeatures = dp[thisFeatureIs]
        combinedOutcomes = plt.hist(allThoseFeatures, bins=10)
        
        positiv = allThoseFeatures[yOrig==1]
        negativ = allThoseFeatures[yOrig==0]
        negA = plt.hist(negativ,bins=combinedOutcomes[1])
        posA = plt.hist(positiv,bins=combinedOutcomes[1])
        posImpact = np.divide(posA[0],combinedOutcomes[0])
        negImpact = np.divide(negA[0],combinedOutcomes[0])
        
        midPoints=[]
        for i in range(1,len(combinedOutcomes[1])):
            midPoints.append((combinedOutcomes[1][i] + combinedOutcomes[1][i-1])/2.)
        
        for i in range(len(posImpact)):
            if np.isnan(posImpact[i]):
                posImpact[i]=0
            if np.isnan(negImpact[i]):
                negImpact[i]=0

        hist0 = Histogram(dp, values=thisFeatureIs, color='blue', title="Impact of " + thisFeatureIs, bins=10)
        plot0 = figure()
        plot0.xaxis.axis_label = thisFeatureIs
        plot0.yaxis.axis_label = "Relative Impact"
        #     plot0.title = "Relative Impact of " + thisFeatureIs
        plot0.circle(midPoints, list(negImpact), size=10, color="red", alpha=0.9, legend='Negative')
        plot0.circle(midPoints, list(posImpact), size=10, color="green", alpha=0.9, legend='Positive')
        plotList.append([hist0,plot0])

    output_file("flask_files/static/Report.html", title = "Report")
    hist = gridplot(plotList)
    save(hist)

    #=======================================================================

    # specify the models to run tests with
    theModels = {'Logistic Regression':LogisticRegression(penalty='l1'), 'LDA':LinearDiscriminantAnalysis(), 'SVM':SVC(kernel='linear'), 'Random Forest':RandomForestClassifier(n_estimators=300)}

    # ...then display the results of the tests
    classifierComparisons=[]
    for aModel in theModels:
        model = theModels[aModel]
        results = cross_val_score(model, Xt, y, scoring='f1', cv=StratifiedKFold(n_splits=3))
        classifierComparisons.append(dict(Classifier=aModel, Score=np.max(results)))

    #=======================================================================

    # display the plots
    theJPGs = [imgOne, imgTwo, imgThree]

    #=======================================================================

    return introReport, tableOne, optimalNumberOfFeatures, classifierComparisons, theJPGs
Code example #38
File: PipeTasks.py  Project: Sandy4321/ProFET
        if FeatSelection_RFECV:
            rfecv = RFECV(estimator=svc, step=0.1,
                         cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.33),
                         scoring='f1',verbose=0)
            # alternative scorers: 'roc_auc', 'recall', 'f1', ...
        else:
            rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.1)
        rfecv.fit(X, y)
        if FeatSelection_RFECV:
            print("RFECV selected an optimal %d features:" % (rfecv.n_features_))
        print("RFE (%d features) score:" % (rfecv.n_features_), rfecv.score(X, y))
        # take the selected names before reducing featureNames, so the boolean
        # support mask indexes an array of matching length
        rfe_featnames = featureNames[rfecv.get_support()]
        featureNames = rfe_featnames
        print("RFE selected feature names:")
        print(rfe_featnames)
        X_RFE = rfecv.transform(X)  # rfecv is already fitted; fit_transform would redo the fit
        print(X_RFE.shape,"X_RFE \n")

        # set GetRFEPerf to True (or expose it to the user) if performance of the reduced set is wanted
        GetRFEPerf=False


    print("\n X: \n")
    ModelParam_GridSearch(X,y,cv=4)

    if GetRFEPerf:
        print("\nGrid search on the RFE-reduced X:\n")
        ModelParam_GridSearch(X_RFE,y,cv=4)

    GetPCAPerf=False
    if GetPCAPerf:
        pass  # (PCA-performance body truncated in this snippet)
Code example #39
File: OutPutRes.py Project: MichaelDoron/ProFET
def GetAllPerf (filePaths=None):
    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq', pattern='trainingSetFeatures.csv'))

    #Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n",filePaths)
    fileNames = fileNameFromPaths(filePaths)
    print("FileNames:",fileNames)


    resDict = pd.DataFrame(index=fileNames,
        columns=['Accuracy','Accuracy_SD',
        'f1','f1_SD','dummy_freq:Accuracy','dummy_freq:f1',
        'LargestClassPercent','Classes',
        'TopRFE-Features',  # filled by the RFE branch below
        # 'Best (f1) Model parameters',
         '# Classes',
         'Array-Acc-Scores' ,'Array-f1-Scores'
         ,'bestML-Acc','bestML-f1','dummy_freq_f1_weighted'])


    # resDict holds results for each file/class, for saving to the output file

    for i, filePath in enumerate(filePaths):

        # Gotcha - backslashes in Windows filenames:
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName=str(fileNames[i]) #Str added now 14.1

        print("fileName: %s" %(fileName))
        "resDict['Name']= fileName"

        # filePath = str(argv[1])
        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file') # X, y = features, labels
        X, y, lb_encoder,featureNames = load_data(filePath, 'file') # X, y = features, labels
        print(X.shape,"= (samples, features)")
        y_inv = Counter(lb_encoder.inverse_transform(y))
        MajorityPercent = round(100*y_inv.most_common()[0][1]/sum(y_inv.values()),1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)

        # .at assignment avoids pandas chained-indexing pitfalls
        resDict.at[fileName, 'LargestClassPercent'] = MajorityPercent
        resDict.at[fileName, 'Classes'] = str(lb_encoder.classes_)
        resDict.at[fileName, '# Classes'] = len(lb_encoder.classes_)

        KFilt = 350  # temporary, for the outputs - saves computation time; barely filters compared to the model itself. Set to None to skip.

        if KFilt is not None:
            k = SelectKBest(k=KFilt).fit(X,y)
            X=k.transform(X)
            featureNames=featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X,y)
        X=Fwe.transform(X)
        featureNames=featureNames[Fwe.get_support()]

        print("X reduced to K best features: ",X.shape)


        FeatSelection_SVM=False #Feature Names need updating!!
        FeatSelection_RandLogReg=False

        if FeatSelection_RandLogReg:
            # NOTE: RandomizedLogisticRegression was removed in scikit-learn 0.21
            LogRegFeats = RandomizedLogisticRegression(C=10, scaling=0.5,
             sample_fraction=0.95, n_resampling=40, selection_threshold=0.2,n_jobs=-1).fit(X,y)
            X_L1 = LogRegFeats.transform(X)
            featureNames=featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:",X_L1.shape)

        elif FeatSelection_SVM:
            # penalty must be "l1" for the zero-coefficient selection below to make sense
            svc_L1 = LinearSVC(C=30, penalty="l1", dual=False, class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X)  # transform takes X only, not y
            featureNames=featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print("L1 SVM Transformed X:",X_L1.shape)
        # X=X_L1  (note: X itself is not reduced here, only featureNames/X_L1)
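
        # Hedged sketch (illustrative, not the project's code): on scikit-learn
        # >= 0.19 the estimator .transform() used above is gone; SelectFromModel
        # is the supported route to the same L1-based selection.
        _sketch_selectfrommodel = False
        if _sketch_selectfrommodel:
            from sklearn.feature_selection import SelectFromModel
            from sklearn.svm import LinearSVC as _LSVC
            _sfm = SelectFromModel(_LSVC(C=30, penalty="l1", dual=False)).fit(X, y)
            X_L1 = _sfm.transform(X)
            featureNames = featureNames[_sfm.get_support()]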

        '''
        print("Performance as a function of percent of features used:")
        PlotPerfPercentFeatures(X,y,est=LinearSVC())
        '''

        # E.g. - graph best features; feature selection using RF, ensemble classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb

        RFE_FeatsToKeep = 16
        FeatSelection_RFE=False
        FeatSelection_RFECV=False

        if FeatSelection_RFE or FeatSelection_RFECV:
            # RFE / RFECV - keep the best features
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)

            if FeatSelection_RFECV:
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,scoring='average_precision')
                             # ,cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3))
                             #,scoring='f1',verbose=0) # alternative scorers: 'roc_auc','recall','f1','accuracy'...
            else:
                rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV:
                print("RFE-CV selected %d features:" % (rfecv.n_features_))
            print("RFE (%d features) score:" % (rfecv.n_features_), rfecv.score(X, y))
            # take the selected names before reducing featureNames, so the
            # support mask indexes an array of matching length
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = rfe_featnames
            print("RFE selected feature names:",rfe_featnames)
            X_RFE = rfecv.transform(X)  # rfecv is already fitted; fit_transform would redo the fit
            print("X_RFE",X_RFE.shape)

            resDict.at[fileName, 'TopRFE-Features'] = str(rfe_featnames)

            # set GetRFEPerf to True (or expose it to the user) if performance of the reduced set is wanted
        GetRFEPerf=False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        # Blind score boxplot graphic example using Seaborn: http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrices + Dummies: http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        # http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        # http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # Make a custom F1 scorer (may not have fixed the multiclass-F1 problem!)
        from sklearn.metrics import make_scorer
        f1_scorer = make_scorer(metrics.f1_score,
                     greater_is_better=True, average="micro") # other options: "weighted", "macro", None

        # print("Dummy classifiers output:")

        dummy_frequent = DummyClassifier(strategy='most_frequent',random_state=0)
        y_dummyPred = Get_yPred(X,y,clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y,y_dummyPred ))
        dummy_freq_f1 = '{:.3}'.format(metrics.f1_score(y, y_dummyPred,average='weighted'))

        # a make_scorer object expects (estimator, X, y), so call the metric
        # directly; average="micro" matches f1_scorer above, despite this
        # variable's name
        dummy_freq_f1_weighted = '{:.3}'.format(metrics.f1_score(y, y_dummyPred, average='micro'))
        # mean of the per-class f1 scores
        dummy_freq_f1_mean=(metrics.f1_score(y, y_dummyPred,average=None)).mean()
        # print("Dummy, most frequent acc:",dummy_freq_acc)

        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # 'print("Dummy, Stratified Random:",dummy_strat2)'
        print()
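
        # Hedged sketch (illustrative): a make_scorer object is meant to be
        # called as scorer(estimator, X, y), or handed to CV via scoring= - e.g.:
        _sketch_scorer_usage = False
        if _sketch_scorer_usage:
            _dummy = DummyClassifier(strategy='most_frequent', random_state=0).fit(X, y)
            print("micro-F1 via scorer:", f1_scorer(_dummy, X, y))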

        resDict.at[fileName, 'dummy_freq:Accuracy'] = dummy_freq_acc
        # mean per-class f1 (not the weighted dummy_freq_f1 above)
        resDict.at[fileName, 'dummy_freq:f1'] = dummy_freq_f1_mean
        resDict.at[fileName, 'dummy_freq_f1_weighted'] = dummy_freq_f1_weighted
        # resDict.at[fileName, 'dummy_Stratfreq'] = dummy_strat2

        "We can get seperately the best model for Acc, and the best for f1!"
        "WARNING!? In binary case - default F1 works for the 1 class, in sklearn 15. and lower"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        "Temporary workaround until next SKlearn update of F1 metric:"
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')f1_scorer
        bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = f1_scorer)

        bestEst_acc,bestScore_acc = ModelParam_GridSearch(X,y,cv=2,scoreParam = 'accuracy')
        print("bestEst (f1):",bestEst_f1)#,"best f1",bestScore_f1)
        print("bestEst (f1):",bestEst_acc)#,"best acc",bestScore_acc)

        #Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf:
            bestEst_RFE,bestScore_RFE = ModelParam_GridSearch(X_RFE,y,cv=3,scoreParam = 'f1')

        "Modified to get 2 estimators"
        scores_acc = cross_val_score(estimator=bestEst_acc, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1) #Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(estimator=bestEst_f1, X=X, y=y, cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18), n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict.at[fileName, 'Accuracy'] = round(scores_acc.mean(), 4)
        resDict.at[fileName, 'Accuracy_SD'] = round(scores_acc.std(), 4)
        resDict.at[fileName, 'f1'] = round(scores_f1.mean(), 4)
        resDict.at[fileName, 'f1_SD'] = round(scores_f1.std(), 4)
        resDict.at[fileName, 'Array-f1-Scores'] = scores_f1
        resDict.at[fileName, 'Array-Acc-Scores'] = scores_acc
        resDict.at[fileName, 'bestML-f1'] = str(bestEst_f1)
        resDict.at[fileName, 'bestML-Acc'] = str(bestEst_acc)

        #ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)

        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep='\t')  # tab-separated, matching the .tsv extension
Code example #40
File: data_loader.py Project: afshinrahimi/telstra
X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False
if do_feature_elimination:
    estimator =  RandomForestClassifier(n_estimators=2000, criterion='entropy', max_depth=None, 
                                 min_samples_split=16, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
                                 max_features='auto', max_leaf_nodes=None, bootstrap=False, oob_score=False, 
                                 n_jobs=10, random_state=None, verbose=0, warm_start=False, class_weight=None)
    selector = RFECV(estimator, step=1, cv=5, scoring='log_loss')  # renamed 'neg_log_loss' in scikit-learn 0.18+
    X_train = selector.fit_transform(X_train, train_labels)
    print('after feature elimination', X_train.shape)
    X_test = selector.transform(X_test)
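
    # Hedged sketch (illustrative, not the project's code): the same elimination
    # on a current scikit-learn, where the scorer name is 'neg_log_loss' and a
    # lighter forest keeps RFECV tractable.
    _sketch_modern_rfecv = False
    if _sketch_modern_rfecv:
        from sklearn.ensemble import RandomForestClassifier as _RF
        from sklearn.feature_selection import RFECV as _RFECV
        _sel = _RFECV(_RF(n_estimators=200, n_jobs=-1), step=1, cv=5,
                      scoring='neg_log_loss')
        X_train = _sel.fit_transform(X_train, train_labels)
        X_test = _sel.transform(X_test)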
    
do_feature_selection = False
if do_feature_selection:
    # NOTE: chi2 requires non-negative features; after the StandardScaler above
    # this would raise - the (commented-out) MinMaxScaler suits this path.
    ch2 = SelectKBest(chi2, k=4000)
    X_train = ch2.fit_transform(X_train, train_labels)
    X_test = ch2.transform(X_test)

do_pca = False

if do_pca:
    k = 100
    add_pca_to_original = True
    X_train = np.asarray(X_train)  # already dense after the scaler above; .toarray() exists only on sparse matrices