Example #1
def test_surf_pandas_inputs():
    """Check: Data (pandas DataFrame/Series): SURF works with pandas DataFrame and Series inputs"""
    np.random.seed(240932)
    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))
    assert np.mean(cross_val_score(clf, features_df, labels_s, cv=3,
                                   n_jobs=-1)) > 0.7
Example #2
def test_surf_pipeline_parallel():
    """Check: Data (Binary Endpoint, Discrete Features): SURF works in a sklearn pipeline when SURF is parallelized"""
    np.random.seed(240932)

    clf = make_pipeline(SURF(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
Example #3
def test_surf_pipeline():
    """Ensure that SURF works in a sklearn pipeline when it is parallelized"""
    np.random.seed(240932)

    clf = make_pipeline(SURF(n_features_to_select=2, n_jobs=-1),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features, labels, cv=3)) > 0.7
Example #4
def test_surf_pipeline_mixed_attributes():
    """Check: Data (Mixed Attributes): SURF works in a sklearn pipeline"""
    np.random.seed(240932)

    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(cross_val_score(clf, features_mixed_attributes,
                                   labels_mixed_attributes, cv=3, n_jobs=-1)) > 0.7
Example #5
def test_surf_pipeline_cont_endpoint():
    """Check: Data (Continuous Endpoint): SURF works in a sklearn pipeline"""
    np.random.seed(240932)

    clf = make_pipeline(SURF(n_features_to_select=2),
                        RandomForestRegressor(n_estimators=100, n_jobs=-1))

    assert abs(np.mean(cross_val_score(clf, features_cont_endpoint,
                                       labels_cont_endpoint, cv=3, n_jobs=-1))) < 0.5
Example #6
def test_surf_pipeline_missing_values():
    """Ensure that SURF works in a sklearn pipeline with missing values"""
    np.random.seed(240932)

    # SURF scores features with missing values present; impute afterwards for the
    # classifier (SimpleImputer replaces the Imputer class removed in scikit-learn 0.22)
    clf = make_pipeline(SURF(n_features_to_select=2, n_jobs=-1), SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(
        cross_val_score(
            clf, features_missing_values, labels_missing_values, cv=3)) > 0.7
Example #7
def test_surf_init():
    """Check: SURF, SURF*, and MultiSURF constructors store custom values correctly"""
    clf = SURF(n_features_to_select=7,
               discrete_threshold=20,
               verbose=True,
               n_jobs=3)

    assert clf.n_features_to_select == 7
    assert clf.discrete_threshold == 20
    assert clf.verbose == True
    assert clf.n_jobs == 3
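
For context, a minimal sketch of how these constructor parameters feed the usual skrebate fit/transform workflow; the synthetic arrays below are illustrative and not part of the original test suite:

import numpy as np
from skrebate import SURF

X = np.random.rand(100, 10)          # 100 samples, 10 features (synthetic)
y = np.random.randint(0, 2, 100)     # binary endpoint (synthetic)

fs = SURF(n_features_to_select=7, discrete_threshold=20, verbose=True, n_jobs=3)
fs.fit(X, y)                         # populates fs.feature_importances_
X_top = fs.transform(X)              # keeps the 7 top-ranked features -> shape (100, 7)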
Example #8
def test_surf_pipeline_multiclass():
    """Check: Data (Multiclass Endpoint): SURF works in a sklearn pipeline"""
    np.random.seed(240932)

    clf = make_pipeline(SURF(n_features_to_select=2), SimpleImputer(),
                        RandomForestClassifier(n_estimators=100, n_jobs=-1))

    assert np.mean(
        cross_val_score(
            clf, features_multiclass, labels_multiclass, cv=3,
            n_jobs=-1)) > 0.7
Example #9
def get_selector(name, estimator=None, n_features_to_select=None, **params):
    """Instantiate a feature selector by name, forwarding only the keyword
    arguments each selector understands (filtered through search_dict)."""
    if name == 'RobustSelector':
        return RobustSelector(estimator, n_features_to_select=n_features_to_select,
                              **search_dict(params, ('cv', 'verbose')))
    elif name == 'MaxFeatures':
        # threshold=-inf disables the importance cutoff so that exactly
        # max_features features are kept
        return SelectFromModel(estimator, threshold=-np.inf,
                               max_features=n_features_to_select)
    elif name == 'RandomSubsetSelector':
        return RandomSubsetSelector(estimator, n_features_to_select=n_features_to_select,
                                    **search_dict(params, ('n_subsets', 'subset_size', 'random_state')))
    elif name == 'FeatureImportanceThreshold':
        return SelectFromModel(estimator, **search_dict(params, 'threshold'))
    elif name == 'RFE':
        return RFE(estimator, n_features_to_select=n_features_to_select,
                   **search_dict(params, ('step', 'verbose')))
    elif name == 'RFECV':
        # note: stock scikit-learn RFECV exposes min_features_to_select rather than
        # n_features_to_select, so this call assumes a compatible wrapper
        return RFECV(estimator, n_features_to_select=n_features_to_select,
                     **search_dict(params, ('step', 'cv', 'verbose')))
    elif name == 'FoldChangeFilter':
        return FoldChangeFilter(**search_dict(params, ('threshold', 'direction', 'below', 'pseudo_count')))
    elif name == 'ZeroFractionFilter':
        return ZeroFractionFilter(**search_dict(params, ('threshold',)))
    elif name == 'RpkmFilter':
        return RpkmFilter(**search_dict(params, ('threshold',)))
    elif name == 'RpmFilter':
        return RpmFilter(**search_dict(params, ('threshold',)))
    elif name == 'DiffExpFilter':
        return DiffExpFilter(max_features=n_features_to_select,
                             **search_dict(params, ('threshold', 'script', 'temp_dir', 'score_type', 'method')))
    elif name == 'ReliefF':
        from skrebate import ReliefF
        return ReliefF(n_features_to_select=n_features_to_select,
                       **search_dict(params, ('n_jobs', 'n_neighbors', 'discrete_limit')))
    elif name == 'SURF':
        from skrebate import SURF
        return SURF(n_features_to_select=n_features_to_select,
                    **search_dict(params, ('n_jobs', 'discrete_limit')))
    elif name == 'MultiSURF':
        from skrebate import MultiSURF
        return MultiSURF(n_features_to_select=n_features_to_select,
                         **search_dict(params, ('n_jobs', 'discrete_limit')))
    elif name == 'SIS':
        return SIS(n_features_to_select=n_features_to_select,
                   **search_dict(params, ('temp_dir', 'sis_params')))
    elif name == 'NullSelector':
        return NullSelector()
    else:
        raise ValueError('unknown selector: {}'.format(name))
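
A hypothetical call sketch for this factory (search_dict and the custom selector classes come from the surrounding module; X_train and y_train are assumed to exist):

selector = get_selector('SURF', n_features_to_select=50, n_jobs=-1)
selector.fit(X_train, y_train)            # ranks features with SURF
X_selected = selector.transform(X_train)  # keeps the 50 top-ranked features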
Example #10
        GradientBoostingClassifier(random_state=args.randomseed),
        'rf':
        RandomForestClassifier(n_jobs=-1, random_state=args.randomseed),
        'ext':
        ExtraTreesClassifier(n_estimators=100,
                             n_jobs=-1,
                             random_state=args.randomseed),
        'svm':
        SVC(kernel='sigmoid', random_state=args.randomseed)
    }

    # initialize the fs (feature selector) dictionary
    fs = {
        'ReliefF': ReliefF(n_features_to_select=100, verbose=False, n_jobs=-1),
        # 'TuRF': TuRF(core_algorithm="ReliefF", n_features_to_select=100, verbose=False, n_jobs=-1),
        'SURF': SURF(n_features_to_select=100, verbose=False, n_jobs=-1),
        'SURFstar': SURFstar(n_features_to_select=100,
                             verbose=False,
                             n_jobs=-1)
    }

    print('\nClassifier parameters:', clf[args.classifier])

    print('\nStarting cross validating without feature selection...\n')

    # incremental-feature predictions before feature ranking; pick the classifier from the dict by name
    y_pred_list = [
        cross_val_predict(clf[args.classifier],
                          X[:, 0:i + 1],
                          y,
                          cv=args.kfolds,
Example #11
def relief(X, y):
    np.random.seed(0)
    return SURF().fit(X, y).feature_importances_
Example #12
def relief(X, y):
    return SURF().fit(X, y).feature_importances_
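
Either wrapper returns one importance score per column, so it can serve as a drop-in relief-style scorer; a small sketch with synthetic arrays (all names illustrative):

import numpy as np

X = np.random.rand(50, 8)            # 50 samples, 8 features (synthetic)
y = np.random.randint(0, 2, 50)      # binary labels (synthetic)

scores = relief(X, y)                # one SURF importance per feature
ranking = np.argsort(scores)[::-1]   # feature indices, best first
print(ranking[:3])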
Example #13
    def FeatureWeights(self, weights=("pearson", "variance"), **kwargs):
        """
        Calculates the requested weights and logs them.

        :param weights: a list of weight factors, a subset of {'pearson', 'variance',
            'relieff', 'surf', 'sobol', 'morris', 'delta-mmnt', 'info-gain'}
        :param kwargs: all input acceptable by ``skrebate.ReliefF``, ``skrebate.SURF``,
            ``sensapprx.SensAprx``
        :return: None
        """
        from pandas import DataFrame, read_sql

        self.data = read_sql("SELECT * FROM data", self.conn)
        features = list(self.data.columns)
        features.remove(self.target)
        weights_df = read_sql("SELECT * FROM weights", self.conn)
        if len(weights_df) == 0:
            weights_df = DataFrame({"feature": features})
        X = self.data[features].values
        y = self.data[self.target].values
        n_features = kwargs.get("n_features", int(len(features) / 2))
        domain = None
        probs = None
        regressor = kwargs.get("regressor", None)
        reduce = kwargs.get("reduce", True)
        num_smpl = kwargs.get("num_smpl", 700)
        W = {"feature": features}
        for factor in weights:
            if factor == "pearson":
                Res = dict(self.data.corr(method="pearson").fillna(0)[self.target])
                W["pearson"] = [Res[v] for v in features]
            elif factor == "variance":
                Res = dict(self.data.var())
                W["variance"] = [Res[v] for v in features]
            elif factor == "relieff":
                from skrebate import ReliefF

                n_neighbors = kwargs.get("n_neighbors", 80)
                RF = ReliefF(n_features_to_select=n_features, n_neighbors=n_neighbors)
                RF.fit(X, y)
                W["relieff"] = [
                    RF.feature_importances_[features.index(v)] for v in features
                ]
            elif factor == "surf":
                from skrebate import SURF

                RF = SURF(n_features_to_select=n_features)
                RF.fit(X, y)
                W["surf"] = [
                    RF.feature_importances_[features.index(v)] for v in features
                ]
            elif factor == "sobol":
                from .sensapprx import SensAprx

                SF = SensAprx(
                    method="sobol",
                    domain=domain,
                    probs=probs,
                    regressor=regressor,
                    reduce=reduce,
                    num_smpl=num_smpl,
                )
                SF.fit(X, y)
                domain = SF.domain
                probs = SF.probs
                W["sobol"] = [SF.weights_[features.index(v)] for v in features]
            elif factor == "morris":
                from .sensapprx import SensAprx

                SF = SensAprx(
                    method="morris",
                    domain=domain,
                    probs=probs,
                    regressor=regressor,
                    reduce=reduce,
                    num_smpl=num_smpl,
                )
                SF.fit(X, y)
                domain = SF.domain
                probs = SF.probs
                W["morris"] = [SF.weights_[features.index(v)] for v in features]
            elif factor == "delta-mmnt":
                from .sensapprx import SensAprx

                SF = SensAprx(
                    method="delta-mmnt",
                    domain=domain,
                    probs=probs,
                    regressor=regressor,
                    reduce=reduce,
                    num_smpl=num_smpl,
                )
                SF.fit(X, y)
                domain = SF.domain
                probs = SF.probs
                W["delta_mmnt"] = [SF.weights_[features.index(v)] for v in features]
            elif factor == "info-gain":
                from sklearn.feature_selection import mutual_info_classif

                Res = mutual_info_classif(X, y, discrete_features=True)
                W["info_gain"] = [Res[features.index(v)] for v in features]
        new_w_df = DataFrame(W)
        merged = weights_df.merge(new_w_df, on="feature")
        merged = merged.fillna(0.0)  # fillna returns a copy; assign it back so missing weights become 0
        merged.to_sql("weights", self.conn, if_exists="replace", index=False)
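
A hypothetical driver for this method (the owning class is not shown in this excerpt; obj stands for an instance wired to a SQLite connection holding "data" and "weights" tables and a target column name):

from pandas import read_sql

obj.FeatureWeights(weights=("pearson", "surf"), n_features=10)
w = read_sql("SELECT * FROM weights", obj.conn)   # one column per computed factor
print(w.sort_values("surf", ascending=False).head())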
Example #14
def rank_features_by_rebate_methods(data_split_list,
                                    fs_method,
                                    iterate,
                                    remove_percent=0.1,
                                    verbose=False):
    ## 0. Input arguments:
    # data_split_list: data frame that contains the learning data
    # fs_method: feature ranking methods to be used: 'SURF', 'SURFstar', 'MultiSURF', or 'MultiSURFstar'
    # iterate: whether to implement TURF: True or False (TURF will remove low-ranking features after each iteration, effective when #features is large)
    # remove_percent: percentage of features removed at each iteration (only applied when iterate = True)
    # verbose: whether to show progress by each fold: True or False

    ## 1. Define function for feature ranking method
    # SURF
    if fs_method == 'SURF':
        # Implement TURF extension when 'iterate == True'
        if iterate == True:
            fs = TuRF(core_algorithm='SURF', pct=remove_percent)
        else:
            fs = SURF()
    # SURFstar
    if fs_method == 'SURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='SURFstar', pct=remove_percent)
        else:
            fs = SURFstar()
    # MultiSURF
    if fs_method == 'MultiSURF':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURF', pct=remove_percent)
        else:
            fs = MultiSURF()
    # MultiSURFstar
    if fs_method == 'MultiSURFstar':
        if iterate == True:
            fs = TuRF(core_algorithm='MultiSURFstar', pct=remove_percent)
        else:
            fs = MultiSURFstar()

    ## 2. Perform feature ranking on each fold of training data
    # iterate by folds
    feat_impt_dict = {}
    for i in range(0, len(data_split_list)):
        # intermediate output
        if verbose == True:
            print('Computing feature importance scores using data from fold ' +
                  str(i) + '\n')
        # obtain training feature matrix and response vector
        feat_train, label_train, _, _ = data_split_list[i]
        # fit feature ranking model using the specified method
        if iterate == True:
            fs.fit(feat_train.values, label_train.values, list(feat_train))
        else:
            fs.fit(feat_train.values, label_train.values)
        # output feature importance scores in a data frame
        fold_name = 'Fold_' + str(i)
        feat_impt_dict[fold_name] = fs.feature_importances_
    # aggregate results from multiple folds into one data frame
    feat_impt_df = pd.DataFrame(feat_impt_dict)
    feat_impt_df.index = feat_train.columns

    return feat_impt_df
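
A minimal driver sketch under the layout described in the argument comments above: each fold is a (feat_train, label_train, feat_test, label_test) tuple of pandas objects; all data here is synthetic and illustrative:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
folds = []
for _ in range(2):
    feat = pd.DataFrame(rng.rand(60, 10), columns=['f%d' % j for j in range(10)])
    label = pd.Series(rng.randint(0, 2, 60))
    folds.append((feat, label, None, None))   # test halves unused by the ranking step

scores_df = rank_features_by_rebate_methods(folds, 'SURF', iterate=False)
print(scores_df)   # index: feature names; columns: Fold_0, Fold_1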