Esempio n. 1
0
def fval(df, y, alpha, k):
    """Keep the columns accepted by both a chi2 and an f_classif FPR filter.

    Two ``SelectFpr`` filters are fitted with the same ``alpha``: the chi2
    one on ``MinMaxScaler().fit_transform(scale(df))`` and the f_classif one
    on ``df`` itself.  The selections of each filter, and of their
    intersection, are logged.

    :param df: dataframe of candidate features
    :param y: label
    :param alpha: hyper-parameter [alpha] passed to both ``SelectFpr`` filters
    :param k: number of select features (accepted but not used in this body)
    :return: ``df`` restricted to the columns selected by both filters
    """
    column_names = list(df.columns)

    # chi2 is fitted on the rescaled matrix, f_classif on the raw frame.
    rescaled = MinMaxScaler().fit_transform(scale(df))
    chi2_filter = SelectFpr(chi2, alpha=alpha).fit(rescaled, y)
    f_filter = SelectFpr(f_classif, alpha=alpha).fit(df, y)

    chi2_mask = chi2_filter.get_support()
    f_mask = f_filter.get_support()

    chi2_names = [name for name, keep in zip(column_names, chi2_mask) if keep]
    logging.info('Chi2 selected {} features {}.'.format(
        chi2_mask.sum(), chi2_names))

    f_names = [name for name, keep in zip(column_names, f_mask) if keep]
    logging.info('F_classif selected {} features {}.'.format(
        f_mask.sum(), f_names))

    # A column survives only if both tests accepted it.
    both_mask = chi2_mask & f_mask
    logging.info('Chi2 & F_classif selected {} features'.format(
        both_mask.sum()))

    features = [name for name, keep in zip(column_names, both_mask) if keep]
    logging.info(features)
    return df[features]
def test_boundary_case_ch2():
    # Boundary case: every selector below is configured so that exactly one
    # feature (the first one) is expected to be selected.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    expected_support = np.array([True, False])
    selectors = (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    )
    # Same fit / get_support / compare sequence for every heuristic.
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected_support)
def test_select_fpr_classif():
    """
    Check that SelectFpr with f_classif, and GenericUnivariateSelect in
    'fpr' mode, pick the same columns on a simple classification problem,
    and that those columns are exactly the first five.
    """
    dataset_options = dict(n_samples=200,
                           n_features=20,
                           n_informative=3,
                           n_redundant=2,
                           n_repeated=0,
                           n_classes=8,
                           n_clusters_per_class=1,
                           flip_y=0.0,
                           class_sep=10,
                           shuffle=False,
                           random_state=0)
    X, y = make_classification(**dataset_options)

    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode='fpr',
                                             param=0.0001)
    X_r2 = generic_filter.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    # Expected mask: ones for the first five columns, zeros elsewhere.
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(univariate_filter.get_support(), gtruth)
def selectionFwe(X, y, paramlist):
    """Fit a chi2-scored ``SelectFpr`` on ``(X, y)`` and return the reduced data.

    :param X: feature matrix passed to ``fit_transform``
    :param y: target passed to ``fit_transform``
    :param paramlist: mapping; the value stored under the key
        ``'number _of_features'`` (note the embedded space) is read
    :return: list ``[Xnew, indexarr, scores_arr]`` -- the transformed data,
        the indices from ``get_support(indices=True)`` and the selector's
        ``scores_``
    """
    k = paramlist['number _of_features']
    # NOTE(review): scikit-learn documents SelectFpr as taking ``score_func``
    # and ``alpha`` only, so the ``k`` keyword looks like it will be rejected
    # at construction time.  The function name says "Fwe" while SelectFpr is
    # used -- confirm which selector (SelectKBest / SelectFwe?) was intended.
    fwe = SelectFpr(chi2, k=k)
    Xnew = fwe.fit_transform(X, y)
    indexarr = fwe.get_support(indices=True)
    scores_arr = fwe.scores_
    return [Xnew, indexarr, scores_arr]
Esempio n. 5
0
    def fit(self, X, y, sample_weight=None):
        """Fit the parent model on (optionally filtered) features and store
        the mean predicted score per class.

        When ``self.univariate_feature_selection`` is truthy, a
        ``SelectFpr(alpha=0.05)`` filter is fitted on the columns that are
        not flagged in ``self.allow_missing_ids``; flagged columns are always
        kept.  The resulting boolean mask is stored in ``self.support`` and
        ``X`` is reduced to it before the parent ``fit`` is called.

        :param X: 2-D feature array (it is indexed as ``X[:, mask]``)
        :param y: labels
        :param sample_weight: forwarded to the parent ``fit``
        :return: self
        """
        n_columns = X.shape[1]
        if self.allow_missing_ids is None:
            # Default: no column is exempt from the univariate filter.
            self.allow_missing_ids = np.zeros(n_columns).astype(bool)

        if self.univariate_feature_selection:
            # univariate feature selection on the non-exempt columns only
            testable = ~self.allow_missing_ids
            selector = SelectFpr(alpha=0.05).fit(X[:, testable], y)
            keep = np.ones(n_columns).astype(bool)
            keep[testable] = selector.get_support()
            self.support = keep
            X = X[:, keep]
        else:
            self.support = np.ones(n_columns).astype(bool)

        # fit the model; the third positional argument is a one-element list
        # holding the number of rows (presumably a single group -- TODO confirm
        # against the parent class signature)
        super().fit(X, y, [len(X)], sample_weight=sample_weight)

        # get the mean of z for each level of y
        encoder = LabelEncoder().fit(y)
        self.label_encoder = encoder
        self.classes_ = encoder.classes_
        z = super().predict(X).astype(float)
        per_class_means = [z[y == cl].mean() for cl in encoder.classes_]
        self.z_means = np.array(per_class_means)
        return self
Esempio n. 6
0
def test_clone():
    # clone() must return a distinct estimator object whose parameters
    # compare equal to those of the estimator it was built from.  The
    # estimator is never fitted, so its "original state" is simply its
    # current state.

    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)
    assert duplicate is not original
    assert original.get_params() == duplicate.get_params()

    # Array-valued parameter: only object identity is checked here.
    original = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    duplicate = clone(original)
    assert duplicate is not original
Esempio n. 7
0
def build_model(clf="log_reg",
                train_reader=sick_train_reader,
                feature_vectorizer=DictVectorizer(sparse=True),
                features=None,
                feature_selector=SelectFpr(chi2, alpha=0.05),
                file_name=None,
                load_vec=None,
                compression=None):
    """Assemble the classification pipeline and load the training vectors.

    The pipeline is: vectorizer -> feature selector -> (optional
    compression step) -> classifier.  The classifier and the compression
    step are looked up by key in the module-level ``_models`` mapping.

    NOTE: the default ``feature_vectorizer`` and ``feature_selector``
    objects are created once, when the function is defined, so every call
    that relies on the defaults receives the same two instances.

    :param clf: key into ``_models`` for the final classifier
    :param train_reader: passed through to ``obtain_vectors``
    :param feature_vectorizer: first pipeline step
    :param features: passed through to ``obtain_vectors``
    :param feature_selector: second pipeline step
    :param file_name: passed through to ``obtain_vectors``
    :param load_vec: passed through to ``obtain_vectors``
    :param compression: key into ``_models``; when truthy, that model is
        inserted between the selector and the classifier
    :return: ``(pipeline, feature_vectors, labels)``
    """
    # An RFE selector was previously tried in place of the default one:
    #   feature_selector = RFE(LogisticRegression(solver='lbfgs'),
    #                          n_features_to_select=5000,
    #                          step=0.05)
    stages = [('dict_vector', feature_vectorizer),
              ('feature_selector', feature_selector)]
    if compression:
        stages.append(('compression', _models[compression]))
    stages.append(('clf', _models[clf]))
    clf_pipe = Pipeline(stages)

    feat_vec, labels = obtain_vectors(file_name, load_vec, train_reader,
                                      features)

    return clf_pipe, feat_vec, labels
Esempio n. 8
0
def get_ensemble_model(w2v=None):
    """Build the stacked-ensemble pipeline.

    Steps, in order: feature extraction from ``w2v``, an f_classif
    ``SelectFpr`` filter, a ``ProbExtractor`` wrapping four base
    classifiers, degree-2 polynomial features, and a grid-searched
    logistic regression (grid taken from the module-level ``params``).

    :param w2v: word-vector dictionary; when falsy it is loaded through
        ``Glove.load().get_dict()``
    :return: the unfitted ``Pipeline``
    """
    if not w2v:
        w2v = Glove.load().get_dict()

    n_jobs = -1

    extraction_step = ('feature_extraction', get_features(w2v))
    # false positive rate test for feature selection
    selection_step = ('feature_selection', SelectFpr(f_classif))
    # (an l2 Normalizer step and an ExtraTreesClassifier base model are
    # currently disabled)

    base_classifiers = [
        RandomForestClassifier(n_estimators=300,
                               max_depth=10,
                               min_samples_split=5,
                               n_jobs=n_jobs),
        XGBClassifier(n_estimators=300, max_depth=10, n_jobs=8),
        LogisticRegression(C=0.1,
                           solver='lbfgs',
                           penalty='l2',
                           n_jobs=n_jobs),
        BernoulliNB(alpha=5.0),
    ]
    proba_step = ('proba', ProbExtractor(base_classifiers))
    polynomial_step = ('polynomial', PolynomialFeatures(degree=2))

    final_estimator = GridSearchCV(
        LogisticRegression(penalty='l2', random_state=42),
        param_grid=params)
    final_step = ('logistic_regression', final_estimator)

    return Pipeline([
        extraction_step,
        selection_step,
        proba_step,
        polynomial_step,
        final_step,
    ])
Esempio n. 9
0
    def fit(self, X, y, sample_weight=None):
        """Fit an L1-penalised logistic model, optionally under box bounds.

        Labels are encoded with a ``LabelEncoder``; encoded zeros are mapped
        to -1 before optimisation.  When
        ``self.univariate_feature_selection`` is truthy, a
        ``SelectFpr(alpha=0.05)`` filter is fitted on the columns not flagged
        in ``self.allow_missing_ids`` and ``X`` (and ``self.bounds``, if set)
        are reduced to the surviving columns.

        The objective is the unpenalised logistic loss plus
        ``(1 / self.C) * sum(|w|)`` over all weights except the last entry
        (the intercept).  It is minimised with BFGS when ``self.bounds`` is
        None and with L-BFGS-B otherwise.

        :param X: 2-D feature array (indexed as ``X[:, mask]``)
        :param y: labels
        :param sample_weight: optional per-sample weights; when omitted they
            come from ``get_sample_weights`` (if ``self.class_weight`` is
            set) or are uniform.  The caller's array is not modified.
        :return: self, with ``coef_`` (shape ``(1, n_features)``, zeros for
            dropped columns), ``intercept_``, ``support`` and ``opt_res`` set
        """
        self.label_encoder = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder.classes_
        y = self.label_encoder.transform(y)

        if self.allow_missing_ids is None:
            self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

        if self.univariate_feature_selection:
            # univariate feature selection
            feature_selector = SelectFpr(alpha=0.05).fit(
                X[:, ~self.allow_missing_ids], y)
            self.support = np.ones(X.shape[1]).astype(bool)
            self.support[~self.
                         allow_missing_ids] = feature_selector.get_support()
            X = X[:, self.support]
            if self.bounds is not None:
                # Keep only the bounds of the surviving columns.
                # NOTE: this overwrites self.bounds, as the original did.
                self.bounds = [
                    bound for bound, keep in zip(self.bounds, self.support)
                    if keep
                ]
        else:
            self.support = np.ones(X.shape[1]).astype(bool)

        def func(w, X, y, alpha, sw):
            # Logistic loss/gradient with the built-in penalty switched off
            # (0), then an L1 term added on every weight but the intercept.
            out, grad = _logistic_loss_and_grad(w, X, y, 0, sw)
            out_penalty = alpha * np.sum(np.abs(w[:-1]))
            grad_penalty = np.r_[alpha * np.sign(w[:-1]), 0]
            return out + out_penalty, grad + grad_penalty

        y2 = np.array(y)
        y2[y2 == 0] = -1
        w0 = np.r_[np.random.randn(X.shape[1]) / 10, 0.]

        if self.bounds is None:
            # Unbounded problem: pass no bounds at all.  The previous
            # ``self.bounds + [(None, None)]`` raised TypeError here.
            method = 'BFGS'
            bounds = None
        else:
            method = 'L-BFGS-B'
            # The extra (None, None) leaves the intercept unconstrained.
            bounds = list(self.bounds) + [(None, None)]

        if sample_weight is None:
            if self.class_weight is not None:
                sample_weight = get_sample_weights(
                    y, class_weight=self.class_weight)
            else:
                sample_weight = np.ones(len(X))
        # Normalise into a fresh float array: the in-place ``/=`` mutated the
        # caller's array and failed for integer or list weights.
        sample_weight = np.asarray(sample_weight, dtype=float)
        sample_weight = sample_weight / (np.mean(sample_weight) * len(X))

        self.opt_res = minimize(func,
                                w0,
                                method=method,
                                jac=True,
                                args=(X, y2, 1. / self.C, sample_weight),
                                bounds=bounds,
                                options={
                                    "gtol": self.tol,
                                    "maxiter": self.max_iter
                                })
        # Scatter the fitted weights back to full width; dropped columns
        # keep a zero coefficient.
        self.coef_ = np.zeros(len(self.support))
        self.coef_[self.support] = self.opt_res.x[:-1]
        self.coef_ = self.coef_.reshape(1, -1)
        self.intercept_ = self.opt_res.x[-1].reshape(1, )
        return self
Esempio n. 10
0
def SelectFpr_selector(data, target, sf):
    """Filter the columns of ``data`` with ``SelectFpr`` and keep their names.

    :param data: DataFrame of features
    :param target: object exposing ``.values`` whose values are flattened
        with ``ravel()`` before fitting
    :param sf: score function handed to ``SelectFpr``
    :return: DataFrame of the selected columns, labelled with their
        original column names
    """
    fpr_filter = SelectFpr(score_func=sf)
    reduced = fpr_filter.fit_transform(data.values, target.values.ravel())
    # Map the surviving column positions back to their names.
    kept_names = [
        data.columns.values[position]
        for position in fpr_filter.get_support(True)
    ]
    return pd.DataFrame(reduced, columns=kept_names)
Esempio n. 11
0
def get_feature_extractor(w2v=None):
    """Return the two-step pipeline: feature extraction, then an f_classif
    ``SelectFpr`` filter.

    :param w2v: word-vector dictionary; when falsy it is loaded through
        ``Glove.load().get_dict()``
    """
    if not w2v:
        w2v = Glove.load().get_dict()

    steps = [
        ("feature_extraction", get_features(w2v)),
        ('feature_selection', SelectFpr(f_classif)),
    ]
    return Pipeline(steps)
def test_verbose_output_for_select_select_fpr():
    # The verbose output captured for a chi2 SelectFpr with alpha=0.5 must
    # match this message exactly.
    model = SelectFpr(chi2, alpha=0.5)
    output = _capture_verbose_output_for_model(model, use_supervised_df=True)

    expected_output = ("The p-value of column 'B' (1.0000) is above the "
                       "specified alpha of 0.5000")
    assert output == expected_output
Esempio n. 13
0
def selectFpr(args):
    """Build a scikit-learn ``SelectFpr`` selector from a positional arg list.

    Parameters
    ----------

    args : sequence
        ``args[1]`` is the alpha threshold (anything ``float()`` accepts) and
        ``args[2]`` is the score-function name, either ``"chi2"`` or
        ``"f_classif"``.  ``args[0]`` is not read.

    Returns
    -------
    The unfitted ``SelectFpr`` instance.

    Raises
    ------
    ValueError
        If ``args[2]`` is not a supported score-function name (previously
        this surfaced as an UnboundLocalError), or if ``args[1]`` cannot be
        converted to float.

    """
    score_name = args[2]
    if score_name == "chi2":
        score_func = chi2
    elif score_name == "f_classif":
        score_func = f_classif
    else:
        raise ValueError(
            "Unsupported score function %r; expected 'chi2' or 'f_classif'"
            % (score_name,))

    return SelectFpr(score_func, alpha=float(args[1]))
def feature_SelectFpr(x_data, y_data):
    """Return the 20 features with the highest f_classif score.

    A ``SelectFpr(f_classif, alpha=0.01)`` is fitted, but only its
    ``scores_`` are read; its support mask is never consulted here.

    :param x_data: DataFrame of features (its ``columns`` supply the names)
    :param y_data: target
    :return: DataFrame with columns ``'Specs'`` (feature name) and
        ``'Score'``, limited to the 20 largest scores
    """
    fitted = SelectFpr(f_classif, alpha=0.01).fit(x_data, y_data)

    # Pair each column name with its score, side by side.
    name_frame = pd.DataFrame(x_data.columns)
    score_frame = pd.DataFrame(fitted.scores_)
    score_table = pd.concat([name_frame, score_frame], axis=1)
    score_table.columns = ['Specs', 'Score']  # naming the dataframe columns

    return score_table.nlargest(20, 'Score')
Esempio n. 15
0
def feature_method_selection(data, label, fsname):
    """
    Select features with the method named by ``fsname``.

    :param data: feature matrix
    :param label: target labels
    :param fsname: one of 'variance_threshold', 'select_kbest', 'rfe',
        'rfecv', 'RandLasso', 'linear_svc', 'tree', 'fclassif', 'RandForReg'
    :return: new_data, the data reduced to the selected features
    :raises NotImplementedError: for 'pearsonr', which defines no selector
    :raises ValueError: when ``fsname`` is not a known option
    """
    if fsname == 'variance_threshold':  # drop features that barely vary; for discrete values
        model = VarianceThreshold()  # th=1
        return model.fit_transform(data)

    elif fsname == 'select_kbest':
        # feature values must be non-negative; chi2 is for classification
        model = SelectKBest(chi2, k=10)

    elif fsname == 'rfe':  # recursive elimination, very slow
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)

    elif fsname == 'rfecv':  # RFE with cross-validation; label must be numeric
        svc = SVC(kernel="linear")
        # Fix: the selector used to be bound to ``rfecv``, leaving ``model``
        # undefined for the fit/transform below.
        # NOTE(review): StratifiedKFold(label, 1) looks like the pre-0.18
        # call form with a single fold -- confirm against the installed
        # scikit-learn version.
        model = RFECV(estimator=svc, step=1, cv=StratifiedKFold(label, 1),
                      scoring='accuracy')

    elif fsname == 'RandLasso':
        # randomised re-selection; original note: "cannot perform reduce
        # with flexible type"
        model = RandomizedLogisticRegression()

    elif fsname == 'linear_svc':
        model = LinearSVC()  # has no importance attribute

    elif fsname == 'tree':
        model = ExtraTreesClassifier()

    elif fsname == 'fclassif':
        # defaults to f_classif; the larger the score, the more useful the feature
        model = SelectFpr()

    elif fsname == 'pearsonr':  # label must be numeric
        # This branch only ever computed a correlation statistic and never
        # defined a selector, so it always crashed on the unbound ``model``.
        # Fail with an explicit error instead.
        raise NotImplementedError(
            "feature selection option 'pearsonr' does not define a selector")

    elif fsname == 'RandForReg':  # label must be numeric
        label = turn_label_2num(label)
        model = RandomForestRegressor()

    else:
        logging.error('ERROR: feature selection option is wrong')
        # Previously execution continued and hit an unbound ``model``.
        raise ValueError(
            'unknown feature selection option: %r' % (fsname,))

    model.fit(data, label)
    new_data = model.transform(data)  # the selected (important) data

    return new_data
Esempio n. 16
0
    def fit(self, X, y, sample_weight=None):
        """Fit the wrapped estimator on generated pairs and record per-class
        mean scores.

        When ``self.univariate_feature_selection`` is truthy, a
        ``SelectFpr(alpha=0.05)`` filter is fitted on the columns not flagged
        in ``self.allow_missing_ids``; ``X``, ``self.allow_missing_ids`` and
        (if set) ``self.estimator.bounds`` are then reduced to the surviving
        columns.  Pairs are produced by ``self._generate_pairs`` and the
        wrapped ``self.estimator`` is fitted on them.

        :param X: 2-D feature array (indexed as ``X[:, mask]``)
        :param y: labels
        :param sample_weight: optional per-sample weights; when omitted they
            come from ``get_sample_weights`` (if ``self.class_weight`` is
            set) or are uniform.  The caller's array is not modified.
        :return: self, with ``coef_`` (shape ``(1, n_features)``, zeros for
            dropped columns), ``intercept_``, ``z_means``, ``support`` and
            ``fitted_`` set
        """
        self.fitted_ = False
        if self.allow_missing_ids is None:
            self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

        # Full-width copy, used for predict_z once fitting is done.
        Xold = np.array(X)
        if self.univariate_feature_selection:
            # univariate feature selection
            feature_selector = SelectFpr(alpha=0.05).fit(
                X[:, ~self.allow_missing_ids], y)
            self.support = np.ones(X.shape[1]).astype(bool)
            self.support[~self.
                         allow_missing_ids] = feature_selector.get_support()
            X = X[:, self.support]
            # NOTE: this overwrites the attribute with the reduced mask, as
            # the original did.
            self.allow_missing_ids = self.allow_missing_ids[self.support]
        else:
            self.support = np.ones(X.shape[1]).astype(bool)

        if sample_weight is None:
            if self.class_weight is not None:
                sample_weight = get_sample_weights(
                    y, class_weight=self.class_weight)
            else:
                sample_weight = np.ones(len(X))
        # Normalise into a fresh float array: the in-place ``/=`` mutated the
        # caller's array and failed for integer or list weights.
        sample_weight = np.asarray(sample_weight, dtype=float)
        sample_weight = sample_weight / (np.mean(sample_weight) * len(X))

        # generate pairs
        X2, y2, sw2 = self._generate_pairs(X, y, sample_weight)
        sw2 = sw2 / sw2.mean()
        if self.verbose:
            print('Generated %d pairs from %d samples' % (len(X2), len(X)))

        # fit the model
        if self.estimator.bounds is not None:
            # Keep only the bounds of the surviving columns.
            self.estimator.bounds = [
                bound
                for bound, keep in zip(self.estimator.bounds, self.support)
                if keep
            ]
        self.estimator.fit(X2, y2, sample_weight=sw2)

        # get the mean of z for each level of y
        self.label_encoder = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder.classes_
        z = self.predict_z(Xold)
        self.z_means = np.array(
            [z[y == cl].mean() for cl in self.label_encoder.classes_])

        # Scatter the fitted weights back to full width; dropped columns
        # keep a zero coefficient.
        self.coef_ = np.zeros(len(self.support))
        self.coef_[self.support] = self.estimator.coef_.flatten()
        self.coef_ = self.coef_.reshape(1, -1)
        self.intercept_ = self.estimator.intercept_
        self.fitted_ = True
        return self
Esempio n. 17
0
def test_clone_2():
    # clone() must not carry over attributes that are not constructor
    # parameters: an attribute attached by hand to the source estimator has
    # to be absent from the clone.

    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    original.own_attribute = "test"

    duplicate = clone(original)
    assert not hasattr(duplicate, "own_attribute")
Esempio n. 18
0
def train_DT(
    feats=None,
    labels=[],
    feature_selector=SelectFpr(
        chi2, alpha=0.05),  # NOTE: currently overwritten below, see docstring
    cv=5):  # Number of folds used in cross-validation
    """Vectorize, select features, grid-search and train a decision tree.

    :param feats: feature dictionaries passed to ``DictVectorizer``
    :param labels: target labels (the default list is never mutated here)
    :param feature_selector: accepted for interface compatibility, but the
        body replaces it unconditionally with an ``RFE`` over
        ``MultinomialNB``, so the passed value (including ``None``) has no
        effect
    :param cv: number of folds for the grid search and the final scoring
    :return: ``(model, vectorizer, feature_selector)`` -- the trained tree
        plus the fitted objects needed to featurize test data the same way
    """
    # Map the count dictionaries to a dense feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    feats = vectorizer.fit_transform(feats)
    ##### FEATURE SELECTION
    feature_selector = RFE(estimator=MultinomialNB(),
                           n_features_to_select=None,
                           step=1,
                           verbose=0)
    feat_matrix = feature_selector.fit_transform(feats, labels)

    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = DecisionTreeClassifier()
    # Parameters to grid-search over:
    parameters = {
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 0.25, 'log2'],
        'min_samples_split': [2, 5, 10]
    }
    # Cross-validation grid search to find the best hyper-parameters:
    clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = DecisionTreeClassifier(splitter=params['splitter'],
                                 max_features=params['max_features'],
                                 min_samples_split=params['min_samples_split'])
    ##### ASSESSMENT
    scores = cross_val_score(mod,
                             feat_matrix,
                             labels,
                             cv=cv,
                             scoring="f1_macro")
    # Single-argument print(...) produces the same text under Python 2 and
    # Python 3; the bare print statements were a SyntaxError on Python 3.
    print('Best model %s' % (mod,))
    print('%s features selected out of %s total' % (feat_matrix.shape[1],
                                                    feats.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)
    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector)
Esempio n. 19
0
def correlation(df, y, threshold, alpha, corr_k_pass, mode):
    """Feature selection based on correlation between features

    Columns are visited in ranking order; a column is kept unless an
    earlier visited column was correlated with it beyond ``threshold`` (in
    absolute value).  When ``alpha`` is truthy the ranking is by descending
    ``SelectFpr`` score, otherwise it is the dataframe's column order.

    :param df: dataframe
    :param y: label (only used when ``alpha`` is truthy)
    :param threshold: select feature threshold on the pairwise correlation
    :param alpha: hyper-parameter [alpha]; when falsy no scoring is done
    :param corr_k_pass: maximum number of columns to return; falsy = no cap
    :param mode: score function used for the ranking, "chi2" or "f"
    :return: dataframe of feature selected
    :raises ValueError: if ``mode`` is neither "chi2" nor "f"
    """
    if mode not in ("chi2", "f"):
        # The old message, "No mode: " % mode, had no placeholder and raised
        # a TypeError instead of the intended error.
        raise ValueError("No mode: %r" % (mode,))

    corr_matrix = df.corr()

    if alpha:
        # The score function is only looked up when it is actually used.
        score_func = chi2 if mode == "chi2" else f_classif
        x_bin = MinMaxScaler().fit_transform(scale(df))
        fpval = SelectFpr(score_func, alpha=alpha).fit(x_bin, y).scores_
        df_sort_fval = pd.DataFrame({
            "col": list(df.columns),
            "fval": list(fpval)
        })
        df_sort_fval = df_sort_fval.sort_values(by=['fval'], ascending=False)
        ranking_col = list(df_sort_fval['col'])
    else:
        ranking_col = list(df.columns)

    col_pass = []
    del_col = set()
    for col in ranking_col:
        if col not in del_col:
            col_pass.append(col)
        # Kept or not, everything strongly correlated with this column is
        # marked (this includes the column itself, whose self-correlation
        # is 1).
        col_corr = corr_matrix[col]
        del_col.update(
            col_corr[(col_corr > threshold) | (col_corr < -threshold)].index)

    logging.info("Del col : %d", len(del_col))
    logging.info("Passed col : %d", len(col_pass))
    if corr_k_pass and len(col_pass) > corr_k_pass:
        col_pass = col_pass[:corr_k_pass]
    return df[col_pass]
Esempio n. 20
0
def select_fpr(args):
    """Build a ``SelectFpr`` from an option mapping.

    ``args`` is updated in place: a ``None`` ``'alpha'`` becomes 0.05, and a
    ``'score_function'`` of ``'chi2'`` or ``'f_classif'`` is replaced by the
    corresponding scikit-learn function.  Any other ``'score_function'``
    value is passed to ``SelectFpr`` unchanged.

    Reference:
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html
    """
    from sklearn.feature_selection import f_classif, chi2

    if args['alpha'] is None:
        args['alpha'] = 0.05

    # Translate a known name into the scoring callable.
    requested = args['score_function']
    for known_name, scorer in (('chi2', chi2), ('f_classif', f_classif)):
        if requested == known_name:
            args['score_function'] = scorer
            break

    return SelectFpr(score_func=args['score_function'], alpha=args['alpha'])
Esempio n. 21
0
def test_clone():
    """Tests that clone creates a correct deep copy.

    An estimator is created and cloned in its original state (which, in
    this case, is the current state of the estimator); the clone must be a
    different object with equal parameters.

    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)

    assert_true(original is not duplicate)
    assert_equal(original._get_params(), duplicate._get_params())
Esempio n. 22
0
def get_basic_model(model, w2v=None):
    """Wrap ``model`` in a pipeline: feature extraction, f_classif
    ``SelectFpr`` filter, then the model itself.

    :param model: final estimator of the pipeline
    :param w2v: word-vector dictionary; when falsy it is loaded through
        ``Glove.load().get_dict()``
    :return: the unfitted ``Pipeline``
    """
    if not w2v:
        w2v = Glove.load().get_dict()

    steps = [
        ('feature_extraction', get_features(w2v)),
        # false positive rate test for feature selection
        ('feature_selection', SelectFpr(f_classif)),
        # (normalisation steps -- StandardScaler / MaxAbsScaler -- are
        # currently disabled)
        ("model", model),
    ]
    return Pipeline(steps)
Esempio n. 23
0
def get_fsmethod (fsmethod, n_feats, n_subjs, n_jobs=1):
    """Return the feature-selection estimator named ``fsmethod`` together
    with its grid-search parameter grid.

    :param fsmethod: 'stats', or one of 'rfe', 'rfecv', 'univariate', 'fpr',
        'fdr', 'extratrees', 'pca', 'rpca', 'lda'
    :param n_feats: number of features; below 10 the candidate ranges are
        capped by it
    :param n_subjs: not used in this body
    :param n_jobs: forwarded to the 'extratrees' estimator
    :return: ``('stats', None)`` for 'stats', else ``(estimator, grid)``
    """
    if fsmethod == 'stats':
        return 'stats', None

    #feature selection parameter values for grid search
    if n_feats < 10:
        feats_to_sel, n_comps = range(2, n_feats, 2), range(1, n_feats, 2)
    else:
        feats_to_sel, n_comps = range(2, 20, 4), range(1, 30, 4)

    max_feats = ['auto'] + list(feats_to_sel)
    n_comps_pca = list(n_comps) + ['mle']

    svc_c_grid = [{'C': 0.1}, {'C': 1}, {'C': 10}]
    whiten_grid = [True, False]
    # NOTE(review): the same [1, 3, 5, 10] list is used for ``percentile``
    # and for the 'fpr'/'fdr' ``alpha``; for an alpha these values look like
    # percentages rather than p-value thresholds -- confirm the intent.
    small_grid = [1, 3, 5, 10]

    #Feature selection procedures
    fsmethods = {
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        'rfe': RFE(estimator=SVC(kernel="linear"), step=0.05,
                   n_features_to_select=2),
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        #cv=3, default; cv=StratifiedKFold(n_subjs, 3)
        'rfecv': RFECV(estimator=SVC(kernel="linear"), step=0.05,
                       loss_func=zero_one),
        #Univariate Feature selection: http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
        'univariate': SelectPercentile(f_classif, percentile=5),
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html
        'fpr': SelectFpr(f_classif, alpha=0.05),
        #http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFdr.html
        'fdr': SelectFdr(f_classif, alpha=0.05),
        #http://scikit-learn.org/stable/modules/feature_selection.html
        'extratrees': ExtraTreesClassifier(n_estimators=50,
                                           max_features='auto',
                                           compute_importances=True,
                                           n_jobs=n_jobs,
                                           random_state=0),
        'pca': PCA(n_components='mle'),
        'rpca': RandomizedPCA(random_state=0),
        'lda': LDA(),
    }

    fsgrid = {
        'rfe': {'estimator_params': svc_c_grid,
                'n_features_to_select': feats_to_sel},
        'rfecv': {'estimator_params': [{'C': 0.1}, {'C': 1}, {'C': 10}]},
        'univariate': {'percentile': small_grid},
        'fpr': {'alpha': [1, 3, 5, 10]},
        'fdr': {'alpha': [1, 3, 5, 10]},
        'extratrees': {'n_estimators': [1, 3, 5, 10, 30, 50],
                       'max_features': max_feats},
        'pca': {'n_components': n_comps_pca, 'whiten': whiten_grid},
        'rpca': {'n_components': n_comps,
                 'iterated_power': [3, 4, 5],
                 'whiten': [True, False]},
        'lda': {'n_components': n_comps},
    }

    return fsmethods[fsmethod], fsgrid[fsmethod]
Esempio n. 24
0
 def test_select_fpr_int(self):
     # Fit a default SelectFpr on a small int64 matrix, convert it to ONNX
     # with a variable first input dimension, and dump data plus both models.
     features = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
         dtype=np.int64)
     targets = np.array([0, 1, 0, 1])

     selector = SelectFpr()
     selector.fit(features, targets)

     input_spec = [("input", Int64TensorType([None, features.shape[1]]))]
     onnx_model = convert_sklearn(
         selector, "select fpr", input_spec, target_opset=TARGET_OPSET)
     self.assertTrue(onnx_model is not None)

     dump_data_and_model(
         features, selector, onnx_model, basename="SklearnSelectFpr")
Esempio n. 25
0
 def test_select_fpr_int(self):
     # Fit a default SelectFpr on a small integer matrix, convert it to ONNX
     # with a fixed first input dimension of 1, and dump data plus both
     # models.  The allow_failure expression is passed through as a string.
     features = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
     targets = np.array([0, 1, 0, 1])

     selector = SelectFpr()
     selector.fit(features, targets)

     input_spec = [('input', Int64TensorType([1, features.shape[1]]))]
     onnx_model = convert_sklearn(selector, 'select fpr', input_spec)
     self.assertTrue(onnx_model is not None)

     failure_condition = (
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
     dump_data_and_model(
         features,
         selector,
         onnx_model,
         basename="SklearnSelectFpr",
         allow_failure=failure_condition)
Esempio n. 26
0
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    """Select features with ``SelectFpr``, optionally resample, and train a
    decision tree.

    :param configurationname: label used in the log line and in the exported
        ``.dot`` file name
    :param train_data: triple ``(X_train, y_train, id_to_a_train)``; the
        third element is not used here
    :param score_function: score function handed to ``SelectFpr``
    :param undersam: alone -> RepeatedEditedNearestNeighbours
    :param oversam: alone -> SMOTE; together with ``undersam`` -> SMOTEENN
    :param export: not consulted in this body -- the tree is always exported
    :return: ``(selector, fitted_tree)``
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data

    print("Feature Selection")
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(result.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # At most one resampler applies; which one depends on the flag pair.
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    elif oversam:
        # (a SMOTENC variant over categorical feature indices was tried
        # here before plain SMOTE)
        resampler = SMOTE(random_state=42)
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = DecisionTreeClassifier(random_state=0)
    dtc = dtc.fit(X_train, y_train, check_input=True)

    # The export used to be guarded by ``if export:``; it is unconditional.
    print("Exporting decision tree image...")
    dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
    export_graphviz(dtc, out_file=dot_path, filled=True)
    transform(fitted_ids)

    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Esempio n. 27
0
def select_features(data,
                    features,
                    target,
                    feature_selector='SelectKBest',
                    k=10,
                    alpha=0.05,
                    score_func='f_classif'):
    """Fit a univariate selector and return the names of the kept features.

    :param data: mapping/DataFrame indexed as ``data[features]`` and
        ``data[target]``
    :param features: list of candidate feature names
    :param target: name of the target column
    :param feature_selector: 'SelectKBest', 'SelectFpr' or 'SelectFdr'
    :param k: number of features for 'SelectKBest'
    :param alpha: threshold for 'SelectFpr' / 'SelectFdr'
    :param score_func: 'f_classif', 'f_regression', 'chi2',
        'mutual_info_classif' or 'mutual_info_regression'
    :return: list of the selected entries of ``features``
    :raises ValueError: 'Undefined score_func' for an unknown score function,
        'Undefined feature_selector' for an unknown selector (this case used
        to report 'Undefined score_func' as well)
    """
    X = data[features]
    y = data[target]

    # Validate both names before resolving anything; score_func is checked
    # first, as before.
    if score_func not in ('f_classif', 'f_regression', 'chi2',
                          'mutual_info_classif', 'mutual_info_regression'):
        raise ValueError('Undefined score_func')
    if feature_selector not in ('SelectKBest', 'SelectFpr', 'SelectFdr'):
        raise ValueError('Undefined feature_selector')

    if score_func == 'f_classif':
        score_func = f_classif
    elif score_func == 'f_regression':
        score_func = f_regression
    elif score_func == 'chi2':
        score_func = chi2
    elif score_func == 'mutual_info_classif':
        score_func = mutual_info_classif
    else:
        score_func = mutual_info_regression

    if feature_selector == 'SelectKBest':
        feature_selector = SelectKBest(score_func=score_func, k=k)
    elif feature_selector == 'SelectFpr':
        feature_selector = SelectFpr(score_func=score_func, alpha=alpha)
    else:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)

    feature_selector.fit_transform(X, y)

    # Translate the zero-based positions of the kept columns into names.
    best_features = [
        features[zero_based_index]
        for zero_based_index in feature_selector.get_support(indices=True)
    ]

    print('Best features selected are: ' + str(best_features))

    return best_features
def feature_Univarselection(data, y, Alpha):
    """Apply an f_classif ``SelectFpr`` filter and return a frame with the
    original shape.

    Both ``data`` and ``y`` are sorted by ``'pid'``; labels are read from
    the ``y`` column named by the module-level ``sep``.  The filtered matrix
    is pushed back through ``inverse_transform`` (scikit-learn documents
    that this re-inserts the removed columns filled with zeros), so the
    result keeps the sorted index and all columns of ``data``.

    :param data: DataFrame of features, sortable by ``'pid'``
    :param y: DataFrame of labels, sortable by ``'pid'``
    :param Alpha: alpha threshold for ``SelectFpr``
    :return: DataFrame with the index and columns of ``data`` sorted by 'pid'
    """
    ordered = data.sort_values('pid')
    feature_matrix = ordered.values
    labels = y.sort_values('pid')[sep].values

    # Alternatives tried here before: SelectFdr, SelectFwe, SelectKBest(chi2)
    # and SelectFromModel(Lasso).
    fpr_filter = SelectFpr(f_classif, alpha=Alpha).fit(feature_matrix, labels)

    reduced = fpr_filter.transform(feature_matrix)
    restored = fpr_filter.inverse_transform(reduced)
    return pd.DataFrame(restored,
                        index=ordered.index,
                        columns=ordered.columns)
Esempio n. 29
0
def get_best_estimator(x_train, y_train, x_test, y_priors=None):
    """Grid-search a SelectFpr -> StandardScaler -> SVC pipeline and return
    the best estimator.

    When ``y_priors`` is given, the rows of ``x_test`` are stacked under
    ``x_train`` with ``y_priors`` as their labels; training rows get weight
    1.0 and the added rows get ``PRIOR_WEIGHT``.  The weights are passed to
    the 'svm' step through ``fit_params``.

    :param x_train: training feature matrix
    :param y_train: training labels (no longer modified in place)
    :param x_test: feature matrix appended when ``y_priors`` is given
    :param y_priors: optional labels for the rows of ``x_test``
    :return: ``best_estimator_`` of the fitted grid search
    """
    pipeline = Pipeline([('selection', SelectFpr(SELECTOR)),
                         ('scaler', StandardScaler()), ('svm', svm.SVC())])
    if y_priors is not None:
        # List repetition replaces the Python 2-only xrange comprehensions.
        sample_weight = ([1.0] * len(y_train) +
                         [PRIOR_WEIGHT] * len(y_priors))
        # Build a new label list: ``y_train.extend(...)`` grew the caller's
        # list on every call.
        y_train = list(y_train) + list(y_priors)
        x_train = np.vstack((x_train, x_test))
        clf = GridSearchCV(pipeline,
                           params,
                           fit_params={'svm__sample_weight': sample_weight})
    else:
        clf = GridSearchCV(pipeline, params)
    clf.fit(x_train, y_train)
    clf = clf.best_estimator_
    logging.debug(clf)
    return clf
Esempio n. 30
0
def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # The builtin ``bool`` replaces the ``np.bool`` alias, which newer
        # NumPy releases no longer provide.
        assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
        # Fewer than three of the remaining 15 columns may be selected.
        assert_less(np.sum(support[5:] == 1), 3)