Ejemplo n.º 1
0
class f_regressionFPRPrim(primitive):
    """Feature-selection primitive built on ``SelectFpr(f_regression)``.

    ``fit`` trains the selector on ``data['X']`` / ``data['Y']`` and
    ``produce`` returns the reduced feature frame, labelled with the names
    of the columns that survived the selection.
    """

    def __init__(self, random_state=0):
        # ``random_state`` is accepted but never read inside this class.
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the pvalues below alpha based on a FPR test with F-value between label/feature for regression tasks. FPR test stands for False Positive Rate test. It controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        # Populated by fit(); produce() relies on it.
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for 'Regression'."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Create the selector and fit it on the prepared ``X`` / ``Y``."""
        prepared = handle_data(data)
        # The attribute is assigned before fitting, as in the original flow.
        self.selector = SelectFpr(f_regression)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return ``{0: output}`` where ``output['X']`` holds the kept columns."""
        output = handle_data(data)
        column_names = list(output['X'].columns)
        keep_mask = self.selector.get_support(indices=False)
        kept_names = [name for name, keep in zip(column_names, keep_mask) if keep]
        reduced = self.selector.transform(output['X'])
        output['X'] = pd.DataFrame(reduced, columns=kept_names)
        return {0: output}
Ejemplo n.º 2
0
def test_boundary_case_ch2():
    """Boundary case: every selector is tuned so that it keeps one feature.

    The chi2 scores and p-values of the tiny data set are pinned first, then
    each univariate selector is fitted and must keep only the first column.
    """
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    # Same order as the original: fdr, kbest, percentile, fpr, fwe.
    selector_specs = [
        (SelectFdr, {"alpha": 0.1}),
        (SelectKBest, {"k": 1}),
        (SelectPercentile, {"percentile": 50}),
        (SelectFpr, {"alpha": 0.1}),
        (SelectFwe, {"alpha": 0.1}),
    ]
    for selector_cls, params in selector_specs:
        selector = selector_cls(chi2, **params)
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))
def selectionFwe(X, y, paramlist):
    """Fit a chi2-based ``SelectFpr`` on ``X`` / ``y`` and report what it kept.

    Parameters
    ----------
    X, y
        Passed unchanged to ``fit_transform``.
    paramlist : mapping
        The value stored under the key ``'number _of_features'`` (note the
        embedded space) is read and forwarded to the selector as ``k``.

    Returns
    -------
    list
        ``[Xnew, indexarr, scores_arr]``: the transformed data, the integer
        indices of the retained features, and the selector's ``scores_``.
    """
    # NOTE(review): the function name says "Fwe" but a SelectFpr is built, and
    # scikit-learn's SelectFpr is documented with ``alpha`` rather than ``k``,
    # so this constructor call looks like it raises TypeError. Confirm which
    # selector was intended (SelectFwe with alpha, or SelectKBest with k).
    k = paramlist['number _of_features']
    fwe = SelectFpr(chi2, k=k)
    Xnew = fwe.fit_transform(X, y)
    # indices=True asks for integer positions instead of a boolean mask.
    indexarr = fwe.get_support(indices=True)
    scores_arr = fwe.scores_
    return [Xnew, indexarr, scores_arr]
def test_select_fpr_classif():
    """Check the fpr heuristic on a simple classification problem.

    ``SelectFpr`` and ``GenericUnivariateSelect(mode="fpr")`` must produce
    the same reduced matrix, and the support must be exactly the first five
    of the twenty generated features.
    """
    dataset_options = dict(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X, Y = make_classification(**dataset_options)

    fpr_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = fpr_filter.fit(X, Y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode="fpr", param=0.0001)
    X_r2 = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    # Expected support: ones for the first five features, zeros elsewhere.
    expected = np.zeros(20)
    expected[:5] = 1
    assert_array_equal(fpr_filter.get_support(), expected)
Ejemplo n.º 5
0
def evaluate_model(classifier, data_records, class_labels, labels):
    """Cross-validate ``classifier`` on growing sets of top-scoring attributes.

    Attributes are scored with ``SelectFpr(f_classif, alpha=0.9)`` and ordered
    from the highest to the lowest score. For every ``idx`` in
    ``2 .. n_attributes - 1`` the classifier is cross-validated (``cv=5``) on
    the ``idx`` best-scoring columns.

    Parameters
    ----------
    classifier
        Estimator handed to ``cross_validation.cross_val_score``.
    data_records
        2-D array indexed as ``data_records[:, columns]``.
    class_labels
        Target values used for scoring and cross-validation.
    labels
        Sequence of attribute names, indexed by column position; only printed.

    Returns
    -------
    tuple
        ``(attribute_values, accuracy_values)``: the subset sizes that were
        tried and, for each, the best fold score multiplied by 100.
    """
    attribute_values = []
    accuracy_values = []

    # Scoring the attributes using F_test and false positive rate
    clf = SelectFpr(f_classif, alpha=0.9)
    clf.fit(data_records, class_labels)
    scores = clf.scores_
    print(scores)
    print('\n')

    # Column indices ordered by descending score. The earlier version stored
    # each attribute's rank position and then used those ranks as column
    # indices, which selected columns unrelated to their scores.
    ranked_attr_indices = sorted(
        range(len(scores)), key=lambda col: scores[col], reverse=True)

    # 5-fold cross validation against a varying number of attributes. The
    # attributes are chosen on the basis of their scores.
    for idx in range(2, len(ranked_attr_indices)):
        top_indices = ranked_attr_indices[:idx]
        filtered_records = data_records[:, top_indices]
        for col in top_indices:
            print(labels[col])
        validation_score = cross_validation.cross_val_score(
            classifier, filtered_records, class_labels, cv=5)
        # The best fold (not the mean) is what gets reported.
        accuracy = max(validation_score) * 100
        attribute_values.append(idx)
        accuracy_values.append(accuracy)
        print('Cross validation score - ' + str(idx) + ' attributes :' + str(validation_score) + '\n')

    return (attribute_values, accuracy_values)
Ejemplo n.º 6
0
def test_select_fpr_classif():
    """Verify the fpr heuristic on an easy 8-class classification task.

    The dedicated ``SelectFpr`` filter and the generic selector in ``'fpr'``
    mode must agree, and only the first five features may be selected.
    """
    X, y = make_classification(
        n_samples=200, n_features=20, n_informative=3, n_redundant=2,
        n_repeated=0, n_classes=8, n_clusters_per_class=1, flip_y=0.0,
        class_sep=10, shuffle=False, random_state=0)

    dedicated = SelectFpr(f_classif, alpha=0.0001)
    X_r = dedicated.fit(X, y).transform(X)

    generic = GenericUnivariateSelect(f_classif, mode='fpr', param=0.0001)
    X_r2 = generic.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    selected = dedicated.get_support()
    # Ground truth: features 0..4 selected, the remaining fifteen rejected.
    ground_truth = np.zeros(20)
    ground_truth[:5] = 1
    assert_array_equal(selected, ground_truth)
Ejemplo n.º 7
0
    def fit(self, X, y, sample_weight=None):
        """Optionally pre-select features, fit the parent model, store z means.

        When ``univariate_feature_selection`` is truthy, a ``SelectFpr`` with
        ``alpha=0.05`` is fitted on the columns not flagged in
        ``allow_missing_ids``; flagged columns are always kept. The parent
        ``fit`` is then called on the retained columns, and the mean of the
        parent's prediction is recorded per class in ``self.z_means``.

        Returns ``self``.
        """
        n_columns = X.shape[1]
        if self.allow_missing_ids is None:
            # Default: no column is exempt from selection.
            self.allow_missing_ids = np.zeros(n_columns).astype(bool)

        if not self.univariate_feature_selection:
            self.support = np.ones(n_columns).astype(bool)
        else:
            selectable = ~self.allow_missing_ids
            selector = SelectFpr(alpha=0.05).fit(X[:, selectable], y)
            self.support = np.ones(n_columns).astype(bool)
            self.support[selectable] = selector.get_support()
            X = X[:, self.support]

        # The third positional argument is a one-element list holding the
        # number of rows; its meaning is defined by the parent class.
        super().fit(X, y, [len(X)], sample_weight=sample_weight)

        encoder = LabelEncoder().fit(y)
        self.label_encoder = encoder
        self.classes_ = encoder.classes_

        # Mean of z for each level of y.
        z = super().predict(X).astype(float)
        per_class_means = []
        for cl in self.label_encoder.classes_:
            per_class_means.append(z[y == cl].mean())
        self.z_means = np.array(per_class_means)
        return self
def test_boundary_case_ch2():
    """Boundary case in which each selector should keep exactly one feature.

    After pinning the chi2 scores and p-values, the FDR, k-best, percentile,
    FPR and FWE selectors are fitted in turn; each must retain only the
    first of the two columns.
    """
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    def check_keeps_first_only(selector):
        # Fit on the shared data and compare the boolean support mask.
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), np.array([True, False]))

    check_keeps_first_only(SelectFdr(chi2, alpha=0.1))
    check_keeps_first_only(SelectKBest(chi2, k=1))
    check_keeps_first_only(SelectPercentile(chi2, percentile=50))
    check_keeps_first_only(SelectFpr(chi2, alpha=0.1))
    check_keeps_first_only(SelectFwe(chi2, alpha=0.1))
def select_with_fpr(train, test):
    """Select features with ``SelectFpr(alpha=0.001)`` and write reduced CSVs.

    The selector is fitted on ``train`` without the ``'ID'`` and ``'TARGET'``
    columns, using ``'TARGET'`` as the label. The retained feature columns
    (plus ``'ID'``, and ``'TARGET'`` for the training set) are written to
    ``train_after_fpr.csv`` and ``test_after_fpr.csv``.

    Parameters
    ----------
    train : DataFrame containing ``'ID'``, ``'TARGET'`` and feature columns.
    test : DataFrame containing ``'ID'`` and the same feature columns.

    Returns
    -------
    None
    """
    train_data = train.drop('ID', axis=1)
    train_y = train_data['TARGET']
    # ``axis`` must be passed by keyword: pandas 2.x rejects a positional axis.
    train_X = train_data.drop('TARGET', axis=1)

    fpr = SelectFpr(alpha=0.001)
    selected = fpr.fit_transform(train_X, train_y)

    print('Fpr выбрал {} признаков.'.format(selected.shape[1]))

    # Take the names from the frame the selector was fitted on, so the mask
    # stays aligned even when 'TARGET' is not the last column of ``train``.
    features = list(train_X.columns[fpr.get_support()])

    new_train = train[['ID'] + features + ['TARGET']]
    new_train.to_csv('train_after_fpr.csv')

    new_test = test[['ID'] + features]
    new_test.to_csv('test_after_fpr.csv')
Ejemplo n.º 10
0
class UnivariateSelectChiFPRPrim(primitive):
    """Feature-selection primitive built on ``SelectFpr(chi2, alpha=0.05)``.

    ``fit`` trains the selector on ``data['X']`` / ``data['Y']``. ``produce``
    tries to reduce ``X`` to the selected columns; if that fails the error is
    printed and the prepared data is returned unchanged.
    """

    def __init__(self, random_state=0):
        # ``random_state`` is accepted but never read inside this class.
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the pvalues below alpha based on a FPR test with Chi-square. FPR test stands for False Positive Rate test. It controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        # Populated by fit(); produce() relies on it.
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_d`` for 'Classification'."""
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        """Return True only when ``data['X']`` has at least three columns."""
        return data['X'].shape[1] >= 3

    def fit(self, data):
        """Create the chi2 selector and fit it on the prepared ``X`` / ``Y``."""
        prepared = handle_data(data)
        # The attribute is assigned before fitting, as in the original flow.
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Return ``{0: output}``; ``output['X']`` is reduced when possible."""
        output = handle_data(data)
        column_names = list(output['X'].columns)
        try:
            keep_mask = self.selector.get_support(indices=False)
            kept_names = [name for name, keep in zip(column_names, keep_mask) if keep]
            reduced = self.selector.transform(output['X'])
            output['X'] = pd.DataFrame(reduced, columns=kept_names)
        except Exception as e:
            # Best effort: report the problem and fall through with X untouched.
            print(e)
        return {0: output}
Ejemplo n.º 11
0
    def fit(self, X, y, sample_weight=None):
        """Fit the L1-penalised model by direct numerical minimisation.

        Steps:

        1. Encode ``y`` with a ``LabelEncoder``; the encoded value 0 is then
           mapped to -1 for the loss.
        2. Optionally run ``SelectFpr(alpha=0.05)`` on the columns not flagged
           in ``allow_missing_ids`` and keep only the supported columns (and
           the matching entries of ``self.bounds``).
        3. Minimise ``_logistic_loss_and_grad`` plus an L1 penalty of
           ``1 / C`` on every weight except the last one, which is stored as
           the intercept.

        Parameters
        ----------
        X : 2-D array, indexed as ``X[:, mask]``.
        y : labels.
        sample_weight : optional per-sample weights. When omitted they come
            from ``class_weight`` if set, otherwise all ones. The weights are
            normalised to sum to one; the caller's array is not modified.

        Returns
        -------
        self
        """
        self.label_encoder = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder.classes_
        y = self.label_encoder.transform(y)

        if self.allow_missing_ids is None:
            self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

        if self.univariate_feature_selection:
            # univariate feature selection
            feature_selector = SelectFpr(alpha=0.05).fit(
                X[:, ~self.allow_missing_ids], y)
            self.support = np.ones(X.shape[1]).astype(bool)
            self.support[~self.
                         allow_missing_ids] = feature_selector.get_support()
            X = X[:, self.support]
            if self.bounds is not None:
                # Keep only the bounds of the features that survived.
                self.bounds = [
                    self.bounds[ii] for ii in range(len(self.bounds))
                    if self.support[ii]
                ]
        else:
            self.support = np.ones(X.shape[1]).astype(bool)

        def func(w, X, y, alpha, sw):
            # Objective and gradient: unpenalised loss plus L1 on w[:-1].
            out, grad = _logistic_loss_and_grad(w, X, y, 0, sw)
            out_penalty = alpha * np.sum(np.abs(w[:-1]))
            grad_penalty = np.r_[alpha * np.sign(w[:-1]), 0]
            return out + out_penalty, grad + grad_penalty

        y2 = np.array(y)
        y2[y2 == 0] = -1
        # Small random start for the weights, zero for the trailing intercept.
        w0 = np.r_[np.random.randn(X.shape[1]) / 10, 0.]
        if self.bounds is None:
            method = 'BFGS'
            # Previously ``None + [(None, None)]`` raised TypeError here.
            bounds = None
        else:
            method = 'L-BFGS-B'
            # The extra pair leaves the intercept unbounded.
            bounds = self.bounds + [(None, None)]
        if sample_weight is None:
            if self.class_weight is not None:
                sample_weight = get_sample_weights(
                    y, class_weight=self.class_weight)
            else:
                sample_weight = np.ones(len(X))
        # Normalise into a new float array instead of ``/=`` so the caller's
        # array is left alone and integer weights are accepted.
        sample_weight = np.asarray(sample_weight, dtype=float)
        sample_weight = sample_weight / (np.mean(sample_weight) * len(X))
        self.opt_res = minimize(func,
                                w0,
                                method=method,
                                jac=True,
                                args=(X, y2, 1. / self.C, sample_weight),
                                bounds=bounds,
                                options={
                                    "gtol": self.tol,
                                    "maxiter": self.max_iter
                                })
        # Scatter the fitted weights back to the full feature width; features
        # removed by the selection keep a zero coefficient.
        self.coef_ = np.zeros(len(self.support))
        self.coef_[self.support] = self.opt_res.x[:-1]
        self.coef_ = self.coef_.reshape(1, -1)
        self.intercept_ = self.opt_res.x[-1].reshape(1, )
        return self
Ejemplo n.º 12
0
def SelectFpr_selector(data, target, sf):
    """Reduce ``data`` to the columns kept by ``SelectFpr(score_func=sf)``.

    ``data`` and ``target`` are read through ``.values`` (the target is
    flattened with ``ravel``). The result is a new DataFrame holding the
    selected columns under their original names.
    """
    selector = SelectFpr(score_func=sf)
    reduced = selector.fit_transform(data.values, target.values.ravel())
    # Passing True asks get_support for integer indices rather than a mask.
    kept_positions = selector.get_support(True)
    kept_names = [data.columns.values[pos] for pos in kept_positions]
    return pd.DataFrame(reduced, columns=kept_names)
def feature_SelectFpr(x_data, y_data):
    """Return the 20 highest-scoring features as a ``Specs`` / ``Score`` table.

    A ``SelectFpr(f_classif, alpha=0.01)`` is fitted on the inputs, but only
    its ``scores_`` are used: the column names of ``x_data`` are paired with
    the scores and the 20 rows with the largest score are returned.
    """
    selector = SelectFpr(f_classif, alpha=0.01)
    fitted = selector.fit(x_data, y_data)

    score_frame = pd.DataFrame(fitted.scores_)
    name_frame = pd.DataFrame(x_data.columns)

    # Side-by-side concat: names in the first column, scores in the second.
    feature_scores = pd.concat([name_frame, score_frame], axis=1)
    feature_scores.columns = ['Specs', 'Score']
    return feature_scores.nlargest(20, 'Score')
Ejemplo n.º 14
0
    def fit(self, X, y, sample_weight=None):
        """Fit the wrapped estimator on generated pairs and record z means.

        Steps:

        1. Optionally run ``SelectFpr(alpha=0.05)`` on the columns not flagged
           in ``allow_missing_ids``; keep the supported columns and shrink
           ``allow_missing_ids`` (and the estimator's ``bounds``) to match.
        2. Build and normalise sample weights.
        3. Generate pairs with ``_generate_pairs`` and fit ``self.estimator``
           on them.
        4. Store the per-class mean of ``predict_z`` on the original input
           and copy the estimator's coefficients back to full feature width.

        Parameters
        ----------
        X : 2-D array, indexed as ``X[:, mask]``.
        y : labels.
        sample_weight : optional per-sample weights. When omitted they come
            from ``class_weight`` if set, otherwise all ones. The weights are
            normalised to sum to one; the caller's array is not modified.

        Returns
        -------
        self
        """
        self.fitted_ = False
        if self.allow_missing_ids is None:
            self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

        # Unfiltered copy: predict_z below receives the full-width input.
        Xold = np.array(X)
        if self.univariate_feature_selection:
            # univariate feature selection
            feature_selector = SelectFpr(alpha=0.05).fit(
                X[:, ~self.allow_missing_ids], y)
            self.support = np.ones(X.shape[1]).astype(bool)
            self.support[~self.
                         allow_missing_ids] = feature_selector.get_support()
            X = X[:, self.support]
            self.allow_missing_ids = self.allow_missing_ids[self.support]
        else:
            self.support = np.ones(X.shape[1]).astype(bool)

        if sample_weight is None:
            if self.class_weight is not None:
                sample_weight = get_sample_weights(
                    y, class_weight=self.class_weight)
            else:
                sample_weight = np.ones(len(X))
        # Normalise into a new float array instead of ``/=`` so the caller's
        # array is left alone and integer weights are accepted.
        sample_weight = np.asarray(sample_weight, dtype=float)
        sample_weight = sample_weight / (np.mean(sample_weight) * len(X))

        # generate pairs
        X2, y2, sw2 = self._generate_pairs(X, y, sample_weight)
        sw2 = sw2 / sw2.mean()
        if self.verbose:
            print('Generated %d pairs from %d samples' % (len(X2), len(X)))

        # fit the model
        if self.estimator.bounds is not None:
            # Keep only the bounds of the features that survived selection.
            self.estimator.bounds = [
                self.estimator.bounds[ii]
                for ii in range(len(self.estimator.bounds)) if self.support[ii]
            ]
        self.estimator.fit(X2, y2, sample_weight=sw2)

        # get the mean of z for each level of y
        self.label_encoder = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder.classes_
        z = self.predict_z(Xold)
        self.z_means = np.array(
            [z[y == cl].mean() for cl in self.label_encoder.classes_])

        # Features removed by the selection keep a zero coefficient.
        self.coef_ = np.zeros(len(self.support))
        self.coef_[self.support] = self.estimator.coef_.flatten()
        self.coef_ = self.coef_.reshape(1, -1)
        self.intercept_ = self.estimator.intercept_
        self.fitted_ = True
        return self
Ejemplo n.º 15
0
def test_clone_2():
    """Check that ``clone`` does not carry over ad-hoc attributes.

    An extra attribute is attached to an estimator after construction; the
    clone must not have it.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    # Not a constructor parameter, so clone has no reason to copy it.
    original.own_attribute = "test"
    duplicate = clone(original)
    assert_false(hasattr(duplicate, "own_attribute"))
Ejemplo n.º 16
0
def test_clone_2():
    """Verify that cloning drops attributes added outside the constructor.

    The estimator receives a custom attribute by plain assignment, and the
    object returned by ``clone`` is expected to lack that attribute.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    source_selector = SelectFpr(f_classif, alpha=0.1)
    source_selector.own_attribute = "test"

    cloned_selector = clone(source_selector)
    has_extra = hasattr(cloned_selector, "own_attribute")
    assert not has_extra
Ejemplo n.º 17
0
def test_clone():
    """Tests that clone creates a correct deep copy.

    We create an estimator, make a copy of its original state
    (which, in this case, is the current state of the estimator),
    and check that the obtained copy is a correct deep copy.

    """
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
    # get_params() is the public accessor; the private _get_params() used
    # previously is no longer provided by scikit-learn estimators.
    assert_equal(selector.get_params(), new_selector.get_params())
Ejemplo n.º 18
0
def test_clone():
    """Check that ``clone`` returns a distinct estimator with equal params.

    The estimator is cloned right after construction, so its current state
    is its original state; the copy must be another object whose parameters
    compare equal to the source's.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    original = SelectFpr(f_classif, alpha=0.1)
    duplicate = clone(original)

    assert_true(original is not duplicate)
    original_params = original.get_params()
    duplicate_params = duplicate.get_params()
    assert_equal(original_params, duplicate_params)
Ejemplo n.º 19
0
 def test_select_fpr_int(self):
     """Convert a fitted ``SelectFpr`` to ONNX using an int64 input tensor.

     The converted model must not be None; the data, the fitted model and
     the ONNX model are then handed to ``dump_data_and_model``.
     """
     features = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
         dtype=np.int64)
     targets = np.array([0, 1, 0, 1])

     selector = SelectFpr()
     selector.fit(features, targets)

     # Batch dimension is left open (None); width follows the data.
     input_spec = [("input", Int64TensorType([None, features.shape[1]]))]
     onnx_model = convert_sklearn(
         selector, "select fpr", input_spec, target_opset=TARGET_OPSET)
     self.assertTrue(onnx_model is not None)
     dump_data_and_model(
         features, selector, onnx_model, basename="SklearnSelectFpr")
Ejemplo n.º 20
0
 def test_select_fpr_int(self):
     """Convert a fitted ``SelectFpr`` to ONNX with a single-row int64 input.

     The converted model must not be None; the dump step is allowed to fail
     under the version condition passed as ``allow_failure``.
     """
     features = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]])
     targets = np.array([0, 1, 0, 1])

     selector = SelectFpr()
     selector.fit(features, targets)

     input_spec = [('input', Int64TensorType([1, features.shape[1]]))]
     onnx_model = convert_sklearn(selector, 'select fpr', input_spec)
     self.assertTrue(onnx_model is not None)

     # Passed through as a string; how it is evaluated is up to the helper.
     failure_condition = (
         "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
     dump_data_and_model(
         features,
         selector,
         onnx_model,
         basename="SklearnSelectFpr",
         allow_failure=failure_condition)
Ejemplo n.º 21
0
def train_decisiontree_FPR(configurationname,
                           train_data,
                           score_function,
                           undersam=False,
                           oversam=False,
                           export=False):
    """Train a decision tree on ``SelectFpr``-filtered, optionally resampled data.

    ``train_data`` is unpacked as ``(X_train, y_train, id_to_a_train)``.
    Features are selected with ``SelectFpr(score_function)``, then the data is
    resampled depending on the flags: under-sampling only uses
    ``RepeatedEditedNearestNeighbours``, over-sampling only uses ``SMOTE``,
    both together use ``SMOTEENN``. The fitted tree is always exported to a
    ``.dot`` file (``export`` is not consulted) and ``transform`` is called
    with the selected feature indices.

    Returns ``(selector, dtc)``.
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, id_to_a_train = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    result = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)
    fitted_ids = list(result.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # Pick at most one resampler; with neither flag set the data is untouched.
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        resampler = SMOTE(random_state=42)
    else:
        resampler = None
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    print("Exporting decision tree image...")
    dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
    export_graphviz(dtc, out_file=dot_path, filled=True)
    transform(fitted_ids)

    # Accuracy on the (resampled) training data itself.
    print("Self Accuracy: " + str(dtc.score(X_train, y_train)))

    return selector, dtc
Ejemplo n.º 22
0
# Sentinel meaning "argument not supplied"; None stays a value callers can
# pass explicitly.
_BUILD_MODEL_DEFAULT = object()


def build_model(clf="log_reg",
                train_reader=sick_train_reader,
                feature_vectorizer=_BUILD_MODEL_DEFAULT,
                features=None,
                feature_selector=_BUILD_MODEL_DEFAULT,
                file_name=None,
                load_vec=None,
                compression=None):
    """Build the pipeline of choice and obtain the training vectors.

    Parameters
    ----------
    clf : key into ``_models`` for the final classifier step.
    train_reader, features, file_name, load_vec :
        Forwarded to ``obtain_vectors``.
    feature_vectorizer : first pipeline step. When omitted, a fresh
        ``DictVectorizer(sparse=True)`` is created for this call.
    feature_selector : second pipeline step. When omitted, a fresh
        ``SelectFpr(chi2, alpha=0.05)`` is created for this call.
    compression : optional key into ``_models``; when truthy, that model is
        inserted between the selector and the classifier.

    Returns
    -------
    tuple
        ``(clf_pipe, feat_vec, labels)``.

    Notes
    -----
    The default vectorizer and selector used to be created once, at function
    definition time, so every pipeline built with the defaults shared (and
    refitted) the same two objects. They are now created per call.
    """
    if feature_vectorizer is _BUILD_MODEL_DEFAULT:
        feature_vectorizer = DictVectorizer(sparse=True)
    if feature_selector is _BUILD_MODEL_DEFAULT:
        feature_selector = SelectFpr(chi2, alpha=0.05)

    steps = [('dict_vector', feature_vectorizer),
             ('feature_selector', feature_selector)]
    if compression:
        steps.append(('compression', _models[compression]))
    steps.append(('clf', _models[clf]))
    clf_pipe = Pipeline(steps)

    feat_vec, labels = obtain_vectors(file_name, load_vec, train_reader,
                                      features)

    return clf_pipe, feat_vec, labels
Ejemplo n.º 23
0
def get_ensemble_model(w2v=None):
    """Assemble the stacked-ensemble pipeline.

    When ``w2v`` is falsy the word vectors are loaded through ``Glove``.
    The pipeline extracts features, filters them with ``SelectFpr(f_classif)``,
    passes them through ``ProbExtractor`` wrapping four base classifiers,
    expands the result with degree-2 polynomial features and finishes with a
    grid-searched logistic regression (grid taken from ``params``).
    """
    if not w2v:
        w2v = Glove.load().get_dict()

    n_jobs = -1

    feature_extraction = get_features(w2v)
    # false positive rate test for feature selection
    feature_selection = SelectFpr(f_classif)

    base_classifiers = [
        RandomForestClassifier(n_estimators=300,
                               max_depth=10,
                               min_samples_split=5,
                               n_jobs=n_jobs),
        # Note: this one is given a fixed n_jobs=8 rather than ``n_jobs``.
        XGBClassifier(n_estimators=300, max_depth=10, n_jobs=8),
        LogisticRegression(C=0.1,
                           solver='lbfgs',
                           penalty='l2',
                           n_jobs=n_jobs),
        BernoulliNB(alpha=5.0),
    ]
    proba = ProbExtractor(base_classifiers)
    polynomial = PolynomialFeatures(degree=2)
    final_search = GridSearchCV(
        LogisticRegression(penalty='l2', random_state=42),
        param_grid=params)

    return Pipeline([
        ('feature_extraction', feature_extraction),
        ('feature_selection', feature_selection),
        ('proba', proba),
        ('polynomial', polynomial),
        ('logistic_regression', final_search),
    ])
Ejemplo n.º 24
0
def test_select_heuristics_regression():
    """Check the fpr, fdr and fwe heuristics on a simple regression problem.

    ``SelectFpr`` is fitted once; for each mode the generic selector must
    produce the same reduced matrix. The five informative features must be
    selected and fewer than three of the remaining ones may slip through.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = GenericUnivariateSelect(f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # The ``np.bool`` alias was removed in NumPy 1.24; the builtin ``bool``
        # names the same dtype.
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert_less(np.sum(support[5:] == 1), 3)
Ejemplo n.º 25
0
def test_clone():
    """Check that ``clone`` yields a separate estimator with equal params.

    Two cases are covered: a scalar ``alpha``, where the parameter dicts are
    compared, and an array-valued ``alpha``, where only object identity is
    checked.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    scalar_selector = SelectFpr(f_classif, alpha=0.1)
    scalar_clone = clone(scalar_selector)
    assert scalar_selector is not scalar_clone
    assert scalar_selector.get_params() == scalar_clone.get_params()

    # Array-valued parameter: cloning must still give a new object.
    array_selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    array_clone = clone(array_selector)
    assert array_selector is not array_clone
Ejemplo n.º 26
0
def test_clone():
    """Verify that ``clone`` creates a correct, independent copy.

    First a selector with a scalar ``alpha`` is cloned and the parameters of
    source and copy are compared with ``assert_equal``; then the same is done
    with an array ``alpha``, checking only that a new object is returned.
    """
    from sklearn.feature_selection import SelectFpr, f_classif

    source = SelectFpr(f_classif, alpha=0.1)
    copy_of_source = clone(source)
    assert source is not copy_of_source
    assert_equal(source.get_params(), copy_of_source.get_params())

    source = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    copy_of_source = clone(source)
    assert source is not copy_of_source
Ejemplo n.º 27
0
def get_feature_extractor(w2v=None):
    """Return a two-step pipeline: feature extraction, then ``SelectFpr``.

    When ``w2v`` is falsy the word vectors are loaded through ``Glove``.
    """
    if not w2v:
        w2v = Glove.load().get_dict()

    steps = [
        ("feature_extraction", get_features(w2v)),
        ('feature_selection', SelectFpr(f_classif)),
    ]
    return Pipeline(steps)
Ejemplo n.º 28
0
def test_select_fpr_regression():
    """Check the fpr heuristic on a simple regression problem.

    ``SelectFpr`` and the generic selector in ``"fpr"`` mode must agree; all
    five informative features must be selected and fewer than three of the
    other fifteen may be picked up.
    """
    X, Y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
    )

    fpr_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = fpr_filter.fit(X, Y).transform(X)

    generic_filter = GenericUnivariateSelect(f_regression, mode="fpr", param=0.01)
    X_r2 = generic_filter.fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = fpr_filter.get_support()
    informative, noise = support[:5], support[5:]
    assert (informative == 1).all()
    assert np.sum(noise == 1) < 3
Ejemplo n.º 29
0
def feature_Univarselection(data, y, Alpha):
    """Run ``SelectFpr(f_classif)`` and return a frame of the original shape.

    Both inputs are sorted by ``'pid'``; the label column is read from ``y``
    under the module-level name ``sep``. The data is reduced with the fitted
    selector and immediately expanded again with ``inverse_transform``, so the
    returned DataFrame keeps the sorted index and all original columns.
    """
    ordered = data.sort_values('pid')
    xx = ordered.values
    xx_label = y.sort_values('pid')[sep].values

    select = SelectFpr(f_classif, alpha=Alpha).fit(xx, xx_label)

    # Reduce, then map back to the full column layout.
    reduced_xx = select.transform(xx)
    restored = select.inverse_transform(reduced_xx)

    return pd.DataFrame(restored,
                        index=ordered.index,
                        columns=ordered.columns)
Ejemplo n.º 30
0
def test_select_heuristics_regression():
    """Check the fpr, fdr and fwe heuristics on a simple regression problem.

    ``SelectFpr`` is fitted once; for each mode the generic selector must
    produce the same reduced matrix. The five informative features must be
    selected and fewer than three of the remaining ones may slip through.
    """
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        # The ``np.bool`` alias was removed in NumPy 1.24; the builtin ``bool``
        # names the same dtype.
        assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
        assert_less(np.sum(support[5:] == 1), 3)
def test_verbose_output_for_select_select_fpr():
    """Check the verbose message emitted for ``SelectFpr(chi2, alpha=0.5)``.

    The output captured by ``_capture_verbose_output_for_model`` on the
    supervised frame must equal the expected p-value message for column 'B'.
    """
    selector = SelectFpr(chi2, alpha=0.5)
    captured = _capture_verbose_output_for_model(selector, use_supervised_df=True)

    message_head = "The p-value of column 'B' (1.0000) is above the "
    message_tail = "specified alpha of 0.5000"
    assert captured == message_head + message_tail
Ejemplo n.º 32
0
def test_select_fpr_regression():
    """Verify the fpr heuristic on an easy regression task.

    The dedicated filter and ``GenericUnivariateSelect`` in ``'fpr'`` mode
    must return identical matrices. The first five features have to be kept,
    and fewer than three of the others may be selected.
    """
    dataset_options = dict(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)
    X, y = make_regression(**dataset_options)

    dedicated = SelectFpr(f_regression, alpha=0.01)
    X_r = dedicated.fit(X, y).transform(X)

    generic = GenericUnivariateSelect(f_regression, mode='fpr', param=0.01)
    X_r2 = generic.fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = dedicated.get_support()
    assert (support[:5] == 1).all()
    false_hits = np.sum(support[5:] == 1)
    assert false_hits < 3
Ejemplo n.º 33
0
def selectFpr(args):
    """Build a scikit-learn ``SelectFpr`` from a positional argument list.

    ``SelectFpr`` selects the features whose p-values are below ``alpha``
    according to a false-positive-rate test.

    Parameters
    ----------
    args : sequence
        ``args[1]`` is the alpha threshold (anything ``float()`` accepts) and
        ``args[2]`` is the name of the score function: ``"chi2"`` or
        ``"f_classif"``. ``args[0]`` is not read.

    Returns
    -------
    SelectFpr
        An unfitted selector.

    Raises
    ------
    ValueError
        If ``args[2]`` names an unsupported score function. Previously this
        case fell through and failed with ``UnboundLocalError``.
    """
    score_name = args[2]
    if score_name == "chi2":
        score_func = chi2
    elif score_name == "f_classif":
        score_func = f_classif
    else:
        raise ValueError(
            "unsupported score function {!r}; expected 'chi2' or "
            "'f_classif'".format(score_name))

    return SelectFpr(score_func, alpha=float(args[1]))
 def test_select_fpr_float(self):
     """Convert a fitted ``SelectFpr`` to ONNX with a single-row float input.

     The converted model must not be None; the dump step is allowed to fail
     under the version condition passed as ``allow_failure``.
     """
     features = np.array(
         [[1, 2, 3, 1], [0, 3, 1, 4], [3, 5, 6, 1], [1, 2, 1, 5]],
         dtype=np.float32,
     )
     targets = np.array([0, 1, 0, 1])

     selector = SelectFpr()
     selector.fit(features, targets)

     input_spec = [("input", FloatTensorType([1, features.shape[1]]))]
     onnx_model = convert_sklearn(selector, "select fpr", input_spec)
     self.assertTrue(onnx_model is not None)

     # Passed through as a string; how it is evaluated is up to the helper.
     failure_condition = (
         "StrictVersion(onnx.__version__)"
         " < StrictVersion('1.2') or "
         "StrictVersion(onnxruntime.__version__)"
         " <= StrictVersion('0.2.1')"
     )
     dump_data_and_model(
         features,
         selector,
         onnx_model,
         basename="SklearnSelectFpr",
         allow_failure=failure_condition,
     )
Ejemplo n.º 35
0
def feature_method_selection(data, label, fsname):
    """
    Select features with the method named by 'fsname'.

    :param data: feature matrix handed to the chosen model
    :param label: target values handed to the chosen model's ``fit``
    :param fsname: one of 'variance_threshold', 'select_kbest', 'rfe',
        'rfecv', 'RandLasso', 'linear_svc', 'tree', 'fclassif', 'pearsonr',
        'RandForReg'
    :return: new_data, the data after ``fit`` + ``transform`` of the chosen
        model. For 'pearsonr' no model is involved and the raw result of
        ``pearsonr(data, label)`` is returned instead.
    :raises ValueError: if 'fsname' is not one of the options above
    """
    if fsname == 'variance_threshold':  # drop features that barely vary; discrete values
        # The original note here said "th=1", but no threshold is passed.
        model = VarianceThreshold()
        return model.fit_transform(data)

    elif fsname == 'select_kbest':
        # Feature values must be non-negative; chi2 is for classification.
        model = SelectKBest(chi2, k=10)

    elif fsname == 'rfe':  # recursive elimination; very time-consuming
        svc = SVC(kernel='linear', C=1)
        model = RFE(estimator=svc, n_features_to_select=10, step=1)

    elif fsname == 'rfecv':  # RFE with cross-validation; label must be numeric
        svc = SVC(kernel="linear")
        # Bound to `model` (it used to be bound to an unused `rfecv` name,
        # which left `model` undefined for the fit below).
        # NOTE(review): StratifiedKFold(label, 1) is the legacy call form and
        # asks for a single fold -- confirm against the installed sklearn.
        model = RFECV(estimator=svc, step=1, cv=StratifiedKFold(label, 1),
                      scoring='accuracy')

    elif fsname == 'RandLasso':
        # Randomised re-selection; original note: "cannot perform reduce
        # with flexible type".
        model = RandomizedLogisticRegression()

    elif fsname == 'linear_svc':
        model = LinearSVC()  # no importance

    elif fsname == 'tree':
        model = ExtraTreesClassifier()

    elif fsname == 'fclassif':
        # Default score function is f_classif; larger values mean more
        # useful features.
        model = SelectFpr()

    elif fsname == 'pearsonr':  # label must be numeric
        label = turn_label_2num(label)
        # The result is the correlation between the two samples. There is no
        # model to fit/transform here, so return the result directly instead
        # of falling through to an undefined `model`.
        return pearsonr(data, label)

    elif fsname == 'RandForReg':  # label must be numeric
        label = turn_label_2num(label)
        model = RandomForestRegressor()

    else:
        logging.error('ERROR: feature selection option is wrong')
        # Fail with a clear error rather than an UnboundLocalError below.
        raise ValueError(
            'Unknown feature selection option: %r' % (fsname,))

    model.fit(data, label)
    new_data = model.transform(data)  # the selected (important) data

    return new_data
Ejemplo n.º 36
0
def fval(df, y, alpha, k):
    """Keep the columns that pass both a chi2 and an f_classif FPR filter.

    :param df: dataframe
    :param y: label
    :param alpha: hyper-parameter [alpha], handed to both SelectFpr filters
    :param k: number of select features (not read by this function)
    :return: dataframe restricted to the columns selected by both filters
    """
    columns = df.columns

    # chi2 is fitted on a standardised, then min-max scaled copy of df;
    # f_classif is fitted on df as given.
    rescaled = MinMaxScaler().fit_transform(scale(df))
    chi2_filter = SelectFpr(chi2, alpha=alpha).fit(rescaled, y)
    f_filter = SelectFpr(f_classif, alpha=alpha).fit(df, y)

    chi2_mask = chi2_filter.get_support()
    f_mask = f_filter.get_support()

    chi2_names = [name for name, keep in zip(columns, chi2_mask) if keep]
    logging.info(
        'Chi2 selected {} features {}.'.format(chi2_mask.sum(), chi2_names))

    f_names = [name for name, keep in zip(columns, f_mask) if keep]
    logging.info(
        'F_classif selected {} features {}.'.format(f_mask.sum(), f_names))

    # A column survives only when both filters kept it.
    both_mask = chi2_mask & f_mask
    logging.info(
        'Chi2 & F_classif selected {} features'.format(both_mask.sum()))

    kept = [name for name, keep in zip(columns, both_mask) if keep]
    logging.info(kept)
    return df[kept]
Ejemplo n.º 37
0
def multisplit(skf, X, y, stepsize=1000):
    """Cross-validate a two-stage classifier built on column slices of X.

    For every (train_index, test_index) pair yielded by ``skf``:

    1. one ``plib.classif`` model is trained per ``stepsize``-wide column
       slice of X, and the ``decision_function`` outputs of all slices are
       concatenated along axis 1;
    2. a default ``SelectFpr`` is fitted on those stacked outputs and an
       ``AdaBoostClassifier(n_estimators=100)`` is trained on what it keeps;
    3. the same slice models, selector and classifier are applied to the
       test rows and the predictions are compared with ``y[test_index]``.

    Target and prediction are printed for each fold.

    :param skf: iterable of (train_index, test_index) pairs that also
        exposes ``n_folds``
    :param X: data indexed as ``X[rows, start:stop]``; ``len(X[0])`` is used
        as the number of columns
    :param y: labels, indexed with the fold index arrays
    :param stepsize: number of columns per slice
    :return: sum of the per-fold accuracies divided by ``skf.n_folds``
    """
    total_score = 0
    for train_index, test_index in skf:
        column_starts = range(0, len(X[0]), stepsize)
        wl = []
        pred1 = np.matrix([])
        # Training
        for x in column_starts:
            clf1 = plib.classif(X[train_index, x:x + stepsize], y[train_index])
            tmp_p = np.matrix(
                clf1.decision_function(X[train_index, x:x + stepsize]))
            if pred1.size == 0:
                pred1 = tmp_p
            else:
                pred1 = np.concatenate((pred1, tmp_p), axis=1)
            wl.append(clf1)
        # Alternative tried: SelectKBest(f_classif, k=5).fit(pred1, y[train_index])
        selectf = SelectFpr().fit(pred1, y[train_index])
        clf3 = AdaBoostClassifier(n_estimators=100)
        # Alternatives tried: svm.SVC(class_weight='auto'),
        # RandomForestClassifier(n_estimators=20)
        clf3.fit(selectf.transform(pred1), y[train_index])
        # Testing: wl holds one model per slice, in slice order.
        predtest = np.matrix([])
        for slice_clf, x in zip(wl, column_starts):
            tmp_p = np.matrix(
                slice_clf.decision_function(X[test_index, x:x + stepsize]))
            if predtest.size == 0:
                predtest = tmp_p
            else:
                predtest = np.concatenate((predtest, tmp_p), axis=1)
        # Final prediction
        predfinal = clf3.predict(selectf.transform(predtest))
        # Single-argument print(...) emits the same text as the former
        # Python 2 print statements and is also valid on Python 3.
        print("%s %s" % ("Target     : ", y[test_index]))
        print("%s %s" % ("Prediction : ", predfinal))
        matchs = np.equal(predfinal, y[test_index])
        score = np.divide(np.sum(matchs), np.float64(matchs.size))
        total_score = score + total_score
    return np.divide(total_score, skf.n_folds)
Ejemplo n.º 38
0
def correlation(df, y, threshold, alpha, corr_k_pass, mode):
    """Feature selection based on correlation between features

    Columns are visited in ranking order. A column is kept unless an
    earlier-visited column already marked it as correlated; every visited
    column marks all columns whose correlation with it is above
    ``threshold`` or below ``-threshold``.

    :param df: dataframe
    :param y: label
    :param threshold: select feature threshold
    :param alpha: hyper-parameter [alpha]; when truthy, columns are ranked
        by descending ``SelectFpr(...).scores_`` (fitted on the standardised,
        then min-max scaled data), otherwise the original column order is
        used
    :param corr_k_pass: correlation threshold; when truthy, at most this
        many kept columns are returned
    :param mode: feature selection based on static method, "chi2" or "f"
    :return: dataframe of feature selected
    :raises Exception: if ``mode`` is neither "chi2" nor "f"
    """
    df_out = df.corr()
    col_pass = []
    # A set gives constant-time membership tests and de-duplicates for free.
    del_col = set()
    if mode == "chi2":
        filter_slect = chi2
    elif mode == "f":
        filter_slect = f_classif
    else:
        # The format string used to lack its %s placeholder, so this line
        # raised TypeError instead of the intended message.
        raise Exception("No mode: %s" % mode)
    if alpha:
        x_bin = MinMaxScaler().fit_transform(scale(df))
        fpval = SelectFpr(filter_slect, alpha=alpha).fit(x_bin, y).scores_
        df_sort_fval = pd.DataFrame({
            "col": list(df.columns),
            "fval": list(fpval)
        })
        df_sort_fval = df_sort_fval.sort_values(by=['fval'], ascending=False)
        ranking_col = list(df_sort_fval['col'])
    else:
        ranking_col = list(df.columns)

    for col in ranking_col:
        if col not in del_col:
            col_pass.append(col)
        # Kept or not, every visited column marks its correlated columns.
        col_corr = df_out[col]
        del_col.update(
            col_corr[(col_corr > threshold) | (col_corr < -threshold)].index)
    del df_out

    logging.info("Del col : %d" % len(del_col))
    logging.info("Passed col : %d" % len(col_pass))
    if corr_k_pass and len(col_pass) > corr_k_pass:
        col_pass = col_pass[:corr_k_pass]
    return df[col_pass]
Ejemplo n.º 39
0
def train_DT(
    feats=None,
    labels=[],
    feature_selector=SelectFpr(
        chi2, alpha=0.05),  # Use None to stop feature selection
    cv=5):  # Number of folds used in cross-validation
    """Vectorize, select features, grid-search and train a decision tree.

    Steps: ``DictVectorizer(sparse=False)`` on ``feats``, RFE feature
    selection with a ``MultinomialNB`` estimator, a grid search over
    ``DecisionTreeClassifier`` hyper-parameters, a macro-F1 cross-validation
    report printed to stdout, and a final fit on the selected features.

    NOTE(review): the ``feature_selector`` argument is overwritten by the
    RFE selector below before it is ever read, so neither a caller-supplied
    selector nor ``None`` has any effect. Kept as-is so existing results do
    not change -- confirm which behaviour is intended.

    :param feats: input for ``DictVectorizer.fit_transform`` (the inline
        comment below calls these count dictionaries)
    :param labels: target labels
    :param feature_selector: currently ignored (see note above)
    :param cv: number of folds used in cross-validation
    :return: tuple ``(mod, vectorizer, feature_selector)`` -- the trained
        tree, the fitted vectorizer and the fitted RFE selector
    """
    # Map the count dictionaries to a sparse feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    feats = vectorizer.fit_transform(feats)
    ##### FEATURE SELECTION
    feat_matrix = feats
    feature_selector = RFE(estimator=MultinomialNB(),
                           n_features_to_select=None,
                           step=1,
                           verbose=0)
    feat_matrix = feature_selector.fit_transform(feats, labels)

    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = DecisionTreeClassifier()
    # Parameters to grid-search over:
    parameters = {
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 0.25, 'log2'],
        'min_samples_split': [2, 5, 10]
    }
    # Cross-validation grid search to find the best hyper-parameters:
    clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = DecisionTreeClassifier(splitter=params['splitter'],
                                 max_features=params['max_features'],
                                 min_samples_split=params['min_samples_split'])
    ##### ASSESSMENT
    scores = cross_val_score(mod,
                             feat_matrix,
                             labels,
                             cv=cv,
                             scoring="f1_macro")
    # Single-argument print(...) emits the same text as the former Python 2
    # print statements and is also valid on Python 3.
    print('%s %s' % ('Best model', mod))
    print('%s features selected out of %s total' % (feat_matrix.shape[1],
                                                    feats.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)
    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector)
Ejemplo n.º 40
0
def train_decisiontree_FPR(configurationname, train_data, score_function, undersam=False, oversam=False, export=False):
    """Fit a SelectFpr filter and a decision tree on the training data.

    Progress messages and class counts before/after resampling are printed.

    :param configurationname: label used in the log line and in the exported
        ``.dot`` file name
    :param train_data: 3-tuple; the first two items are used as features and
        targets, the third is unpacked but not used
    :param score_function: score function handed to ``SelectFpr``
    :param undersam: only this flag set -> RepeatedEditedNearestNeighbours
    :param oversam: only this flag set -> SMOTE; both flags set -> SMOTEENN
    :param export: when truthy, write the tree with ``export_graphviz`` and
        call ``transform`` with the selected feature indices
    :return: tuple ``(selector, dtc)``
    """
    print("Training with configuration " + configurationname)
    X_train, y_train, _ = train_data
    dtc = DecisionTreeClassifier(random_state=0)

    print("Feature Selection")
    selector = SelectFpr(score_function)
    fitted_selector = selector.fit(X_train, y_train)
    X_train = selector.transform(X_train)

    fitted_ids = list(fitted_selector.get_support(indices=True))

    print("Apply Resampling")
    print(Counter(y_train))
    # The flag combinations are mutually exclusive, so pick one resampler.
    # (A SMOTENC variant over all feature indices was tried for the
    # oversampling-only case and left disabled.)
    if undersam and oversam:
        resampler = SMOTEENN(random_state=0)
    elif undersam:
        resampler = RepeatedEditedNearestNeighbours()
    elif oversam:
        resampler = SMOTE(random_state=42)
    else:
        resampler = None
    if resampler is not None:
        X_train, y_train = resampler.fit_resample(X_train, y_train)
    print(Counter(y_train))

    print("Train Classifier")
    dtc = dtc.fit(X_train, y_train, check_input=True)

    if export:
        dot_path = DATAP + "/temp/trees/sltree_" + configurationname + ".dot"
        export_graphviz(dtc, out_file=dot_path, filled=True)
        transform(fitted_ids)

    self_accuracy = dtc.score(X_train, y_train)
    print("Self Accuracy: " + str(self_accuracy))

    return selector, dtc
Ejemplo n.º 41
0
def multisplit(skf,X,y,stepsize=1000):
    """Cross-validate a two-stage classifier built on column slices of X.

    For every (train_index, test_index) pair yielded by ``skf``, one
    ``plib.classif`` model is trained per ``stepsize``-wide column slice,
    their ``decision_function`` outputs are concatenated along axis 1,
    filtered with a default ``SelectFpr`` and fed to an
    ``AdaBoostClassifier(n_estimators=100)``. The same chain is then applied
    to the test rows. Target and prediction are printed for each fold.

    :param skf: iterable of (train_index, test_index) pairs that also
        exposes ``n_folds``
    :param X: data indexed as ``X[rows, start:stop]``; ``len(X[0])`` is used
        as the number of columns
    :param y: labels, indexed with the fold index arrays
    :param stepsize: number of columns per slice
    :return: sum of the per-fold accuracies divided by ``skf.n_folds``
    """
    total_score = 0
    for train_index, test_index in skf:
        slice_starts = range(0, len(X[0]), stepsize)
        wl = []
        pred1 = np.matrix([])
        # Training
        for x in slice_starts:
            clf1 = plib.classif(X[train_index, x:x + stepsize], y[train_index])
            tmp_p = np.matrix(clf1.decision_function(X[train_index, x:x + stepsize]))
            if pred1.size == 0:
                pred1 = tmp_p
            else:
                pred1 = np.concatenate((pred1, tmp_p), axis=1)
            wl.append(clf1)
        # Alternative tried: SelectKBest(f_classif, k=5).fit(pred1, y[train_index])
        selectf = SelectFpr().fit(pred1, y[train_index])
        clf3 = AdaBoostClassifier(n_estimators=100)
        # Alternatives tried: svm.SVC(class_weight='auto'),
        # RandomForestClassifier(n_estimators=20)
        clf3.fit(selectf.transform(pred1), y[train_index])
        # Testing: wl holds one model per slice, in slice order.
        predtest = np.matrix([])
        for slice_clf, x in zip(wl, slice_starts):
            tmp_p = np.matrix(slice_clf.decision_function(X[test_index, x:x + stepsize]))
            if predtest.size == 0:
                predtest = tmp_p
            else:
                predtest = np.concatenate((predtest, tmp_p), axis=1)
        # Final prediction
        predfinal = clf3.predict(selectf.transform(predtest))
        # Single-argument print(...) emits the same text as the former
        # Python 2 print statements and is also valid on Python 3.
        print("%s %s" % ("Target     : ", y[test_index]))
        print("%s %s" % ("Prediction : ", predfinal))
        matchs = np.equal(predfinal, y[test_index])
        score = np.divide(np.sum(matchs), np.float64(matchs.size))
        total_score = score + total_score
    return np.divide(total_score, skf.n_folds)
from sklearn.feature_selection import VarianceThreshold, SelectFpr, f_regression

# import data of all Count and Position features. Training and test sets altogether
dfCountfeatures = pd.read_csv('data/CountingAndPositionFeatures_TrainAndTestData.csv')
dfTrainRaw = pd.read_csv('data/train.csv')

# get only training data
TrainQueryIDs = dfTrainRaw["id"]
relevance = dfTrainRaw["relevance"]
dfCountfeatures_TrainSet = dfCountfeatures[dfCountfeatures["id"].isin(TrainQueryIDs)]
# NOTE(review): relevance is in train.csv row order while the feature rows keep
# the feature file's order -- presumably both files list the ids in the same
# order; confirm.
#select these features which have non-zero variance
selector = VarianceThreshold()
selector.fit_transform(dfCountfeatures_TrainSet).shape # only one feature with zero variance - shape (74067L, 262L)

# select feature based on p-values from univariate regression with target feature (relevance)
# The selector is fitted on the frame WITHOUT the "id" column, so the positions
# returned by get_support() refer to that frame's columns. Looking them up in
# dfCountfeatures.columns (which still contains "id") would shift the names.
dfCountfeatures_TrainSet_noId = dfCountfeatures_TrainSet.drop("id", axis = 1)
selector2= SelectFpr(f_regression, alpha = 0.01)
selector2.fit(dfCountfeatures_TrainSet_noId, relevance)
selector2.get_support(indices=True).size # left 226 features out of 262 with p-value <=1%
# get titles of features which were selected
selectedCountfeatures = dfCountfeatures_TrainSet_noId.columns[selector2.get_support(indices=True)]

# check correlation amongst features
corrReduced = dfCountfeatures_TrainSet[selectedCountfeatures].corr()
# keep only the strict lower triangle so each pair is counted once
corrReduced.iloc[:,:] = np.tril(corrReduced.values, k=-1)
corrReduced =corrReduced.stack()
# get pairs of features which are highly correlated
highlyCorrelated = corrReduced[corrReduced.abs()>0.8]
highlyCorrelated.size # 578 pairs correlated more than 80% out of 25.425
# First-level names of the correlated pairs; get_level_values works across
# pandas versions, unlike the MultiIndex `labels` attribute.
correlatedFeatureNames = set(highlyCorrelated.index.get_level_values(0))
len(correlatedFeatureNames) # 172 features to be removed due to high correlation with other features
# get feature titles which will be used in training the model after removing highly correlated features
# positions (within selectedCountfeatures) of the features to remove
indices = set(j for j, i in enumerate(selectedCountfeatures.tolist()) if i in correlatedFeatureNames)
selectedCountfeatures2 = [i for j, i in enumerate(selectedCountfeatures.tolist()) if j not in indices]
Ejemplo n.º 43
0
# Fragment of a plotting script: compares per-feature univariate scores with
# linear-SVM weights as bar charts. `iris`, `x`, `pl`, `np` and `svm` are
# defined outside this fragment.
y = iris.target

################################################################################
pl.figure(1)
pl.clf()

# one bar position per feature (last axis of x)
x_indices = np.arange(x.shape[-1])

################################################################################
# Univariate feature selection
from sklearn.feature_selection import SelectFpr, f_classif
# As a scoring function, we use a F test for classification.
# SelectFpr is configured with alpha=0.1 (a p-value threshold, not a
# "10% most significant features" rule); the bars below plot the p-values
# themselves rather than the selection mask.

selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(x, y)
# NOTE(review): _pvalues is a private attribute -- presumably exposed as
# pvalues_ in newer scikit-learn releases; confirm for the installed version.
scores = -np.log10(selector._pvalues)
# normalise so the largest bar has height 1
scores /= scores.max()
pl.bar(x_indices-.45, scores, width=.3,
        label=r'Univariate score ($-Log(p_{value})$)',
        color='g')

################################################################################
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(x, y)

# per-feature weight: squared coefficients summed over axis 0, then
# normalised so the largest is 1
svm_weights = (clf.coef_**2).sum(axis=0)
svm_weights /= svm_weights.max()
pl.bar(x_indices-.15, svm_weights, width=.3, label='SVM weight',
Ejemplo n.º 44
0
# Fragment of a script that fits several univariate selectors on X, y
# (defined outside this fragment) and prints their scores, p-values and
# support masks. print(...) with a single argument emits the same text as
# the former Python 2 print statements and is also valid on Python 3.

#SelectPercentile -- chi2
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
X_fitted_4 = SelectPercentile(chi2, percentile=50).fit(X,y)
print("SelectPercentile -- chi2")
print(X_fitted_4.scores_)
print(X_fitted_4.pvalues_)
print(X_fitted_4.get_support())
X_transformed_4 = X_fitted_4.transform(X)
print(X_transformed_4.shape)

#SelectFpr --- chi2
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
# NOTE(review): this alpha looks like a value copied from printed p-values --
# confirm where it comes from.
X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X,y)
print("SelectFpr --- chi2")
print(X_fitted_5.scores_)
print(X_fitted_5.pvalues_)
print(X_fitted_5.get_support())
X_transformed_5 = X_fitted_5.transform(X)
print(X_transformed_5.shape)

#SelectFpr --- f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import f_classif
# NOTE(review): as above, this alpha is presumably taken from observed
# p-values -- confirm.
X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31 ).fit(X,y)
print("SelectFpr --- f_classif")
print(X_fitted_6.scores_)
print(X_fitted_6.pvalues_)
print(X_fitted_6.get_support())
Ejemplo n.º 45
0
# Fragment of a script: builds a two-class data set from two row masks of
# pdc.objFeatures, computes univariate scores and trains a linear SVM.
# `pdc`, `tr1_mask`, `tr2_mask`, `featureIds`, `np` and `numpy` are defined
# outside this fragment.
data1 = pdc.objFeatures[tr1_mask][:, featureIds]
data2 = pdc.objFeatures[tr2_mask][:, featureIds]
data = np.vstack([data1, data2])
# label 0 for the tr1_mask rows, label 1 for the tr2_mask rows
labels1 = np.zeros((data1.shape[0],))
labels2 = np.ones((data2.shape[0],))
labels = np.hstack([labels1, labels2])
# training subset: the first (up to) 1000 rows of data1 and the last
# (up to) 1000 rows of data2
X1 = data1[:1000]
X2 = data2[-1000:]
X = np.vstack([X1, X2])
Y1 = labels1[:X1.shape[0]]
Y2 = labels2[:X2.shape[0]]
Y = np.hstack([Y1, Y2])

from sklearn.feature_selection import SelectFpr, f_classif

selector = SelectFpr(f_classif, alpha=0.1)
selector.fit(X, Y)
# NOTE(review): _pvalues is a private attribute -- presumably exposed as
# pvalues_ in newer scikit-learn releases; confirm for the installed version.
scores = -np.log10(selector._pvalues)
# normalise so the largest score is 1
scores /= scores.max()

from sklearn import svm
# Compare to the weights of an SVM
clf = svm.SVC(kernel='linear')
clf.fit(X, Y)
# NOTE(review): the label says "error" but the value printed is clf.score(),
# presumably an accuracy -- confirm. The full data set is scored, which
# includes the training rows.
print 'SVM error:', clf.score(data, labels)
pred = clf.predict(data)
# number of correct predictions, then the same as a fraction of all rows
match = numpy.sum(pred == labels)
print match, labels.shape[0]
print match / float(labels.shape[0])

# per-feature weight: squared coefficients summed over axis 0
svm_weights = (clf.coef_**2).sum(axis=0)