Example #1
def fval(df, y, alpha, k):
    """Feature Selection based on F-Value

    :param df: dataframe
    :param y: label
    :param alpha: hyper-parameter [alpha]
    :param k: number of select features
    :return: dataframe of feature selected
    """
    x_bin = MinMaxScaler().fit_transform(scale(df))  # chi2 requires non-negative values
    select_chi2 = SelectFpr(chi2, alpha=alpha).fit(x_bin, y)
    select_f_classif = SelectFpr(f_classif, alpha=alpha).fit(df, y)

    chi2_selected = select_chi2.get_support()
    f_classif_selected = select_f_classif.get_support()
    chi2_selected_features = [
        f for i, f in enumerate(df.columns) if chi2_selected[i]
    ]
    logging.info('Chi2 selected {} features {}.'.format(
        chi2_selected.sum(), chi2_selected_features))

    f_classif_selected_features = [
        f for i, f in enumerate(df.columns) if f_classif_selected[i]
    ]
    logging.info('F_classif selected {} features {}.'.format(
        f_classif_selected.sum(), f_classif_selected_features))
    selected = chi2_selected & f_classif_selected
    logging.info('Chi2 & F_classif selected {} features'.format(
        selected.sum()))
    features = [f for f, s in zip(df.columns, selected) if s]
    logging.info(features)
    return df[features]
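A minimal usage sketch for fval (the synthetic data, the alpha value, and the unused k argument are illustrative assumptions; the imports mirror what the function body expects):

import logging

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFpr, chi2, f_classif
from sklearn.preprocessing import MinMaxScaler, scale

logging.basicConfig(level=logging.INFO)

X, y = make_classification(n_samples=200, n_features=10, n_informative=4, random_state=0)
df = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)])

# keep only the features that pass both the chi2 and the f_classif FPR tests
df_selected = fval(df, y, alpha=0.05, k=None)
print(df_selected.columns.tolist())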
Example #2
    def fit(self, X, y, sample_weight=None):
        if self.allow_missing_ids is None:
            self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

        if self.univariate_feature_selection:
            # univariate feature selection
            feature_selector = SelectFpr(alpha=0.05).fit(
                X[:, ~self.allow_missing_ids], y)
            self.support = np.ones(X.shape[1]).astype(bool)
            self.support[~self.allow_missing_ids] = feature_selector.get_support()
            X = X[:, self.support]
        else:
            self.support = np.ones(X.shape[1]).astype(bool)

        # fit the model
        super().fit(X, y, [len(X)], sample_weight=sample_weight)

        # get the mean of z for each level of y
        self.label_encoder = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder.classes_
        z = super().predict(X).astype(float)
        self.z_means = np.array(
            [z[y == cl].mean() for cl in self.label_encoder.classes_])
        return self
Example #3
class f_regressionFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFPRPrim, self).__init__(name='f_regressionFPR')
        self.id = 29
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the pvalues below alpha based on a FPR test with F-value between label/feature for regression tasks. FPR test stands for False Positive Rate test. It controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        final_output = {0: output}
        return final_output
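For context, SelectFpr with f_regression simply keeps every feature whose univariate p-value falls below alpha; a minimal sketch of that equivalence (the data and alpha value here are illustrative assumptions):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFpr, f_regression

X, y = make_regression(n_samples=200, n_features=20, n_informative=5, random_state=0)
selector = SelectFpr(f_regression, alpha=0.05).fit(X, y)

# the support mask is exactly the per-feature p-value threshold
assert np.array_equal(selector.get_support(), selector.pvalues_ < 0.05)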
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
Example #5
class UnivariateSelectChiFPRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFPRPrim, self).__init__(name='UnivariateSelectChiFPR')
        self.id = 27
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = "Filter: Select the pvalues below alpha based on a FPR test with Chi-square. FPR test stands for False Positive Rate test. It controls the total amount of false detections."
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFpr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']), columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
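Because chi2 only accepts non-negative feature values, data is typically min-max scaled (or binned/one-hot encoded) before a primitive like the one above; a small hedged sketch (the iris data and the alpha value are illustrative choices, not from the original source):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFpr, chi2
from sklearn.preprocessing import MinMaxScaler

X, y = load_iris(return_X_y=True)
X_scaled = MinMaxScaler().fit_transform(X)  # chi2 requires non-negative values
X_selected = SelectFpr(chi2, alpha=0.05).fit_transform(X_scaled, y)
print(X_selected.shape)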
def test_select_fpr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fpr heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFpr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="fpr", param=0.0001).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def select_with_fpr(train, test):
  train_data = train.drop('ID', axis=1)
  test_data = test.drop('ID', axis=1)

  train_y = train_data['TARGET']
  train_X = train_data.drop('TARGET', axis=1)

  fpr = SelectFpr(alpha=0.001)

  features = fpr.fit_transform(train_X, train_y)

  print('Fpr selected {} features.'.format(features.shape[1]))

  col_mask = fpr.get_support()
  columns = train_X.columns.values
  features = [col for col, selected in zip(columns, col_mask) if selected]

  new_train = train[['ID'] + features + ['TARGET']]
  new_train.to_csv('train_after_fpr.csv')

  new_test = test[['ID'] + features]
  new_test.to_csv('test_after_fpr.csv')
def selectionFwe(X, y, paramlist):
    # SelectFpr filters by a p-value threshold (alpha), not by a feature count,
    # so the hyper-parameter is read as alpha here
    alpha = paramlist['alpha']
    fwe = SelectFpr(chi2, alpha=alpha)
    Xnew = fwe.fit_transform(X, y)
    indexarr = fwe.get_support(indices=True)
    scores_arr = fwe.scores_
    return [Xnew, indexarr, scores_arr]
Example #11
    def fit(self, X, y, sample_weight=None):
        self.label_encoder = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder.classes_
        y = self.label_encoder.transform(y)

        if self.allow_missing_ids is None:
            self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

        if self.univariate_feature_selection:
            # univariate feature selection
            feature_selector = SelectFpr(alpha=0.05).fit(
                X[:, ~self.allow_missing_ids], y)
            self.support = np.ones(X.shape[1]).astype(bool)
            self.support[~self.allow_missing_ids] = feature_selector.get_support()
            X = X[:, self.support]
            if self.bounds is not None:
                self.bounds = [
                    self.bounds[ii] for ii in range(len(self.bounds))
                    if self.support[ii]
                ]
        else:
            self.support = np.ones(X.shape[1]).astype(bool)

        def func(w, X, y, alpha, sw):
            out, grad = _logistic_loss_and_grad(w, X, y, 0, sw)
            out_penalty = alpha * np.sum(np.abs(w[:-1]))
            grad_penalty = np.r_[alpha * np.sign(w[:-1]), 0]
            return out + out_penalty, grad + grad_penalty

        y2 = np.array(y)
        y2[y2 == 0] = -1
        w0 = np.r_[np.random.randn(X.shape[1]) / 10, 0.]
        if self.bounds is None:
            method = 'BFGS'
        else:
            method = 'L-BFGS-B'
        if sample_weight is None:
            if self.class_weight is not None:
                sample_weight = get_sample_weights(
                    y, class_weight=self.class_weight)
            else:
                sample_weight = np.ones(len(X))
        sample_weight /= (np.mean(sample_weight) * len(X))
        # scipy.optimize.minimize expects bounds=None when no bounds are given
        bounds = (self.bounds + [(None, None)]
                  if self.bounds is not None else None)
        self.opt_res = minimize(func,
                                w0,
                                method=method,
                                jac=True,
                                args=(X, y2, 1. / self.C, sample_weight),
                                bounds=bounds,
                                options={
                                    "gtol": self.tol,
                                    "maxiter": self.max_iter
                                })
        self.coef_ = np.zeros(len(self.support))
        self.coef_[self.support] = self.opt_res.x[:-1]
        self.coef_ = self.coef_.reshape(1, -1)
        self.intercept_ = self.opt_res.x[-1].reshape(1, )
        return self
Example #12
def SelectFpr_selector(data, target, sf):
    selector = SelectFpr(score_func=sf)
    data_new = selector.fit_transform(data.values, target.values.ravel())
    outcome = selector.get_support(True)
    new_features = []  # names of the features kept by the FPR test
    for ind in outcome:
        new_features.append(data.columns.values[ind])
    return pd.DataFrame(data_new, columns=new_features)
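A possible call pattern for the helper above, assuming SelectFpr is imported as in the example (the toy data and the choice of f_classif as the score function are illustrative assumptions):

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import f_classif

X, y = make_classification(n_samples=100, n_features=8, random_state=0)
data = pd.DataFrame(X, columns=[f'col{i}' for i in range(8)])
target = pd.DataFrame(y, columns=['label'])

reduced = SelectFpr_selector(data, target, sf=f_classif)
print(reduced.columns.tolist())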
Example #13
    def fit(self, X, y, sample_weight=None):
        self.fitted_ = False
        if self.allow_missing_ids is None:
            self.allow_missing_ids = np.zeros(X.shape[1]).astype(bool)

        Xold = np.array(X)
        if self.univariate_feature_selection:
            # univariate feature selection
            feature_selector = SelectFpr(alpha=0.05).fit(
                X[:, ~self.allow_missing_ids], y)
            self.support = np.ones(X.shape[1]).astype(bool)
            self.support[~self.allow_missing_ids] = feature_selector.get_support()
            X = X[:, self.support]
            self.allow_missing_ids = self.allow_missing_ids[self.support]
        else:
            self.support = np.ones(X.shape[1]).astype(bool)

        if sample_weight is None:
            if self.class_weight is not None:
                sample_weight = get_sample_weights(
                    y, class_weight=self.class_weight)
            else:
                sample_weight = np.ones(len(X))
        sample_weight /= (np.mean(sample_weight) * len(X))

        # generate pairs
        X2, y2, sw2 = self._generate_pairs(X, y, sample_weight)
        sw2 = sw2 / sw2.mean()
        if self.verbose:
            print('Generated %d pairs from %d samples' % (len(X2), len(X)))

        # fit the model
        if self.estimator.bounds is not None:
            self.estimator.bounds = [
                self.estimator.bounds[ii]
                for ii in range(len(self.estimator.bounds)) if self.support[ii]
            ]
        self.estimator.fit(X2, y2, sample_weight=sw2)

        # get the mean of z for each level of y
        self.label_encoder = LabelEncoder().fit(y)
        self.classes_ = self.label_encoder.classes_
        z = self.predict_z(Xold)
        self.z_means = np.array(
            [z[y == cl].mean() for cl in self.label_encoder.classes_])

        self.coef_ = np.zeros(len(self.support))
        self.coef_[self.support] = self.estimator.coef_.flatten()
        self.coef_ = self.coef_.reshape(1, -1)
        self.intercept_ = self.estimator.intercept_
        self.fitted_ = True
        return self
Example #16
def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0, noise=10)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(
            f_regression, mode=mode, param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert_less(np.sum(support[5:] == 1), 3)
Example #17
def test_select_fpr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fpr heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='fpr',
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert (support[:5] == 1).all()
    assert np.sum(support[5:] == 1) < 3
print "SelectPercentile -- chi2"
print X_fitted_4.scores_
print X_fitted_4.pvalues_
print X_fitted_4.get_support()
X_transformed_4 = X_fitted_4.transform(X)
print X_transformed_4.shape

#SelectFpr --- chi2
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2

X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y)
print "SelectFpr --- chi2"
print X_fitted_5.scores_
print X_fitted_5.pvalues_
print X_fitted_5.get_support()
X_transformed_5 = X_fitted_5.transform(X)
print X_transformed_5.shape

#SelectFpr --- f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import f_classif

X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y)
print "SelectFpr --- f_classif"
print X_fitted_6.scores_
print X_fitted_6.pvalues_
print X_fitted_6.get_support()
X_transformed_6 = X_fitted_6.transform(X)
print X_transformed_6.shape
Example #19
# impute missing values with the column mean (Imputer was removed from
# sklearn.preprocessing; SimpleImputer is its replacement)
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
imputer = imputer.fit(X)
X = imputer.transform(X)

#feature scaling
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
X_norm = mms.fit_transform(X)

# Univariate feature selection using false positive rate
from sklearn.feature_selection import SelectFpr, f_classif
X_fpr = SelectFpr(f_classif, alpha=0.05).fit(X, y)

# Get indices of selected features
X_fpr.get_support(indices=True)

# select features using false positive rate method
X_fpr = SelectFpr(f_classif, alpha=0.05).fit_transform(X, y)
print(X_fpr.shape)

# Splitting the dataset into Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_fpr,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# fitting logistic regression to Training Set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
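The snippet stops right after constructing the classifier; a plausible continuation (an assumption, not part of the original source) would be to fit it on the FPR-selected features and evaluate it:

# assumed continuation of the example above
from sklearn.metrics import accuracy_score

classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))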
X_fitted_4 = SelectPercentile(chi2, percentile=50).fit(X, y)
print("SelectPercentile -- chi2")
print(X_fitted_4.scores_)
print(X_fitted_4.pvalues_)
print(X_fitted_4.get_support())
X_transformed_4 = X_fitted_4.transform(X)
print(X_transformed_4.shape)

#SelectFpr --- chi2
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import chi2
X_fitted_5 = SelectFpr(chi2, alpha=2.50017968e-15).fit(X, y)
print("SelectFpr --- chi2")
print(X_fitted_5.scores_)
print(X_fitted_5.pvalues_)
print(X_fitted_5.get_support())
X_transformed_5 = X_fitted_5.transform(X)
print(X_transformed_5.shape)

#SelectFpr --- f_classif
from sklearn.feature_selection import SelectFpr
from sklearn.feature_selection import f_classif
X_fitted_6 = SelectFpr(f_classif, alpha=1.66966919e-31).fit(X, y)
print("SelectFpr --- f_classif")
print(X_fitted_6.scores_)
print(X_fitted_6.pvalues_)
print(X_fitted_6.get_support())
X_transformed_6 = X_fitted_6.transform(X)
print(X_transformed_6.shape)

# SelectFdr and SelectFwe are used the same way as above; only the criterion
# used when deciding which features to keep differs
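As the comment above says, SelectFdr and SelectFwe follow the same call pattern; a short hedged sketch reusing the X and y from the snippet above (the alpha value is an arbitrary illustrative choice):

from sklearn.feature_selection import SelectFdr, SelectFwe

# same interface as SelectFpr, but the p-value threshold is corrected for the
# false discovery rate (SelectFdr) or the family-wise error rate (SelectFwe)
X_fdr = SelectFdr(chi2, alpha=0.05).fit_transform(X, y)
X_fwe = SelectFwe(chi2, alpha=0.05).fit_transform(X, y)
print(X_fdr.shape, X_fwe.shape)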
# import data of all Count and Position features. Training and test sets altogether
dfCountfeatures = pd.read_csv('data/CountingAndPositionFeatures_TrainAndTestData.csv')
dfTrainRaw = pd.read_csv('data/train.csv')

# get only training data  
TrainQueryIDs = dfTrainRaw["id"]
relevance = dfTrainRaw["relevance"]
dfCountfeatures_TrainSet = dfCountfeatures[dfCountfeatures["id"].isin(TrainQueryIDs)]
#select these features which have non-zero variance
selector = VarianceThreshold()
selector.fit_transform(dfCountfeatures_TrainSet).shape  # only one feature with zero variance - shape (74067, 262)

# select feature based on p-values from univariate regression with target feature (relevance)
selector2 = SelectFpr(f_regression, alpha=0.01)
selector2.fit(dfCountfeatures_TrainSet.drop("id", axis=1), relevance)
selector2.get_support(indices=True).size  # left 226 features out of 262 with p-value <= 1%
# get titles of features which were selected
selectedCountfeatures = dfCountfeatures.columns[selector2.get_support(indices=True)]

# check correlation amongst features
corrReduced = dfCountfeatures_TrainSet[selectedCountfeatures].corr()
corrReduced.iloc[:, :] = np.tril(corrReduced.values, k=-1)
corrReduced = corrReduced.stack()
# get pairs of features which are highly correlated
corrReduced[corrReduced.abs() > 0.8].size  # 578 pairs correlated more than 80% out of 25,425
len(set(corrReduced[corrReduced.abs() > 0.8].index.labels[0]))  # 172 features to be removed due to high correlation with other features
# get feature titles which will be used in training the model after removing highly correlated features
indices = set(corrReduced[corrReduced.abs() > 0.8].index.labels[0])
selectedCountfeatures2 = [i for j, i in enumerate(selectedCountfeatures.tolist()) if j not in indices]
selectedCountfeatures2.append("id")
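The correlation-pruning step above relies on the old MultiIndex.labels attribute, which newer pandas renamed to codes; a rough equivalent on current pandas, written as an assumed helper (drop_highly_correlated does not appear in the original), might look like:

import numpy as np
import pandas as pd

def drop_highly_correlated(df, threshold=0.8):
    # keep only the lower triangle so every feature pair is counted once
    corr = df.corr()
    corr.iloc[:, :] = np.tril(corr.values, k=-1)
    pairs = corr.stack()
    high = pairs[pairs.abs() > threshold]
    # the first index level names the feature to drop from each correlated pair
    to_drop = set(high.index.get_level_values(0))
    return [c for c in df.columns if c not in to_drop]

selectedCountfeatures2 = drop_highly_correlated(dfCountfeatures_TrainSet[selectedCountfeatures])
selectedCountfeatures2.append("id")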
        print("Loading tfidf model...")
        model_tfidf = TfidfModel.load(FLAGS.tfidfFile)

    print("Converting to tfidf vectors...")
    comments_tfidf = model_tfidf[comments_corpus]
    comments_vecs = np.vstack(
        [sparse2full(c, len(comments_dictionary)) for c in comments_tfidf])

    chi2_features = None
    if doTrain:
        # Find the most discriminative words for any of the labels
        print("Finding discriminative features...")
        labels = np.array(data['any'])
        model_fpr = SelectFpr(chi2, alpha=0.025)
        model_fpr.fit(comments_vecs, labels)
        chi2_features = model_fpr.get_support(indices=True)
        np.save(FLAGS.chi2File, chi2_features)

    else:
        print("Loading discrimitive features data...")
        chi2_features = np.load(FLAGS.chi2File)

    print("Calculating tfidf weighted word2vec vectors...")
    chi2_tfidf_vecs = comments_vecs[:, chi2_features]
    fpr_embeddings = None
    if doTrain:
        print('Fitting FastText embedding model...')
        ft_model = FastText(sentences=docs, size=300, workers=8)
        fpr_embeddings = [
            ft_model.wv[t]
            for t in [comments_dictionary[i] for i in chi2_features]
        ]
def feature_select(x, y):
    fpr = SelectFpr(f_regression, alpha=0.05)
    samples_selected = fpr.fit_transform(x, y)
    get_index_selected = fpr.get_support(indices=True)
    return samples_selected, get_index_selected
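A quick hedged check of the helper above on synthetic data (the make_regression parameters are illustrative assumptions):

from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFpr, f_regression

X, y = make_regression(n_samples=200, n_features=20, n_informative=5, noise=10, random_state=0)
X_selected, idx_selected = feature_select(X, y)
print(X_selected.shape, idx_selected)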
Example #24
from sklearn.feature_selection import RFE, SelectKBest, chi2, SelectFpr
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np

df = pd.read_csv('Train_CV_Data.csv')
X_train = np.asarray(df.loc[:2000000, 'srcPort':'HTTPM4'])
Y_train = np.asarray(df.loc[:2000000, 'malicious'], dtype=np.int32)
print(np.sum(Y_train == 1))

kBest = SelectKBest(chi2, k=12)
kBest.fit(X_train, Y_train)
mask1 = kBest.get_support(indices=True)

fpr = SelectFpr(chi2, alpha=0.0001)
fpr.fit(X_train, Y_train)
mask2 = fpr.get_support(indices=True)

rf = RandomForestClassifier(n_estimators=50)

rfe = RFE(rf, n_features_to_select=12, step=1)
rfe.fit(X_train, Y_train)
mask3 = rfe.get_support(indices=True)

print('K-Best Feat :', mask1)
print('False Positive based :', mask2)
print('RFE based :', mask3)