def test_select_fwe_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fwe heuristic
    """
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='fwe',
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert np.sum(np.abs(support - gtruth)) < 2

def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ['fdr', 'fpr', 'fwe']:
        X_r2 = GenericUnivariateSelect(f_classif, mode=mode,
                                       param=0.01).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_almost_equal(support, gtruth)

def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
Example #4
def SelectFwe_selector(data, target, sf):
    selector = SelectFwe(score_func=sf)
    data_new = selector.fit_transform(data.values, target.values.ravel())
    outcome = selector.get_support(True)
    new_features = []  # the names of the selected features
    for ind in outcome:
        new_features.append(data.columns.values[ind])
    return pd.DataFrame(data_new, columns=new_features)

def test_verbose_output_for_select_select_fwe():
    expected_output = ("The p-value of column 'B' (1.0000) is above the " +
                       "specified alpha of 0.5000")

    model = SelectFwe(chi2, alpha=0.5)

    output = _capture_verbose_output_for_model(model, use_supervised_df=True)

    assert output == expected_output
Example #6
def selectFwe(args):
    """Uses scikit-learn's SelectFwe to keep the features whose p-values pass a
    family-wise error rate (FWE) threshold.

    Parameters
    ----------

    args[1] : float
        alpha, the highest uncorrected p-value for features to keep.

    args[2] : str
        Name of the score function ('chi2' or 'f_classif'); the score function
        takes two arrays X and y and returns a pair of arrays (scores, pvalues).

    """

    if args[2] == "chi2":
        selector = SelectFwe(chi2, alpha=float(args[1]))
    elif args[2] == "f_classif":
        selector = SelectFwe(f_classif, alpha=float(args[1]))
    else:
        raise ValueError("unknown score function: %s" % args[2])

    return selector
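A hypothetical call, assuming args follows the layout used above (args[1] is alpha, args[2] names the score function) and that X and y already exist:

# Hypothetical usage sketch: build the selector, then fit it on an existing X, y.
selector = selectFwe([None, "0.05", "chi2"])
X_new = selector.fit_transform(X, y)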
Example #7
def happy_pipeline():
    return make_pipeline(
        StackingEstimator(
            estimator=ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=1.0, min_samples_leaf=1,
                                           min_samples_split=3, n_estimators=100)),
        StandardScaler(),
        SelectFwe(score_func=f_classif, alpha=0.026000000000000002),
        StackingEstimator(estimator=KNeighborsClassifier(n_neighbors=2, p=1, weights="distance")),
        XGBClassifier(learning_rate=0.001, max_depth=6, min_child_weight=12, n_estimators=100, nthread=1,
                      subsample=0.9500000000000001)
    )
Example #8
def test_select_fwe_float(self):
    model = SelectFwe()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fwe",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32),
        model, model_onnx, basename="SklearnSelectFwe")
Example #9
def feature_selection(df, tgt, mtd, slct=10):
    '''Feature selection for the target specified by tgt using the method specified by mtd.'''
    target = df[tgt]
    features = df.drop([tgt], axis=1)
    if mtd == 'KBest':
        bestfeatures = SelectKBest(score_func=f_classif, k=slct)
    elif mtd == 'Fdr':
        bestfeatures = SelectFdr(score_func=f_classif, alpha=0.05)
    elif mtd == 'Fwe':
        bestfeatures = SelectFwe(score_func=f_classif, alpha=0.05)
    elif mtd == 'Pct':
        bestfeatures = SelectPercentile(score_func=f_classif, percentile=20)
    else:
        raise ValueError('unknown feature selection method: %s' % mtd)

    fit = bestfeatures.fit(features, target)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(features.columns)
    #dfpvalues = pd.DataFrame(fit.pvalues_) #not providing useful insight
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Features', 'Score']  # name the dataframe columns
    featureScores = featureScores.sort_values(by=['Score'], ascending=False)  # most important features at the top
    select_cols = features.columns.values[fit.get_support()]  # get_support returns a boolean mask of the selected columns
    selectfeatures = pd.DataFrame(fit.transform(features), columns=select_cols)  # dataframe with the selected features only

    return select_cols  # selectfeatures and featureScores are also available here
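A hypothetical call of the 'Fwe' branch, assuming df is an existing pandas DataFrame whose target column is named 'label':

# Hypothetical usage sketch: returns the names of the columns kept by SelectFwe (alpha=0.05).
selected_columns = feature_selection(df, 'label', 'Fwe')
print(selected_columns)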
def test_select_fwe_float(self):
    model = SelectFwe()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, 'select fwe', [('input', FloatTensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32),
        model,
        model_onnx,
        basename="SklearnSelectFwe",
        allow_failure="StrictVersion(onnx.__version__) < StrictVersion('1.2')")
Example #11
def test_select_fwe_int(self):
    model = SelectFwe()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, 'select fwe', [('input', Int64TensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X,
        model,
        model_onnx,
        basename="SklearnSelectFwe",
        allow_failure="StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
Example #12
def select_fwe(args):
    #https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFwe.html

    from sklearn.feature_selection import f_classif, chi2

    if args['alpha'] is None:
        args['alpha'] = 0.05

    if args['score_function'] == 'chi2':
        args['score_function'] = chi2
    elif args['score_function'] == 'f_classif':
        args['score_function'] = f_classif

    return SelectFwe(score_func=args['score_function'], alpha=args['alpha'])
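A hypothetical call, assuming the args dictionary carries the two keys read above and that X and y already exist:

# Hypothetical usage sketch: alpha defaults to 0.05 when None is passed.
selector = select_fwe({'alpha': None, 'score_function': 'f_classif'})
X_new = selector.fit_transform(X, y)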
Example #13
def test_select_fwe_4():
    """Ensure that the TPOT select fwe outputs the same result as sklearn fwe when 0.001 < alpha < 0.05"""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[training_testing_data['group'] == 'training', 'class'].values

    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        selector = SelectFwe(f_classif, alpha=0.042)
        selector.fit(training_features, training_class_vals)
        mask = selector.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(tpot_obj._select_fwe(training_testing_data, 0.042), training_testing_data[mask_cols])
Example #14
def opt_pipe(training_features, testing_features):
    exported_pipeline = make_pipeline(
        SelectFwe(score_func=f_regression, alpha=0.017),
        PCA(iterated_power=5, svd_solver="randomized"),
        MLPRegressor(activation="tanh",
                     alpha=100.0,
                     learning_rate="adaptive",
                     learning_rate_init=0.01,
                     momentum=0.1,
                     solver="lbfgs"))
    return ({
        'train_feat': training_features,
        'test_feat': testing_features,
        'pipe': exported_pipeline
    })
def test_select_fwe_int(self):
    model = SelectFwe()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fwe", [("input", Int64TensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64),
        model,
        model_onnx,
        basename="SklearnSelectFwe",
        allow_failure="StrictVersion(onnx.__version__)"
        " < StrictVersion('1.2') or "
        "StrictVersion(onnxruntime.__version__)"
        " <= StrictVersion('0.2.1')",
    )
Example #16
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='fwe', param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
    assert_less(np.sum(support[5:] == 1), 2)
Example #17
def ptop_2030(X, Y, Test, uid, online=0):
    train_X = X.values
    train_Y = Y.values

    test_X = Test.values

    X_train, X_test, y_train, y_test = train_test_split(train_X,
                                                        train_Y,
                                                        test_size=0.2,
                                                        random_state=1)

    # Score on the training set was:-3.207903288331976
    exported_pipeline = make_pipeline(
        SelectFwe(score_func=f_regression, alpha=0.038),
        StackingEstimator(estimator=LassoLarsCV(normalize=False)),
        StackingEstimator(
            estimator=GradientBoostingRegressor(alpha=0.9,
                                                learning_rate=1.0,
                                                loss="quantile",
                                                max_depth=4,
                                                max_features=0.8,
                                                min_samples_leaf=7,
                                                min_samples_split=17,
                                                n_estimators=100,
                                                subsample=0.1)),
        ExtraTreesRegressor(bootstrap=True,
                            max_features=0.8500000000000001,
                            min_samples_leaf=12,
                            min_samples_split=10,
                            n_estimators=100))

    exported_pipeline.fit(X_train, y_train)
    print("train:--------------")
    predict = exported_pipeline.predict(X_train)
    error(predict, y_train)

    print("test:---------------")
    predict = exported_pipeline.predict(X_test)
    error(predict, y_test)

    # online
    if online == 1:
        exported_pipeline.fit(train_X, train_Y)
        predict = exported_pipeline.predict(test_X)
        save_to_file(predict, uid, "../result/result_12.11_1_ptot2030.csv")
Example #18
def test_no_feature_selected():
    rng = np.random.RandomState(0)

    # Generate random uncorrelated data: a strict univariate test should
    # rejects all the features
    X = rng.rand(40, 10)
    y = rng.randint(0, 4, size=40)
    strict_selectors = [
        SelectFwe(alpha=0.01).fit(X, y),
        SelectFdr(alpha=0.01).fit(X, y),
        SelectFpr(alpha=0.01).fit(X, y),
        SelectPercentile(percentile=0).fit(X, y),
        SelectKBest(k=0).fit(X, y),
    ]
    for selector in strict_selectors:
        assert_array_equal(selector.get_support(), np.zeros(10))
        X_selected = assert_warns_message(
            UserWarning, 'No features were selected', selector.transform, X)
        assert_equal(X_selected.shape, (40, 0))
Example #19
def feature_sel(x,
                y,
                sel_method='estimator',
                k=None,
                estimator=None,
                score_func=chi2):
    """

    :param x:
    :param y:
    :param k:
    :param sel_method: kbest, fdr, fpr, fwe, estimator, rfecv
    :param estimator:
    :param score_func:
    :return:
    """

    if sel_method == 'kbest':
        assert k is not None
        selector = SelectKBest(score_func, k)
    elif sel_method == 'fdr':
        selector = SelectFdr(score_func, alpha=0.05)
    elif sel_method == 'fpr':
        selector = SelectFpr(score_func, alpha=0.05)
    elif sel_method == 'fwe':
        selector = SelectFwe(score_func, alpha=0.05)
    elif sel_method == 'estimator':
        assert estimator is not None
        if k is None:
            selector = SelectFromModel(estimator=estimator)
        else:
            selector = SelectFromModel(estimator=estimator,
                                       max_features=k,
                                       threshold=-np.inf)
    elif sel_method == 'rfecv':
        assert estimator is not None
        selector = RFECV(estimator, step=1, cv=5)
    else:
        raise Exception('unknown input parameters.')

    assert selector is not None
    x_new = selector.fit_transform(x, y)
    return selector.get_support(), x_new, y
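A hypothetical call using the 'fwe' branch; x and y are assumed to be an existing feature matrix and target vector with non-negative entries, as the default chi2 score function requires:

# Hypothetical usage sketch: keep the features whose chi2 p-value passes the FWE threshold.
support, x_new, y = feature_sel(x, y, sel_method='fwe')
print(x_new.shape)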
Example #20
def extract(train, target, score_function):
    global a
    Extract1 = SelectPercentile(score_function, percentile=50).fit(
        train, target).get_support(indices=1)
    Extract2 = SelectKBest(score_function,
                           k=100).fit(train,
                                      target).get_support(indices=1).tolist()
    Extract3 = SelectFpr(score_function,
                         alpha=0.05).fit(train, target).get_support(indices=1)
    Extract4 = SelectFdr(score_function,
                         alpha=0.05).fit(train, target).get_support(indices=1)
    Extract5 = SelectFwe(score_function,
                         alpha=0.05).fit(train, target).get_support(indices=1)
    Extract6 = SelectFromModel(RandomForestClassifier()).fit(
        train, target).get_support(indices=1)
    #a=Extract1 & Extract2 & Extract3 & Extract4 & Extract5 & Extract6
    a = reduce(np.intersect1d,
               (Extract1, Extract2, Extract3, Extract4, Extract5, Extract6))
    #a=Extract1
    return a
def feature_selection_fit(df, all_params, features):
    """

    :param df: датафрейм (уже без пропусков и выбросов и нормализованный)
    :param all_params: словарь all_params
    :param features: все фичи (по сути то же, что и numeric cols)
    :return: готовый датафрейм + список выбранных признаков
    """
    selection_type = all_params['common params']['feature selection method']
    #     print(selection_type)

    if selection_type == 'VarianceThreshold':
        selector = VarianceThreshold(threshold=all_params['feature selection method'][selection_type]['threshold'])
        selector.fit(df[features])

    elif selection_type == 'SelectKBest':
        k = all_params['feature selection method'][selection_type]['k']
        if k > df[features].shape[1]:
            k = df[features].shape[1]
        selector = SelectKBest(k=k)
        selector.fit(df[features], df['target'])

    elif selection_type == 'SelectPercentile':
        selector = SelectPercentile(percentile=all_params['feature selection method'][selection_type]['percentile'])
        selector.fit(df[features], df['target'])

    elif selection_type == 'SelectFpr':
        selector = SelectFpr(alpha=all_params['feature selection method'][selection_type]['alpha'])
        selector.fit(df[features], df['target'])

    elif selection_type == 'SelectFdr':
        selector = SelectFdr(alpha=all_params['feature selection method'][selection_type]['alpha'])
        selector.fit(df[features], df['target'])

    elif selection_type == 'SelectFwe':
        selector = SelectFwe(alpha=all_params['feature selection method'][selection_type]['alpha'])
        selector.fit(df[features], df['target'])

    elif selection_type == 'GenericUnivariateSelect':
        mode = all_params['feature selection method'][selection_type]['mode']
        param = all_params['feature selection method'][selection_type]['param']

        if mode == 'k_best':  # possibly some other mode values will also require an int param
            param = int(param)

        selector = GenericUnivariateSelect(mode=mode, param=param)
        selector.fit(df[features], df['target'])

    elif selection_type == 'RFE':
        lr = LogisticRegression()  # for now keep logistic regression as the estimator (to be decided later)
        selector = RFE(estimator=lr,
                       n_features_to_select=all_params['feature selection method'][selection_type][
                           'n_features_to_select'],
                       step=all_params['feature selection method'][selection_type]['step'])
        selector.fit(df[features], df['target'])

    elif selection_type == 'SelectFromModel':
        lr = LogisticRegression()  # for now keep logistic regression as the estimator (to be decided later)
        selector = SelectFromModel(estimator=lr,
                                   threshold=all_params['feature selection method'][selection_type]['threshold'],
                                   # for now only a choice from a preset list is allowed (not an arbitrary float)
                                   norm_order=all_params['feature selection method'][selection_type]['norm_order'],
                                   max_features=all_params['feature selection method'][selection_type]['max_features'])
        selector.fit(df[features], df['target'])

    with open(os.path.join(f"{MEDIA_ROOT}",  'App/models/feature_selector.pickle'), 'wb+') as f:
        pickle.dump(selector, f)

    new_cols = [col for col, keep in zip(features, selector.get_support()) if keep]
    #     print(new_cols+['id','target'])

    selected = df.loc[:, new_cols + ['id', 'target']]

    return selected, new_cols
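The nested all_params dictionary itself is not shown in this snippet; a minimal sketch of what it might look like for the SelectFwe branch, inferred only from the keys accessed above (df and features are assumed to already exist):

# Hypothetical structure inferred from the keys read in feature_selection_fit.
all_params = {
    'common params': {'feature selection method': 'SelectFwe'},
    'feature selection method': {
        'SelectFwe': {'alpha': 0.05},
    },
}
selected_df, selected_cols = feature_selection_fit(df, all_params, features)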
Example #22
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import ExtraTreesClassifier

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score, f1_score
from tpot_metrics import balanced_accuracy_score
from sklearn.pipeline import make_pipeline
import itertools

dataset = sys.argv[1]

preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(),
                     PolynomialFeatures(), RobustScaler(), StandardScaler(),
                     FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(),
                     SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(),
                     SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
                     RFE(estimator=ExtraTreesClassifier(n_estimators=100))]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42)

features = input_data.drop('class', axis=1).values.astype(float)
labels = input_data['class'].values

with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    for preprocessor in preprocessor_list:
        try:
            # Create the pipeline for the model
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8692307692307694
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.008),
    LinearSVC(C=1.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-2108.720316336635
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_regression, alpha=0.027),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.55, tol=1e-05)),
    ZeroCount(),
    RandomForestRegressor(bootstrap=False,
                          max_features=0.05,
                          min_samples_leaf=1,
                          min_samples_split=3,
                          n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #25
def main(args):
    if args.train_dir is None:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        #args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        #        args.train_dir =  r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir =  r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    feature_cols = [
        col for col in df.columns
        if col not in ['classname', 'Id', 'proteinname']
    ]
    feature_cols = numpy.array(feature_cols)

    X = df[feature_cols].values
    y = df.classname.values

    le = LabelEncoder()
    y = le.fit_transform(y)

    "Initial feature selection trimming"
    print(X.shape)

    Fwe = SelectFwe(alpha=0.01).fit(X, y)
    X = Fwe.transform(X)
    print("F-test -> ", X.shape)
    feature_cols = feature_cols[Fwe.get_support()]
    '''
    FeatSelection_SVM = True
    if FeatSelection_SVM == True:
        svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y)
        X = svc_L1.transform(X, y)
        print ("L1 SVM Transformed X:",X_L1.shape)
        feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
    '''

    k = SelectKBest(k=255).fit(X, y)
    X = k.transform(X)
    feature_cols = feature_cols[k.get_support()]

    param_dist = {
        "max_depth": [6, 9, None],
        "max_features": ['auto', 0.4],
        "min_samples_leaf": [1, 2, 3],
        "bootstrap": [True, False],
        'min_samples_split': [2, 3],
        "criterion": ["gini"],
        "n_estimators": [100],
        "n_jobs": [-1]
    }

    rf = RandomForestClassifierWithCoef(max_depth=7,
                                        min_samples_split=1,
                                        min_samples_leaf=2,
                                        n_estimators=50,
                                        n_jobs=2,
                                        max_features="auto")

    "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class."

    scores = cross_validation.cross_val_score(
        rf,
        X,
        y,
        n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" %
          (scores.mean(), scores.std() * 2))
    "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'."
    scores_f1 = cross_validation.cross_val_score(
        rf,
        X,
        y,
        n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2),
        scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" %
          (scores_f1.mean(), scores_f1.std() * 2))

    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf, step=20, cv=2,
                      scoring='f1')  #average_precision , recall
    X_RFE = rfeSelect.fit_transform(X, y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    RFE_ScoreRatio = 100 * (cross_validation.cross_val_score(
        rf,
        X_RFE,
        y,
        n_jobs=-1,
        cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2),
        scoring='f1').mean()) / scores_f1.mean()
    print(
        "Even with just", X_RFE.shape[1],
        " features, we have %f performance! (f1 score ratio)" %
        (RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
# X = np.array(df.drop(['label'], axis=1))
# y = np.array(df['label'])

# print(len(X[0]))

# selected_feats = np.array(dff[0])
# nor = Normalizer()
# X = nor.fit_transform(X)
# X = X[:,selected_feats]
# print(selected_feats)

obj1 = VarianceThreshold(threshold=(.01))
obj2 = SelectFdr(alpha=.8)
obj3 = SelectFpr()
obj4 = SelectKBest(k=30)
obj5 = SelectFwe()
feature_selection_list = [obj1, obj2, obj3, obj4, obj5]

reg1 = AdaBoostRegressor
reg2 = ExtraTreesRegressor
reg3 = RandomForestRegressor
reg4 = GradientBoostingRegressor
reg_list = [reg1, reg3]

last_error = 9000

est1 = DecisionTreeRegressor
est2 = ExtraTreeRegressor
est_list = [est1, est2]

number = range(5, 100, 10)
Example #27
Y, X = get_x_y(tweets_and_labels)

#splitting training and test set
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=0)

#Chi-Squared Analysis
sel = SelectPercentile(chi2, percentile=80)
sel.fit(x_train, y_train)
x_train = sel.transform(x_train)
x_test = sel.transform(x_test)

#Univariate Feature Selection
fs = SelectFwe(alpha=150.0)
x_train = fs.fit_transform(x_train, y_train)
x_test = fs.transform(x_test)

#Classifier Fitting
clf = svm.LinearSVC(C=10,
                    penalty='l2',
                    loss='l1',
                    dual=True,
                    fit_intercept=False,
                    class_weight='auto')
clf.fit(x_train, y_train)

###############################################
'''Printed Data Analysis'''
###############################################
Example #28
import pandas as pd

dataset = pd.read_csv('regressionDataSet.csv')
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

#feature selector 1
from sklearn.feature_selection import SelectKBest
fs1 = SelectKBest(k=5)
x_new1 = fs1.fit_transform(x, y)

#feature selector 2
from sklearn.feature_selection import SelectFdr
fs2 = SelectFdr()
x_new2 = fs2.fit_transform(x, y)

#feature selector 3
from sklearn.linear_model import LinearRegression
estimator = LinearRegression()
from sklearn.feature_selection import RFE
fs3 = RFE(estimator, n_features_to_select=5)
x_new3 = fs3.fit_transform(x, y)

#feature selector 4
from sklearn.feature_selection import SelectFromModel
fs4 = SelectFromModel(estimator)
x_new4 = fs4.fit_transform(x, y)

#feature selector 5
from sklearn.feature_selection import SelectFwe
fs5 = SelectFwe()
x_new5 = fs5.fit_transform(x, y)
Example #29
dataset = sys.argv[1]

preprocessor_list = [
    Binarizer(),
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PolynomialFeatures(),
    RobustScaler(),
    StandardScaler(),
    FastICA(),
    PCA(),
    RBFSampler(),
    Nystroem(),
    FeatureAgglomeration(),
    SelectFwe(),
    SelectKBest(),
    SelectPercentile(),
    VarianceThreshold(),
    SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
    RFE(estimator=ExtraTreesClassifier(n_estimators=100))
]

# Read the data set into memory
input_data = pd.read_csv(dataset, compression='gzip',
                         sep='\t').sample(frac=1.,
                                          replace=False,
                                          random_state=42)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
Example #30
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'PSL_Won' in the data file
tpot_data = pd.read_csv("data.csv")
features = tpot_data.drop('PSL_Won', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['PSL_Won'], random_state=42)

# Average CV score on the training set was: 0.8671594508975712
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.011), ZeroCount(),
    ExtraTreesClassifier(bootstrap=False,
                         criterion="entropy",
                         max_features=1.0,
                         min_samples_leaf=7,
                         min_samples_split=20,
                         n_estimators=100))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

## Plot
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, mean_squared_error