def test_select_fwe_classif():
    """Check SelectFwe on a simple classification problem.

    The family-wise error (fwe) selector must keep exactly the columns that
    ``GenericUnivariateSelect`` keeps in ``'fwe'`` mode, and its support mask
    may differ from "first five columns selected" in at most one position.
    """
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_fwe = fwe_filter.fit(X, y).transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode='fwe', param=0.01)
    X_generic = generic_filter.fit(X, y).transform(X)
    assert_array_equal(X_fwe, X_generic)

    # Expected mask: only the first five of the twenty columns are kept.
    expected = np.zeros(20)
    expected[:5] = 1
    mismatches = np.sum(np.abs(fwe_filter.get_support() - expected))
    assert mismatches < 2
def test_select_heuristics_classif():
    # On a simple classification problem the fdr, fpr and fwe modes of
    # GenericUnivariateSelect must each keep the same columns as SelectFwe,
    # and SelectFwe itself must keep exactly the first five columns.
    X, y = make_classification(n_samples=200, n_features=20,
                               n_informative=3, n_redundant=2,
                               n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)

    fwe_filter = SelectFwe(f_classif, alpha=0.01)
    X_fwe = fwe_filter.fit(X, y).transform(X)

    expected_support = np.zeros(20)
    expected_support[:5] = 1

    for heuristic in ('fdr', 'fpr', 'fwe'):
        generic = GenericUnivariateSelect(f_classif, mode=heuristic,
                                          param=0.01)
        assert_array_equal(X_fwe, generic.fit(X, y).transform(X))
        assert_array_almost_equal(fwe_filter.get_support(), expected_support)
def test_boundary_case_ch2():
    # Boundary case: every selector below, scored with chi2, is expected to
    # keep only the first of the two features.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])

    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    expected_support = np.array([True, False])
    selectors = (
        SelectFdr(chi2, alpha=0.1),
        SelectKBest(chi2, k=1),
        SelectPercentile(chi2, percentile=50),
        SelectFpr(chi2, alpha=0.1),
        SelectFwe(chi2, alpha=0.1),
    )
    for selector in selectors:
        selector.fit(X, y)
        assert_array_equal(selector.get_support(), expected_support)
def SelectFwe_selector(data, target, sf):
    """Reduce ``data`` to the columns kept by ``SelectFwe(score_func=sf)``.

    The selector is fitted on ``data.values`` against the flattened
    ``target.values``.  Returns a new DataFrame holding the transformed
    values, labelled with the names of the retained columns.
    """
    fwe = SelectFwe(score_func=sf)
    reduced = fwe.fit_transform(data.values, target.values.ravel())

    # get_support(True) yields the integer positions of the kept columns.
    kept_names = [data.columns.values[position]
                  for position in fwe.get_support(True)]
    return pd.DataFrame(reduced, columns=kept_names)
def test_verbose_output_for_select_select_fwe():
    # The captured verbose output for SelectFwe(chi2, alpha=0.5) on the
    # supervised frame must equal the message below exactly.
    model = SelectFwe(chi2, alpha=0.5)
    output = _capture_verbose_output_for_model(model, use_supervised_df=True)
    assert output == ("The p-value of column 'B' (1.0000) is above the "
                      "specified alpha of 0.5000")
def selectFwe(args):
    """Build a scikit-learn ``SelectFwe`` selector (family-wise error rate).

    Parameters
    ----------
    args : sequence
        ``args[1]`` is the alpha threshold (anything ``float()`` accepts) and
        ``args[2]`` is the name of the score function, either ``"chi2"`` or
        ``"f_classif"``.  ``args[0]`` is not read here.

    Returns
    -------
    SelectFwe
        An unfitted selector using the requested score function and alpha.

    Raises
    ------
    ValueError
        If ``args[2]`` names an unsupported score function.  (Previously this
        surfaced as an UnboundLocalError on the ``return`` statement.)
    """
    score_name = args[2]
    if score_name == "chi2":
        score_func = chi2
    elif score_name == "f_classif":
        score_func = f_classif
    else:
        raise ValueError(
            "unsupported score function %r; expected 'chi2' or 'f_classif'"
            % (score_name,))
    return SelectFwe(score_func, alpha=float(args[1]))
def happy_pipeline():
    """Assemble and return the (unfitted) five-step classification pipeline.

    Steps, in order: a stacked ExtraTreesClassifier, StandardScaler, a
    SelectFwe filter scored with ``f_classif``, a stacked
    KNeighborsClassifier, and a final XGBClassifier.
    """
    stacked_trees = StackingEstimator(
        estimator=ExtraTreesClassifier(bootstrap=False,
                                       criterion="entropy",
                                       max_features=1.0,
                                       min_samples_leaf=1,
                                       min_samples_split=3,
                                       n_estimators=100))
    scaler = StandardScaler()
    fwe_filter = SelectFwe(score_func=f_classif, alpha=0.026000000000000002)
    stacked_neighbors = StackingEstimator(
        estimator=KNeighborsClassifier(n_neighbors=2, p=1,
                                       weights="distance"))
    booster = XGBClassifier(learning_rate=0.001,
                            max_depth=6,
                            min_child_weight=12,
                            n_estimators=100,
                            nthread=1,
                            subsample=0.9500000000000001)
    return make_pipeline(stacked_trees, scaler, fwe_filter,
                         stacked_neighbors, booster)
def test_select_fwe_float(self):
    # Fit a default SelectFwe on the breast-cancer data, convert it with a
    # float input of unspecified batch size, and dump data + model.
    selector = SelectFwe()
    features, labels = load_breast_cancer(return_X_y=True)
    selector.fit(features, labels)

    input_spec = [("input", FloatTensorType([None, features.shape[1]]))]
    onnx_model = convert_sklearn(selector, "select fwe", input_spec,
                                 target_opset=TARGET_OPSET)
    self.assertTrue(onnx_model is not None)

    dump_data_and_model(features.astype(np.float32), selector, onnx_model,
                        basename="SklearnSelectFwe")
def feature_selection(df, tgt, mtd, slct=10):
    '''Select features of ``df`` for the target column ``tgt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and every candidate feature column.
    tgt : str
        Name of the target column; all other columns are candidates.
    mtd : str
        Selection method: ``'KBest'`` (SelectKBest, ``k=slct``), ``'Fdr'``
        (SelectFdr, alpha 0.05), ``'Fwe'`` (SelectFwe, alpha 0.05) or
        ``'Pct'`` (SelectPercentile, 20 percent).  All use ``f_classif``.
    slct : int, optional
        Number of features to keep; only used by ``'KBest'``.

    Returns
    -------
    numpy.ndarray
        Names of the selected feature columns.

    Raises
    ------
    ValueError
        If ``mtd`` is not one of the supported method names.  (Previously an
        unknown method failed with UnboundLocalError at the ``return``.)
    '''
    # The four methods differ only in the selector they build; everything
    # after construction is shared.
    if mtd == 'KBest':
        selector = SelectKBest(score_func=f_classif, k=slct)
    elif mtd == 'Fdr':
        selector = SelectFdr(score_func=f_classif, alpha=0.05)
    elif mtd == 'Fwe':
        selector = SelectFwe(score_func=f_classif, alpha=0.05)
    elif mtd == 'Pct':
        selector = SelectPercentile(score_func=f_classif, percentile=20)
    else:
        raise ValueError(
            "unknown feature selection method %r; "
            "expected 'KBest', 'Fdr', 'Fwe' or 'Pct'" % (mtd,))

    target = df[tgt]
    features = df.drop([tgt], axis=1)
    fitted = selector.fit(features, target)

    # get_support() returns a boolean mask over the feature columns; indexing
    # the column names with it yields the names of the selected features.
    return features.columns.values[fitted.get_support()]
def test_select_fwe_float(self):
    # Fit a default SelectFwe on the breast-cancer data, convert it with a
    # single-row float input, and dump data + model.
    selector = SelectFwe()
    features, labels = load_breast_cancer(return_X_y=True)
    selector.fit(features, labels)

    input_spec = [('input', FloatTensorType([1, features.shape[1]]))]
    onnx_model = convert_sklearn(selector, 'select fwe', input_spec)
    self.assertTrue(onnx_model is not None)

    # The failure condition is passed to the helper as a string.
    failure_condition = (
        "StrictVersion(onnx.__version__) < StrictVersion('1.2')")
    dump_data_and_model(features.astype(np.float32), selector, onnx_model,
                        basename="SklearnSelectFwe",
                        allow_failure=failure_condition)
def test_select_fwe_int(self):
    # Fit a default SelectFwe on the breast-cancer data, convert it with a
    # single-row int64 input, and dump data + model.
    selector = SelectFwe()
    features, labels = load_breast_cancer(return_X_y=True)
    selector.fit(features, labels)

    input_spec = [('input', Int64TensorType([1, features.shape[1]]))]
    onnx_model = convert_sklearn(selector, 'select fwe', input_spec)
    self.assertTrue(onnx_model is not None)

    # NOTE(review): the data is passed without an int64 cast although the
    # declared input type is Int64TensorType - confirm this is intended.
    failure_condition = (
        "StrictVersion(onnxruntime.__version__) <= StrictVersion('0.1.4')")
    dump_data_and_model(features, selector, onnx_model,
                        basename="SklearnSelectFwe",
                        allow_failure=failure_condition)
def select_fwe(args):
    """Build a ``SelectFwe`` selector from the ``args`` mapping.

    ``args['alpha']`` defaults to 0.05 when it is None.
    ``args['score_function']`` may be the string ``'chi2'`` or
    ``'f_classif'``, which is replaced by the matching scikit-learn
    function; any other value is passed through unchanged.

    Note: ``args`` is updated in place with the resolved values.

    Reference:
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFwe.html
    """
    from sklearn.feature_selection import f_classif, chi2

    if args['alpha'] is None:
        args['alpha'] = 0.05

    # Resolve a score-function name to the function itself (first match wins).
    for name, scorer in (('chi2', chi2), ('f_classif', f_classif)):
        if args['score_function'] == name:
            args['score_function'] = scorer
            break

    return SelectFwe(score_func=args['score_function'], alpha=args['alpha'])
def test_select_fwe_4():
    """TPOT's select-fwe must match sklearn's SelectFwe for 0.001 < alpha < 0.05."""
    tpot_obj = TPOT()
    non_feature_columns = ['class', 'group', 'guess']

    is_training = training_testing_data['group'] == 'training'
    training_features = training_testing_data.loc[is_training].drop(
        non_feature_columns, axis=1)
    training_class_vals = training_testing_data.loc[is_training, 'class'].values

    # Reference selection computed directly with scikit-learn; UserWarnings
    # raised while fitting are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=UserWarning)
        reference = SelectFwe(f_classif, alpha=0.042)
        reference.fit(training_features, training_class_vals)
        kept_positions = reference.get_support(True)

    kept_columns = training_features.iloc[:, kept_positions].columns
    mask_cols = list(kept_columns) + non_feature_columns

    actual = tpot_obj._select_fwe(training_testing_data, 0.042)
    assert np.array_equal(actual, training_testing_data[mask_cols])
def opt_pipe(training_features, testing_features):
    """Bundle the given feature sets with an unfitted regression pipeline.

    The pipeline chains SelectFwe (``f_regression``), randomized PCA and an
    MLPRegressor.  Returns a dict with keys ``'train_feat'``, ``'test_feat'``
    and ``'pipe'``; the two feature arguments are passed through untouched.
    """
    fwe_filter = SelectFwe(score_func=f_regression, alpha=0.017)
    projection = PCA(iterated_power=5, svd_solver="randomized")
    regressor = MLPRegressor(activation="tanh",
                             alpha=100.0,
                             learning_rate="adaptive",
                             learning_rate_init=0.01,
                             momentum=0.1,
                             solver="lbfgs")
    exported_pipeline = make_pipeline(fwe_filter, projection, regressor)

    bundle = {}
    bundle['train_feat'] = training_features
    bundle['test_feat'] = testing_features
    bundle['pipe'] = exported_pipeline
    return bundle
def test_select_fwe_int(self):
    # Fit a default SelectFwe on the breast-cancer data, convert it with a
    # single-row int64 input, and dump int64-cast data + model.
    selector = SelectFwe()
    features, labels = load_breast_cancer(return_X_y=True)
    selector.fit(features, labels)

    input_spec = [("input", Int64TensorType([1, features.shape[1]]))]
    onnx_model = convert_sklearn(selector, "select fwe", input_spec)
    self.assertTrue(onnx_model is not None)

    # The failure condition is passed to the helper as a string.
    failure_condition = ("StrictVersion(onnx.__version__)"
                         " < StrictVersion('1.2') or "
                         "StrictVersion(onnxruntime.__version__)"
                         " <= StrictVersion('0.2.1')")
    dump_data_and_model(features.astype(np.int64), selector, onnx_model,
                        basename="SklearnSelectFwe",
                        allow_failure=failure_condition)
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode='fwe', param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    # The first five columns must all be selected.  The builtin ``bool`` is
    # used as dtype because the ``np.bool`` alias no longer exists in
    # current NumPy releases.
    assert_array_equal(support[:5], np.ones((5, ), dtype=bool))
    # At most one of the remaining fifteen columns may be selected.
    assert_less(np.sum(support[5:] == 1), 2)
def ptop_2030(X, Y, Test, uid, online=0):
    """Fit the exported regression pipeline, report errors, optionally predict.

    Parameters
    ----------
    X, Y : pandas objects
        Training features and targets; converted to arrays via ``.values``.
    Test : pandas object
        Features to predict for when ``online == 1``.
    uid
        Passed through to ``save_to_file`` together with the predictions.
    online : int, optional
        When 1, the pipeline is refitted on all of ``X``/``Y`` and the
        predictions for ``Test`` are written to the result CSV.

    An 80/20 split (``random_state=1``) of the training data is always
    evaluated first, with ``error`` called on the train and test parts.
    """
    # ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` gives
    # the same array and is available in old and new pandas alike.
    train_X = X.values
    train_Y = Y.values
    test_X = Test.values
    X_train, X_test, y_train, y_test = train_test_split(
        train_X, train_Y, test_size=0.2, random_state=1)

    # Score on the training set was:-3.207903288331976
    exported_pipeline = make_pipeline(
        SelectFwe(score_func=f_regression, alpha=0.038),
        StackingEstimator(estimator=LassoLarsCV(normalize=False)),
        StackingEstimator(
            estimator=GradientBoostingRegressor(alpha=0.9,
                                                learning_rate=1.0,
                                                loss="quantile",
                                                max_depth=4,
                                                max_features=0.8,
                                                min_samples_leaf=7,
                                                min_samples_split=17,
                                                n_estimators=100,
                                                subsample=0.1)),
        ExtraTreesRegressor(bootstrap=True,
                            max_features=0.8500000000000001,
                            min_samples_leaf=12,
                            min_samples_split=10,
                            n_estimators=100))

    exported_pipeline.fit(X_train, y_train)
    print("train:--------------")
    predict = exported_pipeline.predict(X_train)
    error(predict, y_train)
    print("test:---------------")
    predict = exported_pipeline.predict(X_test)
    error(predict, y_test)

    # online: refit on the full training data before predicting for Test
    if online == 1:
        exported_pipeline.fit(train_X, train_Y)
        predict = exported_pipeline.predict(test_X)
        save_to_file(predict, uid, "../result/result_12.11_1_ptot2030.csv")
def test_no_feature_selected():
    rng = np.random.RandomState(0)

    # Random, uncorrelated data: each strict selector below is expected to
    # reject every one of the ten features.
    X = rng.rand(40, 10)
    y = rng.randint(0, 4, size=40)

    unfitted_selectors = (
        SelectFwe(alpha=0.01),
        SelectFdr(alpha=0.01),
        SelectFpr(alpha=0.01),
        SelectPercentile(percentile=0),
        SelectKBest(k=0),
    )
    strict_selectors = [candidate.fit(X, y) for candidate in unfitted_selectors]

    empty_mask = np.zeros(10)
    for selector in strict_selectors:
        assert_array_equal(selector.get_support(), empty_mask)
        # transform() must warn that nothing was selected and return an
        # array with zero columns.
        X_selected = assert_warns_message(
            UserWarning, 'No features were selected', selector.transform, X)
        assert_equal(X_selected.shape, (40, 0))
def feature_sel(x, y, sel_method='estimator', k=None, estimator=None, score_func=chi2):
    """Fit a feature selector on ``(x, y)`` and return the reduced features.

    :param x: feature matrix handed to the selector's ``fit_transform``
    :param y: targets handed to ``fit_transform``; returned unchanged
    :param sel_method: kbest, fdr, fpr, fwe, estimator, rfecv
    :param k: number of features for ``kbest`` (required there); for
        ``estimator`` an optional cap on the number of selected features
    :param estimator: model used by ``estimator`` and ``rfecv`` (required
        for both)
    :param score_func: univariate score function for kbest/fdr/fpr/fwe
    :return: tuple ``(support_mask, x_new, y)``
    :raises AssertionError: when a required ``k`` or ``estimator`` is missing
    :raises Exception: when ``sel_method`` is not one of the known names
    """
    if sel_method == 'kbest':
        assert k is not None, "'kbest' requires k"
        # k is keyword-only in current scikit-learn releases, so it must not
        # be passed positionally.
        selector = SelectKBest(score_func, k=k)
    elif sel_method == 'fdr':
        selector = SelectFdr(score_func, alpha=0.05)
    elif sel_method == 'fpr':
        selector = SelectFpr(score_func, alpha=0.05)
    elif sel_method == 'fwe':
        selector = SelectFwe(score_func, alpha=0.05)
    elif sel_method == 'estimator':
        assert estimator is not None, "'estimator' requires an estimator"
        if k is None:
            selector = SelectFromModel(estimator=estimator)
        else:
            # threshold=-inf disables the importance threshold so that only
            # max_features limits the selection.
            selector = SelectFromModel(estimator=estimator, max_features=k,
                                       threshold=-np.inf)
    elif sel_method == 'rfecv':
        assert estimator is not None, "'rfecv' requires an estimator"
        selector = RFECV(estimator, step=1, cv=5)
    else:
        raise Exception('unknown input parameters.')

    x_new = selector.fit_transform(x, y)
    return selector.get_support(), x_new, y
def extract(train, target, score_function):
    """Return the feature indices kept by all six selectors.

    Five univariate selectors (percentile, k-best, fpr, fdr, fwe) using
    ``score_function`` and one RandomForest-based ``SelectFromModel`` are
    each fitted on ``(train, target)``; the intersection of their selected
    indices is returned and also stored in the module-level global ``a``.
    """
    global a

    def kept_indices(selector):
        # Fit the selector and report which column indices it keeps.
        return selector.fit(train, target).get_support(indices=1)

    index_sets = (
        kept_indices(SelectPercentile(score_function, percentile=50)),
        kept_indices(SelectKBest(score_function, k=100)).tolist(),
        kept_indices(SelectFpr(score_function, alpha=0.05)),
        kept_indices(SelectFdr(score_function, alpha=0.05)),
        kept_indices(SelectFwe(score_function, alpha=0.05)),
        kept_indices(SelectFromModel(RandomForestClassifier())),
    )
    a = reduce(np.intersect1d, index_sets)
    return a
def _build_feature_selector(selection_type, all_params, df, features):
    """Return the unfitted selector configured for ``selection_type``."""

    def setting(name):
        # Looked up lazily so that an unknown selection_type is reported by
        # the ValueError below and not as a KeyError from this lookup.
        return all_params['feature selection method'][selection_type][name]

    if selection_type == 'VarianceThreshold':
        return VarianceThreshold(threshold=setting('threshold'))
    if selection_type == 'SelectKBest':
        # k is capped at the number of candidate columns
        return SelectKBest(k=min(setting('k'), df[features].shape[1]))
    if selection_type == 'SelectPercentile':
        return SelectPercentile(percentile=setting('percentile'))
    if selection_type == 'SelectFpr':
        return SelectFpr(alpha=setting('alpha'))
    if selection_type == 'SelectFdr':
        return SelectFdr(alpha=setting('alpha'))
    if selection_type == 'SelectFwe':
        return SelectFwe(alpha=setting('alpha'))
    if selection_type == 'GenericUnivariateSelect':
        mode = setting('mode')
        param = setting('param')
        if mode == 'k_best':
            # possibly some other mode values will need an int param as well
            param = int(param)
        return GenericUnivariateSelect(mode=mode, param=param)
    if selection_type == 'RFE':
        # for now logistic regression stays the estimator (to be decided later)
        return RFE(estimator=LogisticRegression(),
                   n_features_to_select=setting('n_features_to_select'),
                   step=setting('step'))
    if selection_type == 'SelectFromModel':
        # for now logistic regression stays the estimator (to be decided later)
        return SelectFromModel(
            estimator=LogisticRegression(),
            # for now the threshold can only be chosen from a list (not a float)
            threshold=setting('threshold'),
            norm_order=setting('norm_order'),
            max_features=setting('max_features'))
    raise ValueError(f"unknown feature selection method: {selection_type!r}")


def feature_selection_fit(df, all_params, features):
    """
    Fit the configured feature selector, pickle it, and reduce ``df``.

    :param df: dataframe (already without missing values and outliers, and normalized)
    :param all_params: the all_params dictionary; the method name is read from
        ``all_params['common params']['feature selection method']`` and its
        settings from ``all_params['feature selection method'][<method>]``
    :param features: all features (essentially the same as the numeric cols)
    :return: the resulting dataframe (selected columns plus ``'id'`` and
        ``'target'``) + the list of selected features
    :raises ValueError: if the configured method name is not supported; this
        is raised before the pickle file is opened, so an existing pickle is
        not truncated by a bad configuration
    """
    selection_type = all_params['common params']['feature selection method']
    selector = _build_feature_selector(selection_type, all_params, df, features)

    # VarianceThreshold is fitted without the target column; every other
    # selector is fitted against df['target'].
    if selection_type == 'VarianceThreshold':
        selector.fit(df[features])
    else:
        selector.fit(df[features], df['target'])

    with open(os.path.join(f"{MEDIA_ROOT}", 'App/models/feature_selector.pickle'), 'wb+') as f:
        pickle.dump(selector, f)

    new_cols = [name for name, keep in zip(features, selector.get_support()) if keep]
    selected = df.loc[:, new_cols + ['id', 'target']]
    return selected, new_cols
from sklearn.feature_selection import SelectFromModel, RFE from sklearn.ensemble import ExtraTreesClassifier from sklearn.naive_bayes import GaussianNB from sklearn.model_selection import cross_val_predict from sklearn.metrics import accuracy_score, f1_score from tpot_metrics import balanced_accuracy_score from sklearn.pipeline import make_pipeline import itertools dataset = sys.argv[1] preprocessor_list = [Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(), PolynomialFeatures(), RobustScaler(), StandardScaler(), FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(), SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(), SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)), RFE(estimator=ExtraTreesClassifier(n_estimators=100))] # Read the data set into memory input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42) features = input_data.drop('class', axis=1).values.astype(float) labels = input_data['class'].values with warnings.catch_warnings(): warnings.simplefilter('ignore') for preprocessor in preprocessor_list: try: # Create the pipeline for the model
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC

# NOTE: the data file must contain the class labels in a column named 'target'
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)

# Separate the feature matrix from the label vector, then split both.
features = tpot_data.drop('target', axis=1).values
target = tpot_data['target'].values
split = train_test_split(features, target, random_state=42)
training_features, testing_features, training_target, testing_target = split

# Score on the training set was:0.8692307692307694
feature_filter = SelectFwe(score_func=f_classif, alpha=0.008)
classifier = LinearSVC(C=1.0, dual=False, loss="squared_hinge", penalty="l2", tol=0.1)
exported_pipeline = make_pipeline(feature_filter, classifier)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount

# NOTE: the data file must contain the regression target in a column named 'target'
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)

# Separate the feature matrix from the target vector, then split both.
features = tpot_data.drop('target', axis=1).values
target = tpot_data['target'].values
split = train_test_split(features, target, random_state=42)
training_features, testing_features, training_target, testing_target = split

# Score on the training set was:-2108.720316336635
feature_filter = SelectFwe(score_func=f_regression, alpha=0.027)
stacked_elastic_net = StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.55, tol=1e-05))
zero_count = ZeroCount()
forest = RandomForestRegressor(bootstrap=False, max_features=0.05,
                               min_samples_leaf=1, min_samples_split=3,
                               n_estimators=100)
exported_pipeline = make_pipeline(feature_filter, stacked_elastic_net,
                                  zero_count, forest)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def main(args):
    # Feature-selection experiment driver: loads 'trainingSetFeatures.csv'
    # from args.train_dir, trims the features with SelectFwe and SelectKBest,
    # cross-validates a random forest (accuracy and f1), runs RFECV on top,
    # prints the f1 ratio between the RFECV subset and the trimmed set, and
    # finally calls altPlotFeaturesImportance.
    #
    # args: object with a `train_dir` attribute; when it is None a hard-coded
    # default directory is assigned to it (args is modified in place).
    #
    # NOTE(review): `cross_validation.StratifiedShuffleSplit(y, n_iter=...)`
    # looks like the legacy scikit-learn cross_validation API - confirm the
    # pinned scikit-learn version before touching these calls.
    if args.train_dir is None:
        # Alternative default directories that were used previously:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        # args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        # args.train_dir = r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    # Display / plotting defaults.
    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)

    # The CSV below is read relative to the training directory.
    os.chdir(args.train_dir)
    # Label handed to the plotting helper at the end of the function.
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    # Every column except the class label and the two identifier columns is
    # treated as a feature.
    feature_cols = [ col for col in df.columns if col not in ['classname', 'Id', 'proteinname'] ]
    # Converted to an array so it can be indexed with the boolean support
    # masks returned by the selectors below.
    feature_cols = numpy.array(feature_cols)

    X = df[feature_cols].values
    y = df.classname.values

    # Encode the class names as integer labels.
    le = LabelEncoder()
    y = le.fit_transform(y)

    "Initial feature selection trimming"
    print(X.shape)

    # First trim: SelectFwe with its default score function, alpha 0.01.
    Fwe = SelectFwe(alpha=0.01).fit(X, y)
    X = Fwe.transform(X)
    print("F-test -> ", X.shape)
    feature_cols = feature_cols[Fwe.get_support()]

    # Disabled L1-SVM selection step, kept as an inert string literal.
    ''' FeatSelection_SVM = True if FeatSelection_SVM == True: svc_L1 = LinearSVC(C=50, penalty="l1", dual=False,class_weight='auto').fit(X, y) X = svc_L1.transform(X, y) print ("L1 SVM Transformed X:",X_L1.shape) feature_cols=feature_cols[list(set(np.where(svc_L1.coef_ != 0)[-1]))] '''

    # Second trim: keep the 255 best-scoring of the remaining features.
    k = SelectKBest(k=255).fit(X, y)
    X = k.transform(X)
    feature_cols = feature_cols[k.get_support()]

    # Hyper-parameter grid; not referenced again in this function.
    param_dist = { "max_depth": [6, 9, None], "max_features": ['auto', 0.4], "min_samples_leaf": [1, 2, 3], "bootstrap": [True, False], 'min_samples_split': [2, 3], "criterion": ["gini"], "n_estimators": [100], "n_jobs": [-1] }

    rf = RandomForestClassifierWithCoef(max_depth=7, min_samples_split=1, min_samples_leaf=2, n_estimators=50, n_jobs=2, max_features="auto")

    "WARNING! F1 Score as implemented by Default in binary classification (two classes) gives the score for 1 class."

    # Default-scoring cross-validation on the trimmed feature set.
    scores = cross_validation.cross_val_score( rf, X, y, n_jobs=-1, cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2))
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))

    "Instead of scores_f1, we could also use precision, sensitivity, MCC (if binary), etc'."

    # Same cross-validation, scored with f1; used as the baseline below.
    scores_f1 = cross_validation.cross_val_score( rf, X, y, n_jobs=-1, cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2), scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf, step=20, cv=2, scoring='f1') # alternatives: average_precision, recall
    X_RFE = rfeSelect.fit_transform(X, y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # f1 on the RFECV subset as a percentage of the baseline f1 above.
    RFE_ScoreRatio = 100 * (cross_validation.cross_val_score( rf, X_RFE, y, n_jobs=-1, cv=cross_validation.StratifiedShuffleSplit(y, n_iter=8, test_size=0.2), scoring='f1').mean()) / scores_f1.mean()
    print( "Even with just", X_RFE.shape[1], " features, we have %f performance! (f1 score ratio)" % (RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
# X = np.array(df.drop(['label'], axis=1)) # y = np.array(df['label']) # print(len(X[0])) # selected_feats = np.array(dff[0]) # nor = Normalizer() # X = nor.fit_transform(X) # X = X[:,selected_feats] # print(selected_feats) obj1 = VarianceThreshold(threshold=(.01)) obj2 = SelectFdr(alpha=.8) obj3 = SelectFpr() obj4 = SelectKBest(k=30) obj5 = SelectFwe() feature_selection_list = [obj1, obj2, obj3, obj4, obj5] reg1 = AdaBoostRegressor reg2 = ExtraTreesRegressor reg3 = RandomForestRegressor reg4 = GradientBoostingRegressor reg_list = [reg1, reg3] last_error = 9000 est1 = DecisionTreeRegressor est2 = ExtraTreeRegressor est_list = [est1, est2] number = range(5, 100, 10)
Y, X = get_x_y(tweets_and_labels) #splitting training and test set x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=0) #Chi-Squared Analysis sel = SelectPercentile(chi2, percentile=80) sel.fit(x_train, y_train) x_train = sel.transform(x_train) x_test = sel.transform(x_test) #Univariate Feature Selection fs = SelectFwe(alpha=150.0) x_train = fs.fit_transform(x_train, y_train) x_test = fs.transform(x_test) #Classifier Fitting clf = svm.LinearSVC(C=10, penalty='l2', loss='l1', dual=True, fit_intercept=False, class_weight='auto') clf.fit(x_train, y_train) ############################################### '''Printed Data Analysis''' ###############################################
# Load the data: the first column is the target, the rest are features.
dataset = pd.read_csv('regressionDataSet.csv')
x = dataset.iloc[:, 1:].values
y = dataset.iloc[:, 0].values

# NOTE(review): selectors 1, 2 and 5 rely on their default score function
# while selectors 3 and 4 use LinearRegression - confirm the default is the
# intended scoring for this data set.

#feature selector 1: keep the 5 best-scoring features
from sklearn.feature_selection import SelectKBest
fs1 = SelectKBest(k=5)
x_new1 = fs1.fit_transform(x, y)

#feature selector 2: false discovery rate, default settings
from sklearn.feature_selection import SelectFdr
fs2 = SelectFdr()
x_new2 = fs2.fit_transform(x, y)

#feature selector 3: recursive feature elimination down to 5 features
from sklearn.linear_model import LinearRegression
estimator = LinearRegression()
from sklearn.feature_selection import RFE
# n_features_to_select is keyword-only in current scikit-learn releases, so
# it must not be passed positionally.
fs3 = RFE(estimator, n_features_to_select=5)
x_new3 = fs3.fit_transform(x, y)

#feature selector 4: model-based selection with the same estimator
from sklearn.feature_selection import SelectFromModel
fs4 = SelectFromModel(estimator)
x_new4 = fs4.fit_transform(x, y)

#feature selector 5: family-wise error rate, default settings
from sklearn.feature_selection import SelectFwe
fs5 = SelectFwe()
x_new5 = fs5.fit_transform(x, y)
# Data set path, taken from the first command-line argument.
dataset = sys.argv[1]

# One instance of every preprocessor to evaluate.  All use default settings
# except the two model-based selectors, which wrap a 100-tree
# ExtraTreesClassifier.
preprocessor_list = [
    Binarizer(), MaxAbsScaler(), MinMaxScaler(), Normalizer(),
    PolynomialFeatures(), RobustScaler(), StandardScaler(),
    FastICA(), PCA(), RBFSampler(), Nystroem(), FeatureAgglomeration(),
    SelectFwe(), SelectKBest(), SelectPercentile(), VarianceThreshold(),
    SelectFromModel(estimator=ExtraTreesClassifier(n_estimators=100)),
    RFE(estimator=ExtraTreesClassifier(n_estimators=100))
]

# Read the data set into memory (gzip-compressed, tab-separated) and shuffle
# the rows reproducibly: sample(frac=1.) without replacement and a fixed seed.
input_data = pd.read_csv(dataset, compression='gzip', sep='\t').sample(frac=1., replace=False, random_state=42)

# Silence all warnings inside this block.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFwe, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from tpot.builtins import ZeroCount
from tpot.export_utils import set_param_recursive

# NOTE: the outcome column used below is 'PSL_Won'; the data file must
# contain a column with that name.
tpot_data = pd.read_csv("data.csv")
# Every column except the outcome is used as a feature.
features = tpot_data.drop('PSL_Won', axis=1)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['PSL_Won'], random_state=42)

# Average CV score on the training set was: 0.8671594508975712
exported_pipeline = make_pipeline(
    SelectFwe(score_func=f_classif, alpha=0.011),
    ZeroCount(),
    ExtraTreesClassifier(bootstrap=False, criterion="entropy", max_features=1.0, min_samples_leaf=7, min_samples_split=20, n_estimators=100))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

## Plot
from mlxtend.plotting import plot_confusion_matrix
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, mean_squared_error