def featureFitting(filename, X, y, featureNames, optimalFlag, kbest=20, alpha=0.05, model=None):
    '''
    Gets the K best features (filtered by FDR, then the best ranked by t-test;
    more advanced options can be implemented).
    Saves the data/matrix with the selected features to a new output file,
    "REDUCED_Feat.csv".
    Returns the new feature matrix, the FDR selector, and the K-best selector.
    '''
    FD = SelectFdr(alpha=alpha)
    X = FD.fit_transform(X, y)
    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    # selectK was fit on the FDR-reduced matrix, so its mask must be composed
    # with the FDR mask before indexing the original feature names/columns.
    K_featnames = featureNames[FD.get_support()][selectK_mask]
    print("K_featnames: %s" % K_featnames)
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[K_featnames]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df, FD, selectK
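# A minimal usage sketch for featureFitting above. Everything here is a
# hypothetical stand-in (the csv path, column names, and data are synthetic);
# note that kbest must not exceed the number of features the FDR step keeps.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectFdr, SelectKBest

def demo_featureFitting(tmp_csv='demo_features.csv'):
    X_all, y_all = make_classification(n_samples=100, n_features=30,
                                       n_informative=5, random_state=0)
    names = np.array(['f%d' % i for i in range(X_all.shape[1])])
    # featureFitting expects the csv to hold the same columns as featureNames.
    pd.DataFrame(X_all, columns=names).to_csv(tmp_csv)
    return featureFitting(tmp_csv, X_all, y_all, names,
                          optimalFlag=False, kbest=3, alpha=0.05)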
def single_fdr(alpha, n_informative, random_state):
    X, y = make_regression(n_samples=150, n_features=20,
                           n_informative=n_informative, shuffle=False,
                           random_state=random_state, noise=10)
    with warnings.catch_warnings(record=True):
        # Warnings can be raised when no features are selected
        # (low alpha or very noisy data)
        univariate_filter = SelectFdr(f_regression, alpha=alpha)
        X_r = univariate_filter.fit(X, y).transform(X)
        X_r2 = GenericUnivariateSelect(f_regression, mode='fdr',
                                       param=alpha).fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        num_false_positives = np.sum(support[n_informative:] == 1)
        num_true_positives = np.sum(support[:n_informative] == 1)
        if num_false_positives == 0:
            return 0.
        false_discovery_rate = (num_false_positives /
                                (num_true_positives + num_false_positives))
        return false_discovery_rate
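# Sketch of how single_fdr is typically aggregated (assuming numpy as np):
# average the empirical FDR over many random seeds and check it stays below
# alpha, which the Benjamini-Hochberg procedure guarantees in expectation,
# i.e. FDR = E[FP / (TP + FP)] <= alpha. The parameter values are illustrative.
def check_fdr_is_controlled(alpha=0.05, n_informative=5, n_seeds=100):
    empirical_fdr = np.mean([single_fdr(alpha, n_informative, seed)
                             for seed in range(n_seeds)])
    assert alpha >= empirical_fdr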
class f_regressionFDRPrim(primitive):
    def __init__(self, random_state=0):
        super(f_regressionFDRPrim, self).__init__(name='f_regressionFDR')
        self.id = 34
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = ("Filter: Select the p-values for an estimated false "
                            "discovery rate with F-value between label/feature for "
                            "regression tasks. This uses the Benjamini-Hochberg "
                            "procedure. alpha is an upper bound on the expected "
                            "false discovery rate.")
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'c_r'

    def can_accept(self, data):
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFdr(f_regression)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        mask = self.selector.get_support(indices=False)
        final_cols = list(compress(cols, mask))
        output['X'] = pd.DataFrame(self.selector.transform(output['X']),
                                   columns=final_cols)
        final_output = {0: output}
        return final_output
class UnivariateSelectChiFDRPrim(primitive):
    def __init__(self, random_state=0):
        super(UnivariateSelectChiFDRPrim, self).__init__(name='UnivariateSelectChiFDR')
        self.id = 31
        self.PCA_LAPACK_Prim = []
        self.type = 'feature selection'
        self.description = ("Filter: Select the p-values for an estimated false "
                            "discovery rate with Chi-square. This uses the "
                            "Benjamini-Hochberg procedure. alpha is an upper bound "
                            "on the expected false discovery rate.")
        self.hyperparams_run = {'default': True}
        self.selector = None
        self.accept_type = 'd'

    def can_accept(self, data):
        return self.can_accept_d(data, 'Classification')

    def is_needed(self, data):
        if data['X'].shape[1] < 3:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        self.selector = SelectFdr(chi2, alpha=0.05)
        self.selector.fit(data['X'], data['Y'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        try:
            mask = self.selector.get_support(indices=False)
            final_cols = list(compress(cols, mask))
            output['X'] = pd.DataFrame(self.selector.transform(output['X']),
                                       columns=final_cols)
        except Exception as e:
            print(e)
        final_output = {0: output}
        return final_output
def test_boundary_case_ch2():
    # Test boundary case, and always aim to select 1 feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4., 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))
def test_select_fdr_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the fdr heuristic
    """
    X, y = make_classification(n_samples=200, n_features=20, n_informative=3,
                               n_redundant=2, n_repeated=0, n_classes=8,
                               n_clusters_per_class=1, flip_y=0.0,
                               class_sep=10, shuffle=False, random_state=0)
    univariate_filter = SelectFdr(f_classif, alpha=0.0001)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='fdr',
                                   param=0.0001).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def gene_univariate_feature_selection(self, alpha=0.01):
    gene_normal_X, gene_normal_Y = self.make_dataset(
        dataset='gene', normal_tumor='normal', normal_matched=True,
        mirna_gene_matched=True)
    gene_tumor_X, gene_tumor_Y = self.make_dataset(
        dataset='gene', normal_tumor='tumor', normal_matched=True,
        mirna_gene_matched=True)
    gene_exp_filter = SelectFdr(f_classif, alpha=alpha)
    gen_exp_new = gene_exp_filter.fit_transform(
        X=pandas.concat([gene_normal_X, gene_tumor_X]),
        y=pandas.concat([gene_normal_Y, gene_tumor_Y]))
    self.gene_symbols = np.asanyarray(self.gene_symbols)[
        gene_exp_filter.get_support(indices=True)].tolist()
    self.gene_tumor = self.gene_tumor[
        self.gene_symbols +
        ['patient_barcode', 'pathologic_stage', 'histological_type']]
    self.gene_normal = self.gene_normal[
        self.gene_symbols +
        ['patient_barcode', 'pathologic_stage', 'histological_type']]
def test_pipeline():
    pipeline = dl.Pipeline([("scale", StandardScaler()),
                            ("fdr", SelectFdr()),
                            ("svm", LinearSVC())])
    pipeline = pipeline.fit(X, y)
    y2 = pipeline.predict(X)
    score = pipeline.score(X, y)

    assert isinstance(y2, di.Value)
    assert isinstance(score, di.Value)
    assert isinstance(score.compute(), float)
    assert pipeline.score(X, y).key == pipeline.score(X, y).key
    assert score.compute() == score.compute()

    y22 = y2.compute()
    assert y22.shape == y.shape
    assert y22.dtype == y.dtype

    skpipeline = sklearn.pipeline.Pipeline([("scale", StandardScaler()),
                                            ("fdr", SelectFdr()),
                                            ("svm", LinearSVC())])
    skpipeline.fit(X, y)
    sk_y2 = skpipeline.predict(X)
    sk_score = skpipeline.score(X, y)
    assert sk_score == score.compute()
def SelectFdr_selector(data, target, sf):
    selector = SelectFdr(score_func=sf)
    data_new = selector.fit_transform(data.values, target.values.ravel())
    outcome = selector.get_support(True)
    new_features = []  # the names of the features kept by the FDR test
    for ind in outcome:
        new_features.append(data.columns.values[ind])
    return pd.DataFrame(data_new, columns=new_features)
def feature_SelectFdr(x_data, y_data):
    bestfeatures = SelectFdr(f_classif, alpha=0.01)
    fit = bestfeatures.fit(x_data, y_data)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(x_data.columns)
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Specs', 'Score']  # naming the dataframe columns
    top_20_features = featureScores.nlargest(20, 'Score')
    return top_20_features
def feature_select(labels, features, alfa=0.4):
    dct = DecisionTreeClassifier(random_state=42)
    # Modern StratifiedKFold API; older sklearn used
    # StratifiedKFold(labels, n_folds=6, ...).
    cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)
    rfecv1 = RFECV(estimator=dct, step=1, cv=cv, scoring='recall')
    rfecv2 = RFECV(estimator=dct, step=1, cv=cv, scoring='precision')
    rfecv1.fit(features, labels)
    rfecv2.fit(features, labels)
    print("Optimal number of features - Recall : %d" % rfecv1.n_features_)
    print("Optimal number of features - Precision : %d" % rfecv2.n_features_)

    BestFeatures = SelectFdr(score_func=f_classif, alpha=alfa)
    BestFeatures.fit_transform(features, labels)
    # BestFeatures = SelectKBest(score_func=f_classif, k=numbest)
    # BestFeatures.fit_transform(features, labels)
    feature_scores = BestFeatures.scores_
    feature_pvalues = BestFeatures.pvalues_
    best_feat_indices = BestFeatures.get_support(indices=True)
    # features_list is assumed to be a module-level list of feature names
    # with the label name at index 0 (hence the +1 offset).
    best_list = []
    for i in range(len(best_feat_indices)):
        best_list.append(features_list[best_feat_indices[i] + 1])
    print('Best features:', best_list)
    for feat_ctr, index in enumerate(best_feat_indices):
        print(best_list[feat_ctr], 'Score:', feature_scores[index],
              'P-value:', feature_pvalues[index])

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Recall & Precision")
    plt.plot(range(1, len(rfecv1.grid_scores_) + 1), rfecv1.grid_scores_,
             label='Recall', color='blue')
    plt.plot(range(1, len(rfecv2.grid_scores_) + 1), rfecv2.grid_scores_,
             label='Precision', color='green')
    plt.legend()
    plt.show()
def test_select_fdr_int(self):
    model = SelectFdr()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fdr",
        [("input", Int64TensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET)
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64), model, model_onnx,
        basename="SklearnSelectFdr")
def select_fdr(input_data, feature_names=None, score_func=f_classif, alpha=0.05):
    if score_func == f_classif:
        input_data, feature_names, _ = remove_constant(input_data, feature_names)

    x_train = input_data[0]
    y_train = input_data[1]
    x_test = input_data[2]
    y_test = input_data[3]

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    done = False
    increment = alpha
    while not done:
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
        temp_x_train = feature_selector.fit_transform(x_train, y_train)
        temp_x_test = feature_selector.transform(x_test)
        if temp_x_train.shape[1] > 1 and temp_x_test.shape[1] > 1:
            done = True
            x_train = temp_x_train
            x_test = temp_x_test
        else:
            msg = 'Feature selection was too aggressive, '
            msg += 'increasing alpha from {} to {}'.format(alpha, alpha + increment)
            alpha += increment
            logging.warning(msg)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]
    logging.info('Selected {} features'.format(x_train.shape[1]))
    final_args = {'score_func': score_func, 'alpha': alpha}
    return output_data, feature_names, final_args
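# Hedged usage sketch for the adaptive select_fdr above: with chi2 the
# remove_constant/flatten helpers (external to this snippet) are never hit,
# so a plain 2-D, non-negative dataset exercises it end to end. The split
# sizes and parameters are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

def demo_adaptive_select_fdr():
    X, y = make_classification(n_samples=200, n_features=25, random_state=0)
    X = np.abs(X)  # chi2 requires non-negative feature values
    x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
    (x_tr, y_tr, x_te, y_te), names, args = select_fdr(
        (x_tr, y_tr, x_te, y_te), score_func=chi2, alpha=0.05)
    return args['alpha']  # the alpha the retry loop settled on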
def SelectComorbidTraits(self, FDR, modifyDataset=False, useChi2=True):
    """
    Selects features (symptoms) correlated with some dichotomous variable
    (disease diagnosis), hence co-morbid. This dichotomous variable is
    automatically inferred from ClinicalDatasetSampler, as it is whatever
    the sampler is conditioned on.

    Parameters
    ----------
    FDR : float
        False discovery rate cutoff for feature selection.
    modifyDataset : bool
        If True, then features that fail to be selected are dropped from
        the dataset.
    useChi2 : bool
        By default, uses a chi-squared test to estimate co-morbidity between
        the conditioning variable and the features. If False, Fisher's exact
        test is used instead.

    Returns
    -------
    tuple of arrays
        (Index of selected features, Feature Scores, Feature P-values)
    """
    assert self.sampler.isConditioned == True, "Cannot perform feature selection without being conditioned on some disease of interest"
    previousArrayType = self.sampler.returnArrays
    if self.sampler.returnArrays != 'Sparse':
        self.sampler.ChangeArrayType('Sparse')
    sparseTrainingData = self.sampler.ReturnFullTrainingDataset(randomize=False)
    dataMatrix = sparseTrainingData[0]
    incidenceVec = sparseTrainingData[2]
    if useChi2 == False:
        fdr = SelectFdr(fisher_exact, alpha=FDR)
    else:
        fdr = SelectFdr(chi2, alpha=FDR)
    fdr_fit = fdr.fit(dataMatrix, incidenceVec.toarray())
    discIndx = np.where(fdr_fit.get_support() == True)[0]
    if modifyDataset:
        self.sampler.currentClinicalDataset.IncludeOnly(
            [self.sampler.currentClinicalDataset.dataIndexToDxCodeMap[x]
             for x in discIndx])
    if previousArrayType != 'Sparse':
        self.sampler.ChangeArrayType(previousArrayType)
    return discIndx, fdr_fit.scores_[discIndx], fdr_fit.pvalues_[discIndx]
def select_fdr(df, target_col):
    y = df[target_col]
    X = df.drop(target_col, axis=1)
    selector = SelectFdr(chi2, alpha=0.01).fit(X, y)
    true_list = list(selector.get_support())
    index = [i for i in range(len(true_list)) if true_list[i] == True]
    if len(index) == 0:
        print('No features were selected: either the data is too noisy '
              'or the selection test too strict.')
        return df
    else:
        saved_columns = [list(X.columns)[i] for i in index]
        result = pd.DataFrame(selector.transform(X), columns=saved_columns)
        result[target_col] = y
        return result
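# Hedged usage sketch for select_fdr(df, target_col) above. The column names
# and counts are made up; chi2 requires non-negative features, so the demo
# uses small random integer counts.
import numpy as np
import pandas as pd

def demo_select_fdr_df():
    rng = np.random.RandomState(0)
    df = pd.DataFrame(rng.randint(0, 10, size=(100, 5)),
                      columns=['f0', 'f1', 'f2', 'f3', 'f4'])
    df['label'] = rng.randint(0, 2, size=100)
    # Returns df unchanged if nothing passes the FDR test at alpha=0.01,
    # which is likely for pure noise like this.
    return select_fdr(df, 'label')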
def build_trained_model(training_data, classifier='svc'):
    alpha = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    ridge_params = {'alpha': alpha}

    c_s = [0.01, 0.1, 1.0, 10.0, 100.0]
    gamma = [1e-4, 1e-3, 1e-2, 1e-1, 1, 10]
    svc_params = [{'kernel': ['rbf'], 'gamma': gamma, 'C': c_s},
                  {'kernel': ['linear'], 'C': c_s}]

    if classifier == 'svc':
        clf = GridSearchCV(SVC(probability=True), svc_params, cv=5)
        # clf = GridSearchCV(SVC(probability=True, class_weight='balanced'), svc_params, cv=5)
    elif classifier == 'ridge':
        clf = GridSearchCV(RidgeClassifier(), ridge_params, cv=5)
    else:
        raise NotImplementedError(
            "Only 'svc' (default) and 'ridge' classifiers are supported")

    pipe = Pipeline([('standard_scalar', StandardScaler()),
                     ('feature_selection', SelectFdr()),
                     ('classification', clf)])
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # .iloc replaces the long-removed DataFrame.ix indexer.
        pipe.fit(training_data.iloc[:, :-3],
                 training_data.iloc[:, -3].astype('int'))
    return pipe
def test_select_fdr_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the fdr heuristic
    """
    X, y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)
    univariate_filter = SelectFdr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='fdr',
                                   param=0.01).fit(X, y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def test_select_fdr_float(self):
    model = SelectFdr()
    X, y = load_breast_cancer(return_X_y=True)
    model.fit(X, y)
    model_onnx = convert_sklearn(
        model, "select fdr",
        [("input", FloatTensorType([1, X.shape[1]]))])
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32), model, model_onnx,
        basename="SklearnSelectFdr",
        allow_failure="StrictVersion(onnx.__version__)"
                      " < StrictVersion('1.2') or "
                      "StrictVersion(onnxruntime.__version__)"
                      " <= StrictVersion('0.2.1')")
def SelectComorbidTraits_ContinuousFeature(self, featureVector, FDR,
                                           modifyDataset=False, use_ttest=False):
    """
    Selects features correlated with some continuous variable.

    Parameters
    ----------
    featureVector : [float]
        Vector of floating-point values for feature selection. Must be sorted
        in the same order as the index of the ClinicalDatasetSampler training
        dataset.
    FDR : float
        False discovery rate cutoff for feature selection.
    modifyDataset : bool
        If True, then features that fail to be selected are dropped from the
        dataset.
    use_ttest : bool
        By default, uses an F-test to estimate the correlation between
        featureVector and the features. If True, a t-test is used instead.

    Returns
    -------
    tuple of arrays
        (Index of selected features, Feature Scores, Feature P-values)
    """
    previousArrayType = self.sampler.returnArrays
    if self.sampler.returnArrays != 'Sparse':
        self.sampler.ChangeArrayType('Sparse')
    sparseTrainingData = self.sampler.ReturnFullTrainingDataset(randomize=False)
    dataMatrix = sparseTrainingData[0]
    if use_ttest:
        fdr = SelectFdr(T_test, alpha=FDR)
    else:
        fdr = SelectFdr(f_regression, alpha=FDR)
    fdr_fit = fdr.fit(dataMatrix, featureVector.ravel())
    discIndx = np.where(fdr_fit.get_support() == True)[0]
    if modifyDataset:
        self.sampler.currentClinicalDataset.IncludeOnly(
            [self.sampler.currentClinicalDataset.dataIndexToDxCodeMap[x]
             for x in discIndx])
    if previousArrayType != 'Sparse':
        self.sampler.ChangeArrayType(previousArrayType)
    return discIndx, fdr_fit.scores_[discIndx], fdr_fit.pvalues_[discIndx]
def test_verbose_output_for_select_fdr():
    expected_output = ("The p-value of column 'B' (1.0000) is above the "
                       "specified alpha of 0.5000")
    model = SelectFdr(chi2, alpha=0.5)
    output = _capture_verbose_output_for_model(model, use_supervised_df=True)
    assert output == expected_output
def selectFdr(args):
    """Uses scikit-learn's SelectFdr to select features by p-value for an
    estimated false discovery rate.

    Parameters (passed positionally inside ``args``)
    ------------------------------------------------
    args[1] : float
        alpha, the highest uncorrected p-value for features to keep.
    args[2] : str
        Name of the score function ('chi2' or 'f_classif'), i.e. a callable
        taking two arrays X and y and returning (scores, pvalues).
    """
    if args[2] == "chi2":
        selector = SelectFdr(chi2, alpha=float(args[1]))
    elif args[2] == "f_classif":
        selector = SelectFdr(f_classif, alpha=float(args[1]))
    return selector
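# Hedged usage sketch of the positional-args convention selectFdr expects:
# args[1] is alpha (as a string) and args[2] names the score function;
# args[0] is presumably the method name itself, though the snippet never
# reads it.
selector = selectFdr(["selectFdr", "0.05", "chi2"])
# selector can then be fit like any sklearn transformer, e.g.:
# X_reduced = selector.fit_transform(X_nonnegative, y)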
def build_trained_model(training_data):
    pipe = Pipeline([('scaler', StandardScaler()),
                     ('feature_selection', SelectFdr()),
                     ('classification', SVC(probability=True))])
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # .iloc replaces the long-removed DataFrame.ix indexer.
        pipe.fit(training_data.iloc[:, :-3],
                 training_data.iloc[:, -3].astype('int'))
    return pipe
def select_fdr(args):
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFdr.html
    from sklearn.feature_selection import f_classif, chi2
    if args['alpha'] is None:
        args['alpha'] = 0.05
    if args['score_function'] == 'chi2':
        args['score_function'] = chi2
    elif args['score_function'] == 'f_classif':
        args['score_function'] = f_classif
    return SelectFdr(score_func=args['score_function'], alpha=args['alpha'])
def test_pipeline_shares_structure():
    pipeline = dl.Pipeline([("scale", StandardScaler()),
                            ("fdr", SelectFdr()),
                            ("svm", LinearSVC())])
    pipeline1 = pipeline.fit(X, y)
    score1 = pipeline1.score(X, y)

    pipeline2 = pipeline.set_params(svm__C=0.1)
    pipeline2 = pipeline2.fit(X, y)
    score2 = pipeline2.score(X, y)

    assert (len(merge(score1.dask, score2.dask)) <=
            (len(score1.dask) + len(score2.dask)) * 0.75)
    assert score1.key != score2.key
def feature_selection(df, tgt, mtd, slct=10):
    '''Select features for the target column `tgt` using the method `mtd`.'''
    target = df[tgt]
    features = df.drop([tgt], axis=1)
    if mtd == 'KBest':
        bestfeatures = SelectKBest(score_func=f_classif, k=slct)
    elif mtd == 'Fdr':
        bestfeatures = SelectFdr(score_func=f_classif, alpha=0.05)
    elif mtd == 'Fwe':
        bestfeatures = SelectFwe(score_func=f_classif, alpha=0.05)
    elif mtd == 'Pct':
        bestfeatures = SelectPercentile(score_func=f_classif, percentile=20)
    fit = bestfeatures.fit(features, target)
    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(features.columns)
    # dfpvalues = pd.DataFrame(fit.pvalues_)  # not providing useful insight
    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Features', 'Score']  # naming the dataframe columns
    # Sort to see the most important features at the top.
    featureScores = featureScores.sort_values(by=['Score'], ascending=False)
    # get_support returns a boolean mask indicating which columns were selected.
    select_cols = features.columns.values[fit.get_support()]
    # DataFrame with the selected features only.
    selectfeatures = pd.DataFrame(fit.transform(features), columns=select_cols)
    return select_cols  # alternatives: selectfeatures, featureScores
def get_fsmethod(fsmethod, n_feats, n_subjs, n_jobs=1):
    if fsmethod == 'stats':
        return 'stats', None

    # Feature selection procedures. Note: several constructors below use
    # legacy scikit-learn APIs (RFECV(loss_func=...), compute_importances,
    # RandomizedPCA, LDA) that were renamed or removed in later releases.
    # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
    fsmethods = {
        'rfe': RFE(estimator=SVC(kernel="linear"), step=0.05,
                   n_features_to_select=2),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        'rfecv': RFECV(estimator=SVC(kernel="linear"), step=0.05,
                       loss_func=zero_one),  # cv=3 default; cv=StratifiedKFold(n_subjs, 3)
        # Univariate feature selection:
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html
        'univariate': SelectPercentile(f_classif, percentile=5),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFpr.html
        'fpr': SelectFpr(f_classif, alpha=0.05),
        # http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFdr.html
        'fdr': SelectFdr(f_classif, alpha=0.05),
        # http://scikit-learn.org/stable/modules/feature_selection.html
        'extratrees': ExtraTreesClassifier(n_estimators=50, max_features='auto',
                                           compute_importances=True,
                                           n_jobs=n_jobs, random_state=0),
        'pca': PCA(n_components='mle'),
        'rpca': RandomizedPCA(random_state=0),
        'lda': LDA(),
    }

    # Feature selection parameter values for grid search.
    max_feats = ['auto']
    if n_feats < 10:
        feats_to_sel = range(2, n_feats, 2)
        n_comps = range(1, n_feats, 2)
    else:
        feats_to_sel = range(2, 20, 4)
        n_comps = range(1, 30, 4)
    max_feats.extend(feats_to_sel)

    n_comps_pca = list(n_comps)
    n_comps_pca.extend(['mle'])

    fsgrid = {
        'rfe': dict(estimator_params=[dict(C=0.1), dict(C=1), dict(C=10)],
                    n_features_to_select=feats_to_sel),
        'rfecv': dict(estimator_params=[dict(C=0.1), dict(C=1), dict(C=10)]),
        'univariate': dict(percentile=[1, 3, 5, 10]),
        'fpr': dict(alpha=[1, 3, 5, 10]),
        'fdr': dict(alpha=[1, 3, 5, 10]),
        'extratrees': dict(n_estimators=[1, 3, 5, 10, 30, 50],
                           max_features=max_feats),
        'pca': dict(n_components=n_comps_pca, whiten=[True, False]),
        'rpca': dict(n_components=n_comps, iterated_power=[3, 4, 5],
                     whiten=[True, False]),
        'lda': dict(n_components=n_comps),
    }

    return fsmethods[fsmethod], fsgrid[fsmethod]
def select_features(data, features, target, feature_selector='SelectKBest',
                    k=10, alpha=0.05, score_func='f_classif'):
    X = data[features]
    y = data[target]

    if score_func == 'f_classif':
        score_func = f_classif
    elif score_func == 'f_regression':
        score_func = f_regression
    elif score_func == 'chi2':
        score_func = chi2
    elif score_func == 'mutual_info_classif':
        score_func = mutual_info_classif
    elif score_func == 'mutual_info_regression':
        score_func = mutual_info_regression
    else:
        raise Exception('Undefined score_func')

    if feature_selector == 'SelectKBest':
        feature_selector = SelectKBest(score_func=score_func, k=k)
    elif feature_selector == 'SelectFpr':
        feature_selector = SelectFpr(score_func=score_func, alpha=alpha)
    elif feature_selector == 'SelectFdr':
        feature_selector = SelectFdr(score_func=score_func, alpha=alpha)
    else:
        raise Exception('Undefined feature_selector')

    feature_selector.fit_transform(X, y)
    feature_index = list(feature_selector.get_support(indices=True))
    best_features = [features[i] for i in feature_index]
    print('Best features selected are: ' + str(best_features))
    return best_features
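# Hedged usage sketch for select_features above; the DataFrame, column names,
# and parameter choices are all illustrative.
import numpy as np
import pandas as pd

def demo_select_features():
    rng = np.random.RandomState(0)
    cols = ['a', 'b', 'c', 'd']
    data = pd.DataFrame(rng.rand(50, 4), columns=cols)
    # Label correlated with 'a' and 'b' so f_classif has something to find.
    data['y'] = (data['a'] + data['b'] > 1).astype(int)
    return select_features(data, cols, 'y',
                           feature_selector='SelectFdr',
                           alpha=0.05, score_func='f_classif')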
def test_no_feature_selected():
    rng = np.random.RandomState(0)
    # Generate random uncorrelated data: a strict univariate test should
    # reject all the features
    X = rng.rand(40, 10)
    y = rng.randint(0, 4, size=40)

    strict_selectors = [
        SelectFwe(alpha=0.01).fit(X, y),
        SelectFdr(alpha=0.01).fit(X, y),
        SelectFpr(alpha=0.01).fit(X, y),
        SelectPercentile(percentile=0).fit(X, y),
        SelectKBest(k=0).fit(X, y),
    ]
    for selector in strict_selectors:
        assert_array_equal(selector.get_support(), np.zeros(10))
        X_selected = assert_warns_message(
            UserWarning, 'No features were selected', selector.transform, X)
        assert_equal(X_selected.shape, (40, 0))
def feature_sel(x, y, sel_method='estimator', k=None, estimator=None, score_func=chi2):
    """
    :param x: feature matrix
    :param y: target vector
    :param sel_method: kbest, fdr, fpr, fwe, estimator, rfecv
    :param k: number of features to keep (for kbest, or as a cap for estimator)
    :param estimator: model required by the 'estimator' and 'rfecv' methods
    :param score_func: univariate scoring function (default chi2)
    :return: (support mask, transformed x, y)
    """
    if sel_method == 'kbest':
        assert k is not None
        selector = SelectKBest(score_func, k)
    elif sel_method == 'fdr':
        selector = SelectFdr(score_func, alpha=0.05)
    elif sel_method == 'fpr':
        selector = SelectFpr(score_func, alpha=0.05)
    elif sel_method == 'fwe':
        selector = SelectFwe(score_func, alpha=0.05)
    elif sel_method == 'estimator':
        assert estimator is not None
        if k is None:
            selector = SelectFromModel(estimator=estimator)
        else:
            selector = SelectFromModel(estimator=estimator, max_features=k,
                                       threshold=-np.inf)
    elif sel_method == 'rfecv':
        assert estimator is not None
        selector = RFECV(estimator, step=1, cv=5)
    else:
        raise Exception('unknown input parameters.')
    assert selector is not None
    x_new = selector.fit_transform(x, y)
    return selector.get_support(), x_new, y
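# Hedged usage sketch for feature_sel above; dataset and parameter values are
# illustrative. chi2 (the default score_func) needs non-negative features, so
# the 'fdr' call below uses absolute values.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

def demo_feature_sel():
    X, y = make_classification(n_samples=120, n_features=15, random_state=0)
    support, X_fdr, _ = feature_sel(np.abs(X), y, sel_method='fdr')
    support2, X_est, _ = feature_sel(X, y, sel_method='estimator',
                                     estimator=LogisticRegression(max_iter=1000))
    return support.sum(), support2.sum()  # number of features each method kept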
def svm_cv(data, data_target):
    # sklearn.cross_validation was removed; train_test_split now lives in
    # sklearn.model_selection.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(data, data_target)
    print("*" * 79)
    print("Training...")
    # selector = SelectFdr(chi2)
    selector = SelectFdr(f_classif)
    selector.fit(X_train, y_train)

    clf = svm.SVC(kernel='linear', probability=True)
    clf.fit(selector.transform(X_train), y_train)

    print("Testing...")
    pred = clf.predict(selector.transform(X_test))
    # predict_proba belongs to the classifier, not the prediction array,
    # and 'transfrom' was a typo for transform.
    probs = clf.predict_proba(selector.transform(X_test))

    accuracy_score = metrics.accuracy_score(y_test, pred)
    classification_report = metrics.classification_report(y_test, pred)
    support = selector.get_support()
    print(support)
    print(accuracy_score)
    print(classification_report)

    precision, recall, thresholds = precision_recall_curve(y_test, probs[:, 1])
Kcv = 4  # Number of stratified folds for cross validation. More = slower, more accurate.
fileName = r'\trainingSetFeatures.csv'
# filePath = r'E:\Dropbox\Dropbox\BioInformatics Lab\AA_Information\CODE\Feature_Extract\test_seq\Chap'
filePath = str(input('Input directory containing TrainingData csv '))
## features, labels, lb_encoder, featureNames = load_data(filename, 'file')
features, labels, lb_encoder, featureNames = load_data(filePath + fileName, 'file')
X, y = features, labels

print('len(set(y))', len(set(y)))
print(X.shape, "X = samples, features")

scale = StandardScaler(copy=False)
X = scale.fit_transform(X)

FD = SelectFdr(alpha=0.0005)
FD_K = SelectPercentile(percentile=70)
X = FD.fit_transform(X, y)
print(X.shape, "X post FDR alpha filter")
X_FD = FD_K.fit_transform(X, y)
print(X_FD.shape, "X post FDR+K-best alpha filter")

print("\n BASE X models: \n")
ModelParam_GridSearch(X, y, cv=Kcv)
'''
pca = PCA(n_components='mle')
X_PCA = pca.fit_transform(X)
print(X_PCA.shape, "X - PCA, mle")
ModelParam_GridSearch(X_PCA, y, cv=Kcv)
'''