def rfecv_kfold(X_train, y_train, sk_model, out_dir, scoring="accuracy"):
    """Run RFECV (recursive feature elimination with cross-validation).

    Fits ``RFECV`` around ``sk_model`` with a shuffled 3-fold ``KFold``,
    saves the score-versus-feature-count curve to
    ``{out_dir}/plot_rfecv.png`` and writes the selected column names,
    followed by a literal ``"y"`` entry, to
    ``{out_dir}/rfecv_select_cols.csv``.

    Parameters
    ----------
    X_train : table exposing ``.columns``
        Training features; column labels are read via ``X_train.columns``
        (presumably a pandas DataFrame - confirm against callers).
    y_train : array-like
        Training target passed to ``RFECV.fit``.
    sk_model : estimator
        Estimator handed to ``RFECV``.
    out_dir : str
        Directory that receives the plot and the CSV.
    scoring : str, default "accuracy"
        Scoring name forwarded to ``RFECV``.

    Returns
    -------
    None
    """

    def _cv_scores(selector):
        # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the
        # ``cv_results_`` mean scores and fall back for old releases.
        cv_results = getattr(selector, "cv_results_", None)
        if cv_results is not None:
            return cv_results["mean_test_score"]
        return selector.grid_scores_

    def _plot_rfecv(selector):
        scores = _cv_scores(selector)
        # Draw on a dedicated figure and close it afterwards so repeated
        # calls neither overplot earlier curves nor leak figures.
        fig = plt.figure()
        plt.xlabel("Number of features selected")
        plt.ylabel(
            "Cross validation score (nb of correct classifications)")
        plt.plot(range(1, len(scores) + 1), scores)
        plt.savefig(f"{out_dir}/plot_rfecv.png")
        plt.close(fig)

    # RFECV = cross-validation + recursive feature elimination.
    # Beware: large data sets can exhaust memory.
    # RFE starts from all features and removes them one at a time.
    selector = RFECV(sk_model,
                     cv=KFold(3, shuffle=True),
                     scoring=scoring,
                     n_jobs=-1)
    selector.fit(X_train, y_train)

    # Plot the search history.
    _plot_rfecv(selector)

    # Selected features.
    support = selector.get_support()
    select_cols = X_train.columns[support].to_list()
    print("\nselect_cols:\n", select_cols, len(select_cols))
    # Discarded features.
    print("not select_cols:\n", X_train.columns[~support].to_list())

    # Persist the selected features; "y" is appended as in the original
    # output format.
    pd.DataFrame({
        "select_cols": select_cols + ["y"]
    }).to_csv(f"{out_dir}/rfecv_select_cols.csv", index=False)
def select_features_univariate(X, y, method='Decision_Tree'):
    """Reduce a feature matrix with a simple univariate/tree-based rule.

    With high dimensional datasets it aids classifier performance to select
    features of interest. This function rejects features below a certain
    threshold that depends on ``method``.

    Parameters
    ----------
    X : ndarray
        repetitions by features
    y : ndarray
        vector of labels of each repetition
    method : string
        Function used for data reduction, matched case-insensitively:
        {'decision_tree', 'decision_tree_RFECV', 'mutual_information',
        'univariate_select'}. The default ``'Decision_Tree'`` selects the
        decision-tree importance rule.

    Returns
    -------
    X_transformed : ndarray
        repetitions by features (reduced)
    weights : ndarray
        relative importance of each feature (continuous) or a boolean
        support mask, depending on ``method``

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # Normalise the case: the default is spelled 'Decision_Tree' while the
    # branches use lowercase names, which previously matched nothing and
    # ended in an UnboundLocalError.
    method_key = str(method).lower()

    if method_key == 'decision_tree_rfecv':
        trans = RFECV(DecisionTreeClassifier())
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  # binary
    elif method_key == 'decision_tree':
        clf = DecisionTreeClassifier()
        clf.fit(X, y)
        weights = clf.feature_importances_  # continuous
        # keep features whose importance is above the average importance
        X_transformed = X[:, weights > weights.mean()]
    elif method_key == 'mutual_information':
        weights = mutual_info_classif(X, y)  # continuous
        # keep features above the average mutual information
        X_transformed = X[:, weights > weights.mean()]
    elif method_key == 'univariate_select':
        # keep the 50% of features with the highest mean activity
        trans = GenericUnivariateSelect(
            score_func=lambda X, y: X.mean(axis=0),
            mode='percentile',
            param=50)
        X_transformed = trans.fit_transform(X, y)
        weights = trans.get_support()  # binary
    else:
        raise ValueError(
            "Unknown feature selection method: {!r}".format(method))
    return X_transformed, weights
def feature_selection(X, y, estimator, cv=5, n_jobs=2):
    """Run accuracy-scored RFECV and report what it selected.

    Parameters
    ----------
    X, y : array-like
        Training data and target passed to ``RFECV.fit``.
    estimator : estimator
        Estimator wrapped by ``RFECV``.
    cv : int or CV splitter, default 5
        Forwarded to ``RFECV``.
    n_jobs : int, default 2
        Forwarded to ``RFECV``.

    Returns
    -------
    features : ndarray of int
        Indices of the selected features.
    mask : ndarray of bool
        Boolean support mask over all features.
    scores : ndarray
        Cross-validation score for each evaluated feature count.
    """
    rfecv = RFECV(estimator=estimator,
                  step=1,
                  cv=cv,
                  scoring='accuracy',
                  n_jobs=n_jobs,
                  verbose=0)
    rfecv.fit(X, y)

    # ``grid_scores_`` was removed in scikit-learn 1.2; use the mean test
    # scores from ``cv_results_`` when available, else the legacy attribute.
    cv_results = getattr(rfecv, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = rfecv.grid_scores_

    features = rfecv.get_support(True)
    mask = rfecv.get_support()
    return features, mask, scores
def RFECV_filter(df: DataFrame,
                 y: Series,
                 col_list: List,
                 estimator: Any,
                 keep: float = 0.5,
                 step: int = 1,
                 cv: int = 5) -> List:
    """Recursive feature elimination with cross-validation (ROC-AUC scored).

    :param df: table holding the candidate columns
    :param y: target values
    :param col_list: names of the candidate columns in ``df``
    :param estimator: the learner wrapped by RFECV
    :param keep: minimum number of features to keep (int) or the fraction
        of ``col_list`` to keep (float strictly below 1)
    :param step: features removed per elimination round
    :param cv: number of cross-validation folds
    :return: the names from ``col_list`` that RFECV selected
    :raises ValueError: if ``keep`` is a float greater than or equal to 1
    """
    if isinstance(keep, float):
        if keep >= 1:
            # Message (runtime text, kept verbatim): "when keep >= 1, pass
            # an integer".
            raise ValueError('参数keep大于等于1时, 请输入整数')
        # np.ceil returns a float; RFECV expects an integer count here.
        keep = int(np.ceil(len(col_list) * keep))

    selector = RFECV(estimator,
                     min_features_to_select=keep,
                     step=step,
                     cv=cv,
                     scoring='roc_auc',
                     n_jobs=-1)
    selector = selector.fit(df[col_list], y)
    mask = selector.get_support()
    return np.array(col_list)[mask].tolist()
def find_best_features(df_train, y_train):
    """Return a de-correlation mask and the support of a cached RFECV.

    Two cached correlation matrices (``vals_pearson.pkl`` and
    ``vals_spearman.pkl``) are averaged. Columns are then scanned in order:
    every still-kept column ``i`` discards each other column ``j`` whose
    averaged absolute correlation with ``i`` exceeds 0.90.

    Parameters
    ----------
    df_train, y_train
        Not read by the current implementation (all inputs come from the
        cached pickle files); kept for interface compatibility.

    Returns
    -------
    tuple
        ``(res_cols, support)`` where ``res_cols`` is a list of booleans
        (True = column survives the correlation filter) and ``support`` is
        ``get_support()`` of the selector stored in ``rfecv.pkl``.
    """
    # NOTE(security): joblib.load unpickles arbitrary objects - only load
    # files from a trusted source.
    vals_pearson = joblib.load('vals_pearson.pkl')
    vals_spearman = joblib.load('vals_spearman.pkl')
    vals = (vals_pearson + vals_spearman) / 2

    n_rows, n_cols = vals.shape
    res_cols = [True] * n_rows
    for i in range(n_rows):
        # A column that was already discarded must not discard others;
        # res_cols doubles as the O(1) "already dumped" lookup.
        if not res_cols[i]:
            continue
        for j in range(n_cols):
            if i != j and abs(vals[i, j]) > 0.90:
                res_cols[j] = False

    # The selector is loaded from cache instead of being refitted here.
    # Presumably it was fitted offline as
    #   RFECV(RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=16),
    #         step=10, cv=5, scoring=rmse_scorer, verbose=2)
    # on df_train[df_train.columns[res_cols]] - TODO confirm.
    rfecv = joblib.load('rfecv.pkl')
    return (res_cols, rfecv.get_support())
def feature_selection(X, Y, outcome, method, imp_method, data_dir, verbose=0):
    """Reduce ``X`` with RFE, ElasticNet selection or PCA, caching results.

    For 'RFE' and 'ElasticNet' the boolean support mask is cached in
    ``feature_subset_{outcome}_{method}_{imp_method}.h5`` inside
    ``data_dir``; if that file already exists it is loaded and applied
    without refitting. For 'PCA' the fitted PCA object is cached in
    ``pca_comp_{outcome}_{imp_method}.pkl``.

    Parameters
    ----------
    X : ndarray
        Feature matrix; indexed as ``X[:, subset]``.
    Y : ndarray
        Target; an ``int8`` dtype is treated as a classification target.
    outcome, imp_method : str
        Only used to build the cache file names.
    method : {'RFE', 'PCA', 'ElasticNet'}
    data_dir : str
        Directory holding the cache files.
    verbose : int, default 0
        Enables progress messages and is forwarded to ``RFECV``.

    Returns
    -------
    ndarray
        The reduced feature matrix.

    Raises
    ------
    Exception
        If ``method`` is not supported.
    """
    if method not in ['RFE', 'PCA', 'ElasticNet']:
        raise Exception("{} not supported.".format(method))

    is_classf = Y.dtype == np.int8

    feature_subset_path = os.path.join(
        data_dir,
        'feature_subset_{}_{}_{}.h5'.format(outcome, method, imp_method))
    if os.path.exists(feature_subset_path):
        if verbose:
            print("Feature subset already exists. Loading {}...".format(
                feature_subset_path))
        with h5py.File(feature_subset_path, 'r') as hf:
            subset = hf[method][:]
        return X[:, subset]

    selector = None
    if method == 'RFE':
        base_estimator = LinearSVC() if is_classf else LinearSVR()
        selector = RFECV(base_estimator,
                         step=0.1,
                         cv=5,
                         n_jobs=-1,
                         verbose=verbose)
        X_refined = selector.fit_transform(X, Y)
    elif method == 'ElasticNet':
        selector = SelectFromModel(ElasticNetCV(cv=10, n_jobs=-1))
        X_refined = selector.fit_transform(X, Y)
    else:
        pca_path = os.path.join(
            data_dir, 'pca_comp_{}_{}.pkl'.format(outcome, imp_method))
        if os.path.exists(pca_path):
            print("PCA components already exist. Loading {}...".format(
                pca_path))
            # NOTE(security): joblib.load unpickles arbitrary objects -
            # only load cache files from a trusted location.
            pca = joblib.load(pca_path)
            X_refined = pca.transform(X)
        else:
            var_thr = 0.99
            cum_var = PCA().fit(X).explained_variance_ratio_.cumsum()
            exceeds = cum_var > var_thr
            # argmax gives the 0-based index of the first component whose
            # cumulative variance passes the threshold, so the component
            # COUNT is index + 1 (using the bare index dropped the
            # component that actually crosses the threshold).
            if exceeds.any():
                n_components = int(np.argmax(exceeds)) + 1
            else:
                n_components = len(cum_var)
            if verbose:
                print("Number of selected features:", n_components)
            pca = PCA(n_components=n_components)
            X_refined = pca.fit_transform(X)
            joblib.dump(pca, pca_path)

    if selector is not None:
        with h5py.File(feature_subset_path, 'w') as hf:
            hf.create_dataset(method, data=selector.get_support())

    return X_refined
def perform_feature_reduction(x, y):
    """
    Performs feature reduction on x using RFECV.

    For now, it uses linear SVR as estimator, and removes feature by feature.
    The number of CV folds comes from the module-level
    ``N_CV_FEATURE_REDUCTION``; progress is reported through ``log``.

    :param x: feature values (a table exposing ``.columns``)
    :param y: labels
    :return: x restricted to the selected columns (y is not returned)
    """
    estimator = SVR(kernel="linear")
    selector = RFECV(estimator, step=1, cv=N_CV_FEATURE_REDUCTION)

    # str() the labels so non-string column names do not break the join.
    log("Features before reduction (total of {}): {}".format(
        len(x.columns.values), ', '.join(map(str, x.columns.values))))

    selector.fit(x, y)
    x = x[x.columns[selector.get_support(
        indices=True)]]  # keeping the column names

    # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the mean test
    # scores in ``cv_results_`` and fall back for old releases.
    cv_results = getattr(selector, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = selector.grid_scores_

    log("Features after reduction (total of {}): {}".format(
        len(x.columns.values), ', '.join(map(str, x.columns.values))))
    log("Feature ranking: {}".format(', '.join(
        str(e) for e in selector.ranking_)))
    log("Feature grid scores: {}".format(', '.join(
        str(e) for e in scores)))

    return x
def sele_fea(X, y):
    """Select features with RFECV around a linear SVR and plot the scores.

    Parameters
    ----------
    X : array-like
        The data.
    y : array-like
        The target (the age, per the original author's note).

    Returns
    -------
    tuple
        ``(sel_fea, fea_num, sel_index)``: ``X`` reduced to the selected
        features, the number of selected features, and their indices.
    """
    estimator = SVR(kernel="linear")
    # step: the (integer) number of features to remove at each iteration
    # cv: the total number of folds to split the data into
    selector = RFECV(estimator, step=1, cv=2)
    selector = selector.fit(X, y)

    sel_fea = selector.transform(X)  # only the selected features
    fea_num = selector.n_features_
    sel_index = selector.get_support(True)
    print("Optimal number of features : %d" % selector.n_features_)

    # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the mean test
    # scores in ``cv_results_`` and fall back for old releases.
    cv_results = getattr(selector, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = selector.grid_scores_

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected (K)")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()
    return (sel_fea, fea_num, sel_index)
def rfe_filter(feature_filter, finger_name, finger_feature):
    """Select fingerprint features with RFECV and save the score curve.

    Fits ``RFECV`` (linear SVC, 5-fold stratified CV, ROC-AUC) on
    ``finger_feature``, plots score against feature count and saves the
    plot as a JPEG in a hard-coded pictures directory.

    Side effects: changes the process working directory twice (first to the
    picture directory, finally to the fingerprint data directory) and
    prints progress information.

    Parameters
    ----------
    feature_filter, finger_name
        Only used (via ``str``/concatenation) to build the image file name.
    finger_feature
        Feature container passed to ``RFECV.fit`` and then indexed with the
        selected indices.

    Returns
    -------
    ``finger_feature`` indexed by the selected feature indices.
    """
    from sklearn.svm import SVC

    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc,
                  step=1,
                  cv=StratifiedKFold(5),
                  scoring='roc_auc')
    # NOTE(review): `label` is not a parameter of this function; it has to
    # exist as a module-level name - confirm.
    rfecv.fit(finger_feature, label)
    rfecv_get = rfecv.get_support(indices=True)
    # NOTE(review): this indexes with feature indices along the container's
    # default axis (columns for a DataFrame, rows for an ndarray) - confirm
    # the expected type of finger_feature.
    finger_three = finger_feature[rfecv_get]
    print(" ", finger_three.shape)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the mean test
    # scores in ``cv_results_`` and fall back for old releases.
    cv_results = getattr(rfecv, "cv_results_", None)
    if cv_results is not None:
        scores = cv_results["mean_test_score"]
    else:
        scores = rfecv.grid_scores_

    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)

    # Raw strings: in Python 3 a plain "C:\Users..." literal is a syntax
    # error because "\U" starts a unicode escape.
    path_2 = r"C:\Users\Administrator\Desktop\BBB_database_2\指纹数据\特征图片"
    os.chdir(path_2)
    save_name = str(feature_filter) + "_" + str(
        rfecv.n_features_) + "_" + finger_name + "_" + "rfe.jpg"
    # Avoid overwriting an existing plot by switching to the "pca" name.
    if save_name in os.listdir(path_2):
        save_name = str(feature_filter) + "_" + str(
            rfecv.n_features_
        ) + "_" + finger_name + "_" + "pca" + "_" + "rfe.jpg"
    plt.savefig(save_name)

    path = r"C:\Users\Administrator\Desktop\BBB_database_2\指纹数据\多个指纹"
    os.chdir(path)
    return finger_three
def find_best_features(df_train, y_train):
    """Return a de-correlation mask and the support of a cached RFECV.

    The cached Pearson and Spearman correlation matrices
    (``vals_pearson.pkl``, ``vals_spearman.pkl``) are averaged; scanning
    columns in order, each still-kept column ``i`` discards every other
    column ``j`` whose averaged absolute correlation with ``i`` is above
    0.90.

    Parameters
    ----------
    df_train, y_train
        Not read by the current implementation (all inputs come from the
        cached pickle files); kept for interface compatibility.

    Returns
    -------
    tuple
        ``(res_cols, support)``: a list of booleans marking the columns that
        survive the correlation filter, and ``get_support()`` of the
        selector stored in ``rfecv.pkl``.
    """
    # NOTE(security): joblib.load unpickles arbitrary objects - only load
    # files from a trusted source.
    vals_pearson = joblib.load("vals_pearson.pkl")
    vals_spearman = joblib.load("vals_spearman.pkl")
    vals = (vals_pearson + vals_spearman) / 2

    n_rows, n_cols = vals.shape
    res_cols = [True] * n_rows
    for i in range(n_rows):
        # Discarded columns do not get to discard others; res_cols itself
        # is the O(1) record of what has been dumped.
        if not res_cols[i]:
            continue
        for j in range(n_cols):
            if i != j and abs(vals[i, j]) > 0.90:
                res_cols[j] = False

    # The selector is loaded from cache rather than refitted. Presumably it
    # was fitted offline as
    #   RFECV(RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=16),
    #         step=10, cv=5, scoring=rmse_scorer, verbose=2)
    # on df_train[df_train.columns[res_cols]] - TODO confirm.
    rfecv = joblib.load("rfecv.pkl")
    return (res_cols, rfecv.get_support())
class DFRFECV(BaseEstimator, TransformerMixin):
    """RFECV wrapper that takes and returns column-labelled tables.

    Parameters
    ----------
    columns : sequence of column labels, optional
        Columns the selector may work on. ``None`` means "all columns of the
        table passed to ``fit``".
    **kwargs
        Forwarded unchanged to ``RFECV``.

    Attributes
    ----------
    selector : RFECV
        The wrapped selector.
    transform_cols : list or None
        Columns actually passed to the selector; ``None`` until fitted.
    stat_df : DataFrame or None
        One row per candidate feature with its ``ranking`` and ``support``.
        ``grid_score`` row *i* holds the CV score of the *i*-th evaluated
        feature subset (not a per-feature score); it is NaN-padded when
        fewer subsets than features were evaluated.
    """

    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.selector = RFECV(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y):
        """Fit the selector on the configured columns of ``X``."""
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.selector.fit(X[self.transform_cols], y)

        # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the mean
        # test scores in ``cv_results_`` and fall back for old releases.
        cv_results = getattr(self.selector, "cv_results_", None)
        if cv_results is not None:
            scores = cv_results["mean_test_score"]
        else:
            scores = self.selector.grid_scores_
        # The score vector has one entry per evaluated subset, which is
        # shorter than the feature list when step > 1 or
        # min_features_to_select > 1; pad with NaN instead of crashing.
        grid_score = pd.Series(scores).reindex(
            range(len(self.transform_cols))).values

        self.stat_df = pd.DataFrame({
            'feature': self.transform_cols,
            'ranking': self.selector.ranking_,
            'grid_score': grid_score,
            'support': self.selector.get_support()
        })
        return self

    def transform(self, X):
        """Return a copy of ``X`` restricted to the selected columns."""
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )
        features = self.stat_df[self.stat_df['support']]['feature'].values
        new_X = X[features].copy()
        return new_X

    def fit_transform(self, X, y):
        """Fit on ``X``/``y`` and return the reduced copy of ``X``."""
        return self.fit(X, y).transform(X)
def score_param(param, model, X, y, cv):
    """Score one hyper-parameter set after RFECV feature selection.

    ``model`` is called as ``model(**param)`` to build the estimator, once
    for the selector and once for the final scoring.

    Returns
    -------
    tuple
        ``(mean_cv_score, support_mask)``: the mean ``cross_val_score`` of
        a fresh estimator on the selected columns of ``X``, and the
        selector's boolean support mask.
    """
    # Feature selection under these params.
    rfe = RFECV(model(**param), step=1, cv=cv)
    rfe.fit(X, y)
    reduced = rfe.transform(X)

    # The score for these params is the CV score on the reduced data.
    fold_scores = cross_val_score(model(**param), reduced, y, cv=cv)
    mean_score = np.mean(fold_scores)
    return mean_score, rfe.get_support()
class RecursiveFeatureEliminationSelector(Transformer):
    """Transformer that prunes the targeted fields with RFECV.

    ``param`` picks the base model: ``'lr'`` (logistic regression) or
    ``'rf'`` (extra-trees classifier). ``min_features`` is the lower bound
    handed to ``RFECV(min_features_to_select=...)``; ``operate`` raises it
    to at least 5% of the number of fields.
    """
    type = 23

    def __init__(self, param='lr', min_features=1):
        super().__init__("rfe_selector")
        self.input_type = [NUMERICAL, DISCRETE, CATEGORICAL]
        self.params = param
        self.min_features = min_features
        self.optional_params = ['lr', 'rf']

    def operate(self, input_datanode: DataNode, target_fields=None):
        """Select among ``target_fields`` and return a new ``DataNode``.

        The output matrix holds the selected target columns first, followed
        by every non-targeted column unchanged; the feature types are
        reordered the same way.

        :param input_datanode: node providing ``data`` (X, y),
            ``feature_types``, ``task_type`` and ``trans_hist``
        :param target_fields: column indices eligible for elimination;
            defaults to all fields whose type is in ``self.input_type``
        :return: the transformed ``DataNode``
        :raises ValueError: if ``self.params`` names an unknown base model
        """
        from sklearn.feature_selection import RFECV

        feature_types = input_datanode.feature_types
        X, y = input_datanode.data
        if target_fields is None:
            target_fields = collect_fields(feature_types, self.input_type)
        X_new = X[:, target_fields]

        n_fields = len(feature_types)
        targeted = set(target_fields)
        irrevalent_fields = [
            idx for idx in range(n_fields) if idx not in targeted
        ]

        self.min_features = max(self.min_features, n_fields // 20)

        # Fit only once; later calls reuse the fitted selector.
        if self.model is None:
            if self.params == 'lr':
                from sklearn.linear_model import LogisticRegression
                base_model = LogisticRegression(solver='lbfgs')
            elif self.params == 'rf':
                from sklearn.ensemble import ExtraTreesClassifier
                base_model = ExtraTreesClassifier(n_estimators=100)
            else:
                raise ValueError('Invalid base model!')
            self.model = RFECV(base_model,
                               cv=3,
                               min_features_to_select=self.min_features)
            self.model.fit(X_new, y)

        _X = self.model.transform(X_new)
        is_selected = self.model.get_support()

        irrevalent_types = [feature_types[idx] for idx in irrevalent_fields]
        # The support mask is positional over the columns of X_new (i.e. over
        # target_fields), so it must be indexed by position, not by the
        # original field id.
        selected_types = [
            feature_types[idx] for pos, idx in enumerate(target_fields)
            if is_selected[pos]
        ]
        selected_types.extend(irrevalent_types)

        new_X = np.hstack((_X, X[:, irrevalent_fields]))
        new_feature_types = selected_types
        output_datanode = DataNode((new_X, y), new_feature_types,
                                   input_datanode.task_type)
        output_datanode.trans_hist = input_datanode.trans_hist.copy()
        output_datanode.trans_hist.append(self.type)
        self.target_fields = target_fields.copy()

        return output_datanode
def recursive_feature_selection(X, Y):
    """Return the columns of ``X`` chosen by accuracy-scored RFECV.

    The selector wraps a linear-kernel ``SVC``, removes one feature per
    step and evaluates with ``StratifiedKFold(5)``. ``X`` is indexed as
    ``X[:, mask]``.
    """
    eliminator = RFECV(estimator=SVC(kernel="linear"),
                       step=1,
                       cv=StratifiedKFold(5),
                       scoring='accuracy')
    eliminator.fit(X, Y)
    return X[:, eliminator.get_support()]
def selectFeatures(self, select_model):
    """Run RFECV with ``select_model`` on ``self.train``.

    The target is the ``self.label`` column of ``self.train``; all other
    columns are candidate features. ``self.step`` and ``self.cv`` are
    forwarded to ``RFECV``.

    Returns
    -------
    tuple
        ``(select_X, select_columns)``: the reduced feature matrix returned
        by ``fit_transform`` and the labels of the selected columns.
    """
    target = self.train[self.label]
    candidates = self.train.drop(self.label, axis=1)

    rfe = RFECV(estimator=select_model, step=self.step, cv=self.cv)
    reduced = rfe.fit_transform(candidates, target)

    kept_columns = candidates.columns[rfe.get_support(True)]
    return reduced, kept_columns
def SelectRFE_DTCV(dataf, targetf):
    """Select features with a decision-tree RFECV (3-fold CV).

    Parameters
    ----------
    dataf : DataFrame
        Candidate features; fitted on ``dataf.values``.
    targetf : DataFrame or Series
        Target; flattened with ``.values.ravel()``.

    Returns
    -------
    DataFrame
        The selected columns, rebuilt from the transformed array with the
        original column names (the index of ``dataf`` is not carried over).
    """
    rfe = RFECV(DecisionTreeClassifier(), cv=3)
    reduced = rfe.fit_transform(dataf.values, targetf.values.ravel())

    # Map the selected positions back to their column names.
    all_names = dataf.columns.values
    kept_names = [all_names[pos] for pos in rfe.get_support(True)]
    return pd.DataFrame(reduced, columns=kept_names)
def selectBestFeaturesRFECV(samples, classifications, featureNames, classifierClass):
    """Select features with RFECV around ``classifierClass.getEstimator()``.

    Parameters
    ----------
    samples : array-like or scipy sparse matrix
        Feature matrix; sparse input is densified before fitting.
    classifications : array-like
        Target labels.
    featureNames : sequence
        One name per column of ``samples``.
    classifierClass : object
        Provider exposing ``getEstimator()``.

    Returns
    -------
    list
        ``[reduced_samples, selected_feature_names]``.
    """
    fs = RFECV(classifierClass.getEstimator())
    # Only sparse input needs converting; the previous dense -> CSR -> dense
    # round trip copied already-dense data twice for no effect.
    if sprs.issparse(samples):
        samples = samples.toarray()
    samples = fs.fit_transform(samples, classifications)
    sup = fs.get_support()
    featureNames = [featureNames[i] for (i, s) in enumerate(sup) if s]
    return [samples, featureNames]
def selectFeatures(clf, X, Y):
    """Select features of ``X`` with accuracy-scored RFECV around ``clf``.

    Returns
    -------
    tuple
        ``(X[:, indices], indices)`` where ``indices`` is whatever the
        external helper ``find(mask, True)`` returns for the support mask.
    """
    # NOTE(review): StratifiedKFold(Y, 5) is the call form of the legacy
    # sklearn.cross_validation API - confirm which module it comes from.
    folds = StratifiedKFold(Y, 5)
    # Build the RFE object and compute a cross-validated score; "accuracy"
    # is proportional to the number of correct classifications.
    eliminator = RFECV(estimator=clf, step=1, cv=folds, scoring='accuracy')
    eliminator.fit(X, Y)
    indices = find(eliminator.get_support(), True)
    return X[:, indices], indices
def featureSelection(X, y):
    """Return the columns of ``X`` chosen by a random-forest RFECV.

    The forest (35 trees) is wrapped so that its feature importances are
    also exposed as ``coef_``. Selection removes one feature per step and
    is scored by 5-fold accuracy.

    Parameters
    ----------
    X : table exposing ``.columns``
        Candidate features.
    y : array-like
        Target labels.

    Returns
    -------
    The labels from ``X.columns`` that were selected.
    """

    class RandomForestClassifierWithCoef(RandomForestClassifier):
        """Random forest that mirrors ``feature_importances_`` as ``coef_``."""

        def fit(self, *args, **kwargs):
            super(RandomForestClassifierWithCoef, self).fit(*args, **kwargs)
            self.coef_ = self.feature_importances_
            # Estimators are expected to return themselves from fit so
            # callers can chain (e.g. ``est.fit(X, y).predict(X)``).
            return self

    randfor = RandomForestClassifierWithCoef(n_estimators=35)
    rfecv = RFECV(estimator=randfor,
                  step=1,
                  cv=5,
                  scoring='accuracy',
                  verbose=2)
    rfecv.fit(X, y)
    return X.columns[rfecv.get_support()]
def recursive_feature_elimination_withCV(self, estimator, y_train=None, feats=None, n_fold=5, step=1, scoring='accuracy'):
    """Select among ``feats`` with RFECV on a copy of ``self.data``.

    Cross-validation uses ``ShuffleSplit`` with ``n_fold`` splits, each a
    70/20 train/test split (10% of rows intentionally left out),
    ``random_state=42``.

    Parameters
    ----------
    estimator : estimator wrapped by RFECV
    y_train : array-like, optional
        Target; when ``None`` the ``'label'`` column of ``self.data`` is used.
    feats : column labels to choose from
    n_fold, step, scoring : forwarded to ``ShuffleSplit`` / ``RFECV``

    Returns
    -------
    ndarray
        The selected column labels.
    """
    frame = self.data.copy()
    splitter = ShuffleSplit(n_splits=n_fold,
                            test_size=.2,
                            train_size=.7,
                            random_state=42)
    rfe = RFECV(estimator, step=step, scoring=scoring, cv=splitter)

    target = frame['label'] if y_train is None else y_train
    rfe.fit(frame[feats], target)

    return frame[feats].columns.values[rfe.get_support()]
def feature_selection(df, sample):
    """Run RFECV feature selection and plot random-forest importances.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing a ``hotel_cluster`` target column.
    sample : int
        Flag: when falsy, the rows are shuffled and only the first 500 are
        kept before fitting.

    Returns
    -------
    None
        Results are printed and shown as a horizontal bar chart.
    """
    if not sample:
        # Shuffle, then keep a 500-row sample.
        df = df.sample(frac=1).head(500)

    target = df['hotel_cluster']
    predictors = df.drop(columns=['hotel_cluster'])
    X_train, _X_test, y_train, _y_test = train_test_split(predictors,
                                                          target,
                                                          test_size=.2,
                                                          random_state=99)

    estimator = RandomForestClassifier(random_state=99, max_depth=10)
    print("Fitting RF Classifier for Feature Selection ... ")
    estimator.fit(X_train, y_train)

    selector = RFECV(estimator, cv=10, step=.50)
    print("Fitting feature selector ...")
    selector = selector.fit(X_train, y_train)
    print("FIT!")

    # Boolean support mask, one entry per training column.
    support = selector.get_support()
    features = [name for kept, name in zip(support, X_train.columns) if kept]
    print("Num feat: {}".format(selector.n_features_))
    print("Features: {}".format(features))

    n_columns = X_train.shape[1]
    plt.barh(range(n_columns), estimator.feature_importances_, align='center')
    plt.yticks(np.arange(n_columns), X_train.columns.values)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')
    plt.show()
def get_features():
    """Handle the feature-count form: run RFECV and store plot and ranks.

    On a POST with a whole-number ``important_features`` field, the
    features whose ``ranking`` in ``feature_ranking.csv`` is within that
    number are fed to RFECV (L1 multinomial logistic regression, a single
    80/20 stratified shuffle split, accuracy). The score curve is saved to
    ``./static/features.png``, per-feature ranks to ``features.csv``, and
    the client is redirected to ``/model``. Invalid input flashes a message
    and redirects to ``/feature``.

    NOTE(review): for non-POST requests the function falls through and
    returns ``None`` - confirm the route only accepts POST.
    """
    df_train = pd.read_csv("train.csv")
    importance_features_sorted = pd.read_csv("feature_ranking.csv")
    importance_features_sorted = importance_features_sorted.rename(
        columns={"Unnamed: 0": "features"})

    if request.method == 'POST':
        # isdecimal() accepts exactly the strings int() can parse;
        # isnumeric() also accepts characters such as fractions or
        # superscripts, which made int() raise.
        if request.form['important_features'].isdecimal():
            number_of_features = int(request.form['important_features'])
            # Keyword axis: the positional form was removed in pandas 2.0.
            X = df_train.drop('labels', axis=1)
            target = df_train['labels']

            estimator = LogisticRegression(penalty='l1',
                                           solver='saga',
                                           C=2,
                                           multi_class='multinomial',
                                           n_jobs=-1,
                                           random_state=42)
            rfecv = RFECV(estimator=estimator,
                          step=1,
                          cv=StratifiedShuffleSplit(1,
                                                    test_size=.2,
                                                    random_state=42),
                          scoring='accuracy')
            select_features_by_model = importance_features_sorted[
                importance_features_sorted['ranking'] <=
                number_of_features]['features'].tolist()
            rfecv.fit(X[select_features_by_model], target)

            # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the
            # mean test scores in ``cv_results_``.
            cv_results = getattr(rfecv, "cv_results_", None)
            if cv_results is not None:
                scores = cv_results["mean_test_score"]
            else:
                scores = rfecv.grid_scores_

            fig = plt.figure(figsize=(16, 9))
            plt.title('Recursive Feature Elimination with Cross-Validation',
                      fontsize=18,
                      fontweight='bold',
                      pad=20)
            plt.xlabel('Number of features selected', fontsize=14, labelpad=20)
            plt.ylabel('% Correct Classification', fontsize=14, labelpad=20)
            plt.plot(range(1, len(scores) + 1),
                     scores,
                     color='#303F9F',
                     linewidth=3)
            plt.savefig('./static/features.png')
            # Release the figure; otherwise every request leaks one.
            plt.close(fig)

            rfecv_df = pd.DataFrame({'col': select_features_by_model})
            rfecv_df['rank'] = np.nan
            # Selected features get ranks 0..n_features_-1 in support order.
            for index, support in enumerate(rfecv.get_support(indices=True)):
                rfecv_df.loc[support, 'rank'] = index
            # Eliminated features (ranking_ >= 2) are placed after them.
            for index, rank in enumerate(rfecv.ranking_ - 2):
                if rank >= 0:
                    rfecv_df.loc[index, 'rank'] = rfecv.n_features_ + rank
            rfecv_df.to_csv('features.csv')
            return redirect("/model")
        else:
            flash("Please enter a digit for the number of features to select!")
            return redirect("/feature")
def RFE_score(model, X, y):
    """Fit an F1-scored RFECV on a train split and score it on the rest.

    The data is split 67/33 with stratification on ``y``
    (``random_state=1024``); the selector uses 3-fold CV.

    Returns
    -------
    tuple
        ``(selected_indices, f1)``: indices of the selected features and
        the F1 score of the selector's predictions on the held-out split.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(X,
                                                          y,
                                                          test_size=0.33,
                                                          random_state=1024,
                                                          stratify=y)
    rfe = RFECV(model, cv=3, scoring='f1')
    rfe.fit(X_fit, y_fit)

    holdout_f1 = f1_score(y_holdout, rfe.predict(X_holdout))
    return rfe.get_support(indices=True), holdout_f1
def SelectFeatures(featuresStructuresArray, labels):
    """Return the field names selected by an 8-fold RFECV.

    Parameters
    ----------
    featuresStructuresArray : numpy structured array
        One field per feature; field names are read from ``dtype.names``.
    labels : array-like
        Target labels.

    Returns
    -------
    ndarray
        Names of the selected fields.
    """
    # Keyword arguments: current scikit-learn rejects positional
    # hyper-parameters; these are the same penalty/dual values as before.
    estimator = LogisticRegression(penalty='l2', dual=False)
    featureNames = featuresStructuresArray.dtype.names
    featureData = castStructuredArrayToRegular(featuresStructuresArray)

    featuresSelector = RFECV(estimator, cv=8)
    featuresSelector.fit(featureData, labels)

    selectedIndices = featuresSelector.get_support()  # boolean mask
    selectedFeatures = np.array(featureNames)[selectedIndices]
    return selectedFeatures
def selectFeatures(clf, X, Y):
    """Run accuracy-scored RFECV around ``clf`` and slice ``X`` accordingly.

    Returns
    -------
    tuple
        ``(X[:, indices], indices)``; ``indices`` comes from the external
        helper ``find`` applied to the selector's support mask.
    """
    # "accuracy" scoring is proportional to the number of correct
    # classifications.
    # NOTE(review): StratifiedKFold(Y, 5) matches the legacy
    # sklearn.cross_validation signature - confirm the import.
    selector = RFECV(estimator=clf,
                     step=1,
                     cv=StratifiedKFold(Y, 5),
                     scoring='accuracy')
    selector.fit(X, Y)

    support_mask = selector.get_support()
    indices = find(support_mask, True)
    reduced = X[:, indices]
    return reduced, indices
def SelectFeatures(featuresStructuresArray, labels):
    """Return the field names selected by an 8-fold RFECV.

    Parameters
    ----------
    featuresStructuresArray : numpy structured array
        One field per feature; field names are read from ``dtype.names``.
    labels : array-like
        Target labels.

    Returns
    -------
    ndarray
        Names of the selected fields.
    """
    # Keyword arguments: current scikit-learn rejects positional
    # hyper-parameters; these are the same penalty/dual values as before.
    estimator = LogisticRegression(penalty='l2', dual=False)
    featureNames = featuresStructuresArray.dtype.names
    featureData = castStructuredArrayToRegular(featuresStructuresArray)

    featuresSelector = RFECV(estimator, cv=8)
    featuresSelector.fit(featureData, labels)

    selectedIndices = featuresSelector.get_support()  # boolean mask
    selectedFeatures = np.array(featureNames)[selectedIndices]
    return selectedFeatures
def selF(j, X, y, flist):
    """Return the names in ``flist`` that a linear-regression RFECV keeps.

    Parameters
    ----------
    j : unused by the current implementation
    X : array-like feature matrix
    y : ndarray target; flattened with ``ravel()``
    flist : sequence of feature names, indexed by feature position

    Returns
    -------
    list
        Names of the selected features.
    """
    eliminator = RFECV(LinearRegression(), step=1, cv=5)
    eliminator.fit(X, y.ravel())

    support = eliminator.get_support()
    my_feat = [flist[pos] for pos, kept in enumerate(support) if kept]
    print("Number of features after feature selection is", len(my_feat))
    return my_feat
def RFECV_selector(train_x, train_y, k=10):
    """Fit an accuracy-scored RFECV around ``LinearSVC`` and report it.

    Parameters
    ----------
    train_x, train_y : array-like
        Training data and labels.
    k : int, default 10
        Not used by the current implementation; kept for compatibility.

    Returns
    -------
    tuple
        ``(selection, importance)``: the fitted selector and its
        cross-validation scores per evaluated feature count.
    """
    from sklearn.feature_selection import RFECV
    from sklearn.svm import LinearSVC

    svc = LinearSVC()
    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    selection = RFECV(estimator=svc, step=1, scoring='accuracy')
    selection.fit(train_x, train_y)

    # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the mean test
    # scores in ``cv_results_`` and fall back for old releases.
    cv_results = getattr(selection, "cv_results_", None)
    if cv_results is not None:
        importance = cv_results["mean_test_score"]
    else:
        importance = selection.grid_scores_

    print(
        '----------------------------feature importance -------------------------'
    )
    print(importance)
    print(
        '----------------------------- selected feature -------------------------'
    )
    print(selection.get_support(indices=True))
    return selection, importance
def features_selection_method(name, params, X_train, y_train, problem_size):
    """Return the indices of the features chosen by the named method.

    Parameters
    ----------
    name : str
        One of ``"variance_threshold"``, ``"kbest"``, ``"linearSVC"``,
        ``"tree"``, ``"rfecv"``. Any other name selects nothing.
    params : str or number
        Single method parameter, converted per method: the variance
        threshold (float), the fraction of ``problem_size`` to keep for
        ``kbest`` (float), ``C`` for ``linearSVC`` (float), the number of
        trees for ``tree`` (int), the number of folds for ``rfecv`` (int).
    X_train, y_train : array-like
        Training data and labels.
    problem_size : int
        Reference size used to turn the ``kbest`` fraction into ``k``.

    Returns
    -------
    Indices of the selected features (an empty list for unknown names).
    """
    indices = []

    if name == "variance_threshold":
        percent_to_keep = float(params)
        sel = VarianceThreshold(threshold=(percent_to_keep))
        sel.fit(X_train)
        indices = sel.get_support(indices=True)
    elif name == "kbest":
        # Here the parameter is a fraction of the whole problem size.
        k_param = int(float(params) * problem_size)
        # fit(), not fit_transform(): the latter returns the reduced array,
        # which has no get_support().
        model = SelectKBest(chi2, k=k_param).fit(X_train, y_train)
        indices = model.get_support(indices=True)
    elif name == "linearSVC":
        C_param = float(params)
        lsvc = LinearSVC(C=C_param, penalty="l1",
                         dual=False).fit(X_train, y_train)
        model = SelectFromModel(lsvc, prefit=True)
        indices = model.get_support(indices=True)
    elif name == "tree":
        n_estimarors_param = int(params)
        clf = ExtraTreesClassifier(n_estimators=n_estimarors_param)
        clf = clf.fit(X_train, y_train)
        model = SelectFromModel(clf, prefit=True)
        indices = model.get_support(indices=True)
    elif name == "rfecv":
        cv_param = int(params)
        # Create the RFE object and compute a cross-validated ROC-AUC score.
        svc = SVC(kernel="linear")
        rfecv = RFECV(estimator=svc,
                      step=1,
                      cv=StratifiedKFold(cv_param),
                      scoring='roc_auc')
        rfecv.fit(X_train, y_train)
        indices = rfecv.get_support(indices=True)

    return indices
def select_features_with_rfe(data_X, data_Y, feature_names):
    """Return the feature names kept by an accuracy-scored RFECV.

    The selector wraps ``SVC(kernel="linear", C=1)``, removes one feature
    per step and uses 3-fold cross-validation. Each chosen feature is
    printed as it is collected.

    Parameters
    ----------
    data_X, data_Y : array-like
        Training data and labels.
    feature_names : sequence of str
        One name per column of ``data_X``.

    Returns
    -------
    list of str
        Names of the selected features, in column order.
    """
    result = []
    svc = SVC(kernel="linear", C=1)
    rfecv = RFECV(estimator=svc, step=1, cv=3, scoring='accuracy')
    rfecv.fit(data_X, data_Y)
    print("RFE - Optimal number of features : %d" % rfecv.n_features_)
    for idx, val in enumerate(rfecv.get_support()):
        if val:
            # print() call: the statement form is a syntax error on Python 3.
            print("RFE - Choosing feature: " + feature_names[idx])
            result.append(feature_names[idx])
    return result
def rfecv(df, columns, target_col):
    """Reduce ``df[columns]`` with RFECV (linear SVR) and re-attach the target.

    Parameters
    ----------
    df : DataFrame
        Source table.
    columns : list
        Candidate feature columns.
    target_col : label
        Name of the target column in ``df``.

    Returns
    -------
    DataFrame
        The selected feature columns plus ``target_col``, on a fresh
        0..n-1 index.
    """
    X = df[columns]
    y = df[target_col]
    estimator = SVR(kernel="linear")
    # NOTE(review): the number of CV folds equals the number of candidate
    # columns - confirm this is intended.
    selector = RFECV(estimator, step=1, cv=len(columns))
    selector = selector.fit(X, y)
    data = selector.transform(X)

    # Names of the kept columns.
    saved_columns = [
        columns[i] for i, kept in enumerate(selector.get_support()) if kept
    ]

    # The transformed array has lost df's index, so attach the target by
    # position; assigning the Series directly would align on index labels
    # and yield NaN/misplaced targets for a non-default index.
    result = pd.DataFrame(data, columns=saved_columns)
    result[target_col] = y.values
    return result
def recursive_feature_elimination_cv(input_data,
                                     feature_names,
                                     step=0.1,
                                     cv=3,
                                     estimator=SVC(kernel='linear')):
    """
    Recursively eliminates features from x_train and x_test with cross
    validation, uses scikit-learn's RFECV see documentation:
    http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

    If feature_names is given it is also returned with any features removed
    from x_train and x_test also removed from feature_names.

    Three-dimensional inputs are flattened with ``flatten`` before selection
    and restored with ``make3D`` afterwards.

    Args:
        input_data (tuple):     x_train, y_train, x_test, y_test
        feature_names:          The names of all features before feature
                                selection or None. Indexed with a boolean
                                mask, so it must support that (e.g. ndarray).
        estimator (object):     Passed to RFECV, see documentation
        step (int or float):    Passed to RFECV, see documentation
        cv (int):               Passed to RFECV, see documentation

    Returns:
        tuple: (x_train, y_train, x_test, y_test), feature_names, input_args
    """
    x_train, y_train, x_test, y_test = (input_data[0], input_data[1],
                                        input_data[2], input_data[3])

    dims = len(x_train.shape)
    if dims == 3:
        x_train = flatten(x_train)
        x_test = flatten(x_test)

    # Keyword arguments: positionally, the third RFECV parameter is
    # ``min_features_to_select`` (scikit-learn >= 0.20), so ``cv`` was being
    # passed to the wrong parameter; newer releases reject positional use.
    feature_selector = RFECV(estimator, step=step, cv=cv)
    x_train = feature_selector.fit_transform(x_train, y_train)
    x_test = feature_selector.transform(x_test)

    if dims == 3:
        x_train = make3D(x_train)
        x_test = make3D(x_test)

    output_data = (x_train, y_train, x_test, y_test)
    if feature_names is not None:
        mask = feature_selector.get_support()
        feature_names = feature_names[mask]

    args = {'step': step, 'cv': cv, 'estimator': estimator}
    return output_data, feature_names, args
def variable_selection_model_fitting(train, test, model, columns):
    """Return the column names that RFECV selects for ``model``.

    Parameters
    ----------
    train, test : DataFrame
        Data sets; their ``.values`` are split with ``split_x_y``. Only the
        training split is used for selection.
    model : estimator
        Estimator wrapped by RFECV (accuracy scoring, 10 stratified folds).
    columns : sequence
        Feature names, aligned with the columns of the training features.

    Returns
    -------
    list
        Names of the selected features.
    """
    train_x, train_y = split_x_y(train.values)
    # The test split is not used for selection; the call is kept so that a
    # malformed test set still fails here as before.
    split_x_y(test.values)

    # NOTE(review): StratifiedKFold(train_y, 10) is the call form of the
    # legacy sklearn.cross_validation API - confirm the import.
    rfecv = RFECV(estimator=model,
                  step=1,
                  cv=StratifiedKFold(train_y, 10),
                  scoring='accuracy')
    rfecv.fit(train_x, train_y)

    # print() calls: the statement form is a syntax error on Python 3.
    print(rfecv.n_features_)
    rfe_features = [
        col for col, selected in zip(columns, rfecv.get_support()) if selected
    ]
    print(rfe_features)
    return rfe_features
def svc_rfe_cv(dataset, label):
    """Recursive feature elimination with a linear SVC and 10-fold CV.

    At least 100 features are always kept
    (``min_features_to_select=100``); one feature is removed per step.

    Args:
        dataset - training data (a table exposing ``.columns``)
        label - training data labels

    Returns:
        ``dataset`` restricted to the columns RFECV selected
    """
    rfe = RFECV(SVC(kernel="linear"),
                min_features_to_select=100,
                step=1,
                cv=10)
    rfe.fit(dataset, label)

    kept_columns = dataset.columns[rfe.get_support()]
    return dataset[kept_columns]
def recursive_feature_elimination(self, x: np.ndarray, y: np.ndarray, clf=None) -> np.ndarray:
    """Reduce ``x`` with accuracy-scored RFECV around ``clf`` and plot scores.

    Prints progress, shows the score-versus-feature-count curve and prints
    the entries of ``self.features`` that were selected.

    :param x: feature matrix
    :param y: labels
    :param clf: estimator wrapped by RFECV
    :return: ``x`` restricted to the selected features
    """
    # NOTE(review): StratifiedKFold(y) is the call form of the legacy
    # sklearn.cross_validation API - confirm the import.
    rfe = RFECV(estimator=clf,
                step=1,
                cv=StratifiedKFold(y),
                scoring='accuracy',
                verbose=True)
    print("begin eliminate")
    rfe.fit(x, y)
    print("Optimal number of features : %d" % rfe.n_features_)

    # Plot number of features VS. cross-validation scores
    scores = rfe.grid_scores_
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()

    print(self.features[rfe.get_support()])
    return rfe.transform(x)
def train_test(X_train, Y_train, X_test, Y_test, cv_params, custom_grid=False):
    """Fit a Ridge-based RFECV and report train/test metrics.

    A parameter grid is loaded (``custom_grid``) or built and printed, but
    it is not used for fitting: the model is ``RFECV(Ridge(alpha=90000),
    step=500, cv=5)``. ``cv_params`` is not read.

    Returns
    -------
    tuple
        ``(best_grid_params, best_model_params, train_predictions,
        test_predictions, metrics, {})`` where ``best_grid_params`` is the
        fixed ``{'alpha': 30000}``, ``best_model_params`` is
        ``get_params()`` of the selector and ``metrics`` holds Pearson r
        and MSE for both splits.
    """
    if custom_grid:
        random_grid = load_grid(custom_grid)
    else:
        # Grid over alpha only.
        random_grid = {'alpha': np.linspace(30000, 20000, 500)}
    print_grid(random_grid)

    ridge_random = RFECV(Ridge(alpha=90000), step=500, cv=5, verbose=10)
    ridge_random.fit(X_train, Y_train)

    best_grid_params = {'alpha': 30000}
    _support = ridge_random.get_support()  # computed but not returned
    best_model_params = ridge_random.get_params()

    train_predictions = ridge_random.predict(X_train)
    test_predictions = ridge_random.predict(X_test)

    # Metrics.
    metrics = {
        "r_train": pearsonr(Y_train, train_predictions),
        "r_test": pearsonr(Y_test, test_predictions),
        "mse_train": mse(Y_train, train_predictions),
        "mse_test": mse(Y_test, test_predictions),
    }
    r_train = metrics["r_train"]
    r_test = metrics["r_test"]
    mse_train = metrics["mse_train"]
    mse_test = metrics["mse_test"]

    print(f"pearsonr train: {r_train}")
    print(f"pearsonr test: {r_test}")
    print(f"mse train: {mse_train}")
    print(f"mse test: {mse_test}")
    print(best_model_params)

    return best_grid_params, best_model_params, train_predictions, test_predictions, metrics, {}
def run_feature_select(SEED):
    """Run a Ridge-based RFECV and write reduced train/test CSV files.

    Reads ``../data/pre_shuffled_{target,train,test}.csv``, fits
    ``RFECV(Ridge(), step=20, cv=5)``, writes the per-column ranking to
    ``../featureanalysis/RFECV_80.csv`` and the train/test tables without
    the unselected columns to ``../models/RFECV_80_{train,test}.csv``. The
    column ``"var11"`` is always kept.

    Parameters
    ----------
    SEED
        Not used by the current implementation.

    Returns
    -------
    None
    """
    numFeatures = 80  # only used in the output file names

    trainBaseTarget = pd.read_csv('../data/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../data/pre_shuffled_train.csv')
    test = pd.read_csv('../data/pre_shuffled_test.csv')

    estimator = Ridge()
    # NOT tested, must pass scoring function here.
    selector = RFECV(estimator, step=20, cv=5, scoring=None)
    selector.fit(trainBase, trainBaseTarget)
    cols = selector.get_support(indices=False)

    # ``grid_scores_`` was removed in scikit-learn 1.2; prefer the mean test
    # scores in ``cv_results_`` and fall back for old releases.
    cv_results = getattr(selector, "cv_results_", None)
    if cv_results is not None:
        print(cv_results["mean_test_score"])
    else:
        print(selector.grid_scores_)
    print(selector.n_features_)

    p = np.vstack([trainBase.columns, selector.ranking_])
    submission = pd.DataFrame(p.T, columns=None)
    submission.to_csv("../featureanalysis/RFECV_" + str(numFeatures) + ".csv")
    gc.collect()

    def _drop_unselected(frame):
        """Drop, in place, the columns of ``frame`` the selector rejected."""
        rejected = []
        for index, col in enumerate(frame.columns):
            print("Column: " + col)
            # Index the support mask; the selector object itself is not
            # subscriptable (the train loop used ``selector[index]``).
            if not cols[index] and col != "var11":
                print("Dropping")
                rejected.append(col)
        # One drop instead of one per column avoids repeated table rebuilds.
        frame.drop(rejected, axis=1, inplace=True)
        gc.collect()

    _drop_unselected(trainBase)
    trainBase.to_csv("../models/RFECV_" + str(numFeatures) + "_train.csv",
                     index=False)
    gc.collect()

    # The mask is applied by position, so test is assumed to have the same
    # column order as train - TODO confirm.
    _drop_unselected(test)
    test.to_csv("../models/RFECV_" + str(numFeatures) + "_test.csv",
                index=False)
    gc.collect()
'http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html '
# Feature selection with a linear SVM: cross-validated RFE when
# FeatSelection_RFECV is set, otherwise plain RFE down to RFE_FeatsToKeep
# features. Reads X, y and featureNames from the enclosing scope.
svc = LinearSVC(class_weight='auto')#,penalty='l1',dual=False)
# svc = LogisticRegression(class_weight='auto')#,C=1)
if FeatSelection_RFECV==True:
    # Other scorers that can be used here: 'roc_auc', 'recall', 'f1', ...
    rfecv = RFECV(estimator=svc, step=0.1,
                  cv=StratifiedShuffleSplit(y,n_iter=7,test_size=0.33),
                  scoring='f1',verbose=0)
else:
    rfecv = RFE(estimator=svc,n_features_to_select=RFE_FeatsToKeep, step=0.1)
rfecv.fit(X, y)
if FeatSelection_RFECV==True:
    print("RFEcv selected %d number of Optimal features : " % (rfecv.n_features_))
print("RFE (%d Features) scorer : \n" % (rfecv.n_features_),rfecv.score(X, y) )
print("RFE selected feature names:")
# Apply the support mask exactly once: masking the already-reduced name
# array again with the full-length mask fails (length mismatch).
support_mask = rfecv.get_support()
featureNames = featureNames[support_mask]
rfe_featnames = featureNames
print (rfe_featnames)
# Reuse the fitted selector. Refitting with the unseeded shuffle split could
# select different features than the names printed above.
X_RFE = rfecv.transform(X)
print(X_RFE.shape,"X_RFE \n")
'Set GetRFEPerf To true or by user, if perf. of reduced set wanted'
GetRFEPerf=False

print("\n X: \n")
ModelParam_GridSearch(X,y,cv=4)

if GetRFEPerf==True:
    print("\n X-RFE: \n")
    ModelParam_GridSearch(X_RFE,y,cv=4)
def FeatureSelection(data_dict, features_list):
    """Pick features via recursive feature elimination with cross-validation.

    A linear SVC is fitted on min-max scaled features; RFECV removes one
    feature per step and scores every subset size with F1 over a stratified
    shuffle split.  The score curve is plotted and the outcome printed.

    Parameters
    ----------
    data_dict
        Dataset passed straight to ``featureFormat``.
    features_list
        Feature names; the first entry is the label (``'poi'``), the rest
        are the candidate features.

    Returns
    -------
    tuple
        ``(selected_features, F1_score)`` where ``selected_features`` starts
        with ``'poi'`` followed by the retained feature names, and
        ``F1_score`` is the cross-validated score of the chosen subset,
        rounded to 3 decimals.
    """
    # Convert dictionary to numpy array, converts NaN to 0.0
    data = featureFormat(data_dict, features_list,
                         sort_keys=True, remove_all_zeroes=False)
    # Separate into labels = 'poi' and features = rest of features_list
    labels, features = targetFeatureSplit(data)

    # Recursive Feature Elimination with Cross Validation
    from sklearn.feature_selection import RFECV
    # Support Vector Classifier to estimate fit coefficients for each feature
    from sklearn.svm import SVC
    # Cross validation that keeps roughly equal number of POIs in each split
    from sklearn.cross_validation import StratifiedShuffleSplit

    # Estimator whose coefficients drive the elimination.  class_weight is
    # 'auto' because the classes are unbalanced: weights are inversely
    # proportional to class size.
    svc = SVC(kernel='linear', class_weight='auto', random_state=42)

    # SVC needs scaled features.  Missing values are coded 0.0, so MinMax
    # scaling preserves those zeros.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    # StratifiedShuffleSplit keeps roughly the same number of POIs per split.
    sss = StratifiedShuffleSplit(labels, 100, test_size=0.3, random_state=42)

    # f1 = 2 * (precision * recall) / (precision + recall); the model should
    # be able to predict POIs, which are a small percentage of cases.
    metric = 'f1'

    # Run the feature eliminator.
    rfecv = RFECV(estimator=svc, cv=sss, scoring=metric, step=1)
    rfecv = rfecv.fit(features, labels)

    # View results: grid_scores_[i] is the score obtained with i + 1 features.
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score using F1 (precision&recall)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    # plt.savefig('featureSelection.png', transparent=True)
    plt.show()

    print("Optimal number of features is %d" % rfecv.n_features_)
    print('Features selected by recursive feature elimination with cross validation:')
    # With step=1 the score for k features sits at index k - 1.  Indexing
    # with n_features_ itself reported the score of the next-larger subset
    # and raised IndexError when every feature was kept.
    F1_score = round(rfecv.grid_scores_[rfecv.n_features_ - 1], 3)
    print('F1 score from optimal features: %r' % F1_score)

    selection = rfecv.get_support()
    selected_features = ['poi']
    rejected_features = []
    # features_list[0] is the label, so candidate names start at index 1.
    for name, keep in zip(features_list[1:], selection):
        if keep:
            selected_features.append(name)
        else:
            rejected_features.append(name)
    print(selected_features[1:])
    print('Features eliminated:')
    print(rejected_features)
    return selected_features, F1_score
# 0.77320439 0.77538867 0.75253823 0.76103865 0.77505282 0.75834188 # 0.757514 0.76883208 0.77124053 0.7578164 0.76844945 0.76673323 # 0.76369039] ## let's plot out the results plt.figure() plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (ROC_AUC)") plt.plot(range(1, len(rfe_cv.grid_scores_) + 1), rfe_cv.grid_scores_) plt.show() # notice you could have just as well have included the 4 most important # features and received similar accuracy. # you can pull out the features used this way: features_used = explanatory_df.columns[rfe_cv.get_support()] print features_used # Index([u'atbats', u'totalruns', u'shutouts', u'teamID_Nothing'], dtype='object') # you can extract the final selected model object this way: final_estimator_used = rfe_cv.estimator_ # you can also combine RFE with grid search to find the tuning # parameters and features that optimize model accuracy metrics. # do this by passing the RFECV object to GridSearchCV. from sklearn.grid_search import GridSearchCV # doing this for a small range so I can show you the answer in a reasonable amount of time. depth_range = range(4, 6) # notice that in param_grid, I need to prefix estimator__ to my paramerters. param_grid = dict(estimator__max_depth=depth_range)
# Single-argument print(...) calls behave the same under Python 2 and 3.
print("Optimal number of features: {0} of {1} considered".format(
    rfe_cv.n_features_, len(df_mod.columns)))

# Print the scores as the number of features increases -- the farther down
# the list, the higher the number of features considered -- followed by
# their mean (the only value the original printed).
print(rfe_cv.grid_scores_)
print(rfe_cv.grid_scores_.mean())

# Plot the cross-validation score against the number of features kept.
plt.figure()
plt.xlabel('Number of Features selected')
plt.ylabel('Cross Validation score (ROC_AUC)')
plt.plot(range(1, len(rfe_cv.grid_scores_) + 1), rfe_cv.grid_scores_)
plt.show()

# The support mask has one entry per column of the frame the selector is
# reported against above (df_mod), so it must index df_mod's columns.
# NOTE(review): the original indexed `df.columns`; confirm df_mod is the
# frame rfe_cv was fitted on.
features_used = df_mod.columns[rfe_cv.get_support()]
print(features_used)

# The final selected model object is available as `estimator_`.
final_estimator_used = rfe_cv.estimator_

# Grid search over the number of trees.
trees_range = range(10, 550, 10)
param_grid = dict(n_estimators=trees_range)
grid_rf = GridSearchCV(rf, param_grid, cv=10, scoring='roc_auc', verbose=3)
grid_rf.fit(df_mod, response_series)

# Collect the mean score of every grid point.  The attribute is
# `grid_scores_`; `grid_rf_scores_` does not exist on GridSearchCV.
grid_rf_mean_scores = [result[1] for result in grid_rf.grid_scores_]
mean_score, scores.std() / 2, params) log.info(clf.best_estimator_) else: clf.fit(train_x, train_y) if model['name'] == 'Logistic Regression Classifier': # Recurive feature selection with 10-fold cross validation rfecv = RFECV(estimator=clf, step=1, cv=10, scoring='roc_auc') rfecv.fit(train_x, train_y) clf_tmp = rfecv.estimator_ mask = rfecv.get_support() log.debug('Logistic Regression Feature Estimates') for i in xrange(len(train_x.columns[mask])): log.debug(': '.join([train_x.columns[mask][i], str(clf_tmp.coef_[0][i])])) log.debug("Optimal number of features : %d" % rfecv.n_features_) # Plot number of features VS. cross-validation scores plt.figure() plt.title("Optimal number of features: %d" % rfecv.n_features_) plt.xlabel("Number of features selected") plt.ylabel("Cross validation score (nb of correct classifications)") plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_) plt.savefig('./figs/results/%s_feature_selection.png' % model['name']) clf = rfecv pickle.dump(clf, open('pickled_objects/%s_classifier' % model['name'], "wb"))
def main(args):
    """Filter features, score a random forest, then re-score after RFECV.

    Loads ``trainingSetFeatures.csv`` from ``args.train_dir`` (the process
    changes into that directory), applies a family-wise-error F-test filter
    and a k-best filter, cross-validates a random forest on accuracy and
    F1, runs RFECV with the same forest, reports the F1 ratio of the reduced
    set against the filtered set, and plots feature importances.

    Parameters
    ----------
    args
        Object with a ``train_dir`` attribute.  When it is ``None`` a
        hard-coded default directory is assigned to it.
    """
    if args.train_dir is None:
        # Earlier default locations, kept for reference:
        # args.train_dir = '/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/chap/train/'
        # args.train_dir = '/cs/prt3/danofer/ProtFeat/feat_extract/test_seq/NP/SPCleaved_NP-70+NEG-30_Big-V3/'
        # args.train_dir = r'D:\SkyDrive\Dropbox\bioInf_lab\AA_info\CODE\feat_extract\test_seq\NP\SPCleaved_NP-70+NEG-30_Big-V3'
        # args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\NP\SP_Cleaved+NP+Neg_Big'
        args.train_dir = r'E:\Dropbox\Dropbox\bioInf_lab\AA_info\fastas\Benchmarks\Thermophiles'
        print("Using default train_dir: %s" % args.train_dir)

    pandas.set_option('display.max_columns', 10)
    pandas.set_option('display.max_rows', 4)
    # mpl.rc('title', labelsize=6)
    mpl.rc('ytick', labelsize=7)
    mpl.rc('xtick', labelsize=4)
    os.chdir(args.train_dir)
    dataName = 'Neuropeptides'

    df = pandas.read_csv('trainingSetFeatures.csv')
    feature_cols = [col for col in df.columns
                    if col not in ['classname', 'Id', 'proteinname']]
    # Kept as an array so boolean support masks can index it below.
    feature_cols = numpy.array(feature_cols)
    X = df[feature_cols].values
    y = df.classname.values
    le = LabelEncoder()
    y = le.fit_transform(y)

    # Initial feature selection trimming.
    print(X.shape)
    Fwe = SelectFwe(alpha=0.01).fit(X, y)
    X = Fwe.transform(X)
    print("F-test -> ", X.shape)
    feature_cols = feature_cols[Fwe.get_support()]

    # An optional L1-penalised LinearSVC selection step used to sit here;
    # it was disabled in the original (wrapped in a string literal).

    # The F-test filter can leave fewer than 255 columns, and SelectKBest
    # rejects a k larger than the number of features, so clamp it.
    k = SelectKBest(k=min(255, X.shape[1])).fit(X, y)
    X = k.transform(X)
    feature_cols = feature_cols[k.get_support()]

    # Candidate hyper-parameter grid.  NOTE: not used anywhere below.
    param_dist = {"max_depth": [6, 9, None],
                  "max_features": ['auto', 0.4],
                  "min_samples_leaf": [1, 2, 3],
                  "bootstrap": [True, False],
                  'min_samples_split': [2, 3],
                  "criterion": ["gini"],
                  "n_estimators": [100],
                  "n_jobs": [-1]}

    rf = RandomForestClassifierWithCoef(max_depth=7, min_samples_split=1,
                                        min_samples_leaf=2, n_estimators=50,
                                        n_jobs=2, max_features="auto")

    def _new_cv():
        # A fresh random splitter per evaluation, as in the original.
        return cross_validation.StratifiedShuffleSplit(
            y, n_iter=8, test_size=0.2)

    # WARNING! F1 score as implemented by default in binary classification
    # (two classes) gives the score for 1 class.
    scores = cross_validation.cross_val_score(
        rf, X, y, n_jobs=-1, cv=_new_cv())
    print("X RF Accuracy: %0.3f (+- %0.2f)" % (scores.mean(), scores.std() * 2))

    # Instead of scores_f1 we could also use precision, sensitivity,
    # MCC (if binary), etc.
    scores_f1 = cross_validation.cross_val_score(
        rf, X, y, n_jobs=-1, cv=_new_cv(), scoring='f1')
    print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

    # rfeSelect = RFE(estimator=rf,n_features_to_select=16, step=0.04)
    rfeSelect = RFECV(estimator=rf, step=20, cv=2, scoring='f1')  # average_precision , recall
    X_RFE = rfeSelect.fit_transform(X, y)
    print(X_RFE.shape)

    RFE_FeatureNames = feature_cols[rfeSelect.get_support()]
    print(RFE_FeatureNames)

    # F1 of the reduced set as a percentage of the F1 of the filtered set.
    rfe_f1 = cross_validation.cross_val_score(
        rf, X_RFE, y, n_jobs=-1, cv=_new_cv(), scoring='f1')
    RFE_ScoreRatio = 100 * (rfe_f1.mean()) / scores_f1.mean()
    print("Even with just", X_RFE.shape[1],
          " features, we have %f performance! (f1 score ratio)" % (RFE_ScoreRatio))

    # PlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
    print("Alt plot:")
    altPlotFeaturesImportance(X_RFE, y, RFE_FeatureNames, dataName)
def GetAllPerf (filePaths=None):
    """Evaluate every training-set file and save a summary table.

    For each file: load the data, filter features (k-best, then
    family-wise-error F-test), score a most-frequent dummy baseline, search
    for the best model by F1 and by accuracy, cross-validate both, and
    store the results in one row of a table that is finally written to
    ``OutputData.tsv``.

    Parameters
    ----------
    filePaths : list of str, optional
        Paths of the feature CSV files.  When ``None``, every
        ``trainingSetFeatures.csv`` found under ``./test_seq`` is used.
    """
    # Imported from the public package path; `sklearn.metrics.score` (as in
    # the original) is not a module.
    from sklearn.metrics import make_scorer

    if filePaths is None:
        filePaths = list(find_files(directory='./test_seq',
                                    pattern='trainingSetFeatures.csv'))

    # Sanity check:
    # filePaths=['/a/fr-05/vol/protein/danofer/ProtFeat/feat_extract/test_seq/Thermophile']
    # filePaths=['./test_seq/NP/NP2/Train/trainingSetFeatures.csv']

    print("FilePaths: \n", filePaths)
    fileNames = fileNameFromPaths(filePaths)
    print("FileNames:", fileNames)

    # resDict holds results for each file/class, for saving to output-file.
    resDict = pd.DataFrame(
        index=fileNames,
        columns=['Accuracy', 'Accuracy_SD', 'f1', 'f1_SD',
                 'dummy_freq:Accuracy', 'dummy_freq:f1',
                 'LargestClassPercent', 'Classes',
                 # 'TopRFE-Features','Best (f1) Model parameters',
                 '# Classes', 'Array-Acc-Scores', 'Array-f1-Scores',
                 'bestML-Acc', 'bestML-f1', 'dummy_freq_f1_weighted'])

    # Averaging mode shared by the custom scorer and the dummy baseline.
    # Options: weighted, micro, macro, None.
    f1_average = "micro"

    for filePath, rawName in zip(filePaths, fileNames):
        # Backslashes in Windows file names:
        # http://pythonconquerstheuniverse.wordpress.com/2008/06/04/gotcha-%E2%80%94-backslashes-in-windows-filenames/
        filePath = os.path.normpath(filePath)
        print(filePath)
        fileName = str(rawName)  # Str added now 14.1
        print("fileName: %s" % (fileName))

        # X, y, lb_encoder,featureNames = load_data(filePath+fileName, 'file')
        X, y, lb_encoder, featureNames = load_data(filePath, 'file')  # X, y = features, labels
        print(X.shape, "= (samples, features)")

        y_inv = Counter(lb_encoder.inverse_transform(y))
        # 100.0 keeps the division floating-point under Python 2 as well.
        MajorityPercent = round(
            100.0 * y_inv.most_common()[0][1] / sum(y_inv.values()), 1)
        print("Classes:", lb_encoder.classes_)
        print("MajorityClassPercent:", MajorityPercent)
        resDict.LargestClassPercent[fileName] = MajorityPercent
        resDict.Classes[fileName] = str(lb_encoder.classes_)
        resDict["# Classes"][fileName] = len(lb_encoder.classes_)

        # This is just temporary for the outputs - saves computation time.
        # Barely filters compared to the model itself.  None disables it.
        KFilt = 350
        if KFilt is not None:
            # SelectKBest rejects k above the feature count, so clamp it
            # for small data sets.
            k = SelectKBest(k=min(KFilt, X.shape[1])).fit(X, y)
            X = k.transform(X)
            featureNames = featureNames[k.get_support()]

        Fwe = SelectFwe(alpha=0.01).fit(X, y)
        X = Fwe.transform(X)
        featureNames = featureNames[Fwe.get_support()]
        print("X reduced to K best features: ", X.shape)

        FeatSelection_SVM = False  # Feature Names need updating!!
        FeatSelection_RandLogReg = False

        if FeatSelection_RandLogReg == True:
            LogRegFeats = RandomizedLogisticRegression(
                C=10, scaling=0.5, sample_fraction=0.95, n_resampling=40,
                selection_threshold=0.2, n_jobs=-1).fit(X, y)
            X_L1 = LogRegFeats.transform(X)
            featureNames = featureNames[LogRegFeats.get_support()]
            print("RandomizedLogisticRegression Feature Selection ->:", X_L1.shape)
        elif FeatSelection_SVM == True:
            svc_L1 = LinearSVC(C=30, penalty="l2", dual=False,
                               class_weight='auto').fit(X, y)
            X_L1 = svc_L1.transform(X, y)
            featureNames = featureNames[list(set(np.where(svc_L1.coef_ != 0)[-1]))]
            print("L1 SVM Transformed X:", X_L1.shape)
        # X=X_L1

        # Disabled:
        # print("Performance as a function of percent of features used:")
        # PlotPerfPercentFeatures(X,y,est=LinearSVC())

        # EG - graph best features; feature selection using RF, ensemble
        # classifiers..
        # http://nbviewer.ipython.org/github/herrfz/dataanalysis/blob/master/assignment2/samsung_data_prediction_submitted.ipynb
        RFE_FeatsToKeep = 16
        FeatSelection_RFE = False
        FeatSelection_RFECV = False
        # Stays None unless one of the RFE switches above is enabled.
        X_RFE = None

        if (FeatSelection_RFE or FeatSelection_RFECV) == True:
            # RFE + - best feats
            # http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html
            svc = LinearSVC(class_weight='auto')  # ,penalty='l1',dual=False)
            # svc = LogisticRegression(class_weight='auto')#,C=1)
            if FeatSelection_RFECV == True:
                # Other options: cv=StratifiedShuffleSplit(y,n_iter=3,test_size=0.3),
                # scoring='roc_auc','recall','f1','accuracy', verbose=0
                rfecv = RFECV(estimator=svc, step=RFE_FeatsToKeep,
                              scoring='average_precision')
            else:
                rfecv = RFE(estimator=svc,
                            n_features_to_select=RFE_FeatsToKeep, step=0.03)
            rfecv.fit(X, y)
            if FeatSelection_RFECV == True:
                print("RFE-CV selected %d features : " % (rfecv.n_features_))
            print("RFE (%d features) scorer : " % (rfecv.n_features_),
                  rfecv.score(X, y))
            rfe_featnames = featureNames[rfecv.get_support()]
            featureNames = rfe_featnames
            print("RFE selected feature names:", rfe_featnames)
            # Reuse the fitted selector so X_RFE matches the names above;
            # a second fit could select a different subset.
            X_RFE = rfecv.transform(X)
            print("X_RFE", X_RFE.shape)
            # The column is commented out of the table definition above;
            # create it on demand so this assignment cannot raise KeyError.
            if 'TopRFE-Features' not in resDict.columns:
                resDict['TopRFE-Features'] = None
            resDict['TopRFE-Features'][fileName] = str(rfe_featnames)

        # Set GetRFEPerf to True (or by user) if the performance of the
        # reduced set is wanted.
        GetRFEPerf = False

        # print("lb_encoder.classes_",lb_encoder.classes_)
        # Blind score boxplot graphic example using Seaborn:
        # http://nbviewer.ipython.org/github/cs109/2014/blob/master/homework-solutions/HW5-solutions.ipynb
        # Confusion matrixes + Dummies:
        # http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
        # http://scikit-learn.org/stable/modules/model_evaluation.html#dummy-estimators
        # http://blog.yhathq.com/posts/predicting-customer-churn-with-sklearn.html
        print()

        # Make custom F1 scorer. May not have fixed problem!
        # Maybe another metric? May NOT be fixed!?
        f1_scorer = make_scorer(metrics.f1_score, greater_is_better=True,
                                average=f1_average)

        # print("Dummy classifiers output:")
        dummy_frequent = DummyClassifier(strategy='most_frequent', random_state=0)
        y_dummyPred = Get_yPred(X, y, clf_class=dummy_frequent)
        dummy_freq_acc = '{:.3}'.format(metrics.accuracy_score(y, y_dummyPred))
        # A scorer object is called as scorer(estimator, X, y), not with
        # (y_true, y_pred); call the metric directly with the same averaging
        # the scorer uses.
        # NOTE(review): the column is named "..._weighted" but the scorer
        # was configured with average="micro"; confirm which is intended.
        dummy_freq_f1_weighted = '{:.3}'.format(
            metrics.f1_score(y, y_dummyPred, average=f1_average))
        # Get from ALL classes f1..
        dummy_freq_f1_mean = (metrics.f1_score(y, y_dummyPred, average=None)).mean()

        # print("Dummy, most frequent acc:",dummy_freq_acc)
        # dummy_stratifiedRandom = DummyClassifier(strategy='stratified',random_state=0)
        # dummy_strat2= '{:.3%}'.format(metrics.accuracy_score(y, Get_yPred(X,y,clf_class=dummy_frequent))) #,sample_weight=balance_weights(y)))
        # print("Dummy, Stratified Random:",dummy_strat2)
        print()

        resDict['dummy_freq:Accuracy'][fileName] = dummy_freq_acc
        resDict['dummy_freq:f1'][fileName] = dummy_freq_f1_mean
        resDict['dummy_freq_f1_weighted'][fileName] = dummy_freq_f1_weighted
        # resDict.dummy_Stratfreq[fileName]=dummy_strat2

        # We can get separately the best model for Acc, and the best for f1!
        # WARNING!? In binary case - default F1 works for the 1 class, in
        # sklearn 15. and lower.
        # bestEst_f1,bestScore_f1 = ModelParam_GridSearch(X,y,cv=3,scoreParam = 'f1')
        # Temporary workaround until next SKlearn update of F1 metric:
        bestEst_f1, bestScore_f1 = ModelParam_GridSearch(
            X, y, cv=3, scoreParam=f1_scorer)
        bestEst_acc, bestScore_acc = ModelParam_GridSearch(
            X, y, cv=2, scoreParam='accuracy')
        print("bestEst (f1):", bestEst_f1)  # ,"best f1",bestScore_f1)
        # Label corrected: this line shows the accuracy-optimised estimator.
        print("bestEst (acc):", bestEst_acc)  # ,"best acc",bestScore_acc)

        # Temp
        # bestEst_f1=bestEst_acc=bestEst = RandomForestClassifier(n_jobs=-1)

        if GetRFEPerf == True and X_RFE is not None:
            bestEst_RFE, bestScore_RFE = ModelParam_GridSearch(
                X_RFE, y, cv=3, scoreParam='f1')

        # Modified to get 2 estimators.
        scores_acc = cross_val_score(
            estimator=bestEst_acc, X=X, y=y,
            cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
            n_jobs=-1)  # Accuracy
        print("Accuracy: %0.3f (+- %0.2f)" % (scores_acc.mean(), scores_acc.std() * 2))
        scores_f1 = cross_val_score(
            estimator=bestEst_f1, X=X, y=y,
            cv=StratifiedShuffleSplit(y, n_iter=13, test_size=0.18),
            n_jobs=-1, scoring='f1')
        print("f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))

        resDict['Accuracy'][fileName] = round(scores_acc.mean(), 4)
        resDict['Accuracy_SD'][fileName] = round(scores_acc.std(), 4)
        resDict['f1'][fileName] = round(scores_f1.mean(), 4)
        resDict['f1_SD'][fileName] = round(scores_f1.std(), 4)
        resDict['Array-f1-Scores'][fileName] = (scores_f1)
        resDict['Array-Acc-Scores'][fileName] = (scores_acc)
        resDict['bestML-f1'][fileName] = (str(bestEst_f1))
        resDict['bestML-Acc'][fileName] = (str(bestEst_acc))

        # ORIG
        # Acc,Acc_SD,f1,f1_SD = CV_multi_stats(X, y, bestEst,n=15)
        # resDict['Accuracy'][fileName]=round(Acc,4)
        # resDict['Accuracy_SD'][fileName]=round(Acc_SD,4)
        # resDict['f1 score'][fileName]=round(f1,4)
        # resDict['f1_SD'][fileName]=round(f1_SD,4)
        # resDict['Best (f1) Model parameters'][fileName]= bestEst

        print()
        # print(fileName," Done")

    print("Saving results to file")
    resDict.to_csv("OutputData.tsv", sep=',')