def RFE_selector(estimator, n_features_to_select, X_data, Y_data):
    """Reduce ``X_data`` to the columns kept by recursive feature elimination.

    Args:
        estimator: Estimator handed to ``RFE`` to rank the features.
        n_features_to_select: Number of columns to keep.
        X_data: Feature table; must expose ``.columns``.
        Y_data: Target values passed to the selector's fit.

    Returns:
        A new ``pd.DataFrame`` built from the selector's output and labelled
        with the original names of the selected columns.
    """
    columns = X_data.columns
    selector = RFE(estimator=estimator, n_features_to_select=n_features_to_select)
    # Fit exactly once.  Fitting twice doubled the cost and allowed the
    # labels (taken from one fit) to disagree with the data (taken from the
    # other) whenever the estimator is not deterministic.
    reduced = selector.fit_transform(X_data, Y_data)
    labels = [columns[x] for x in selector.get_support(indices=True)]
    return pd.DataFrame(reduced, columns=labels)
def ref1(X, y, features_name):
    """Order feature names by their RFE rank under a linear SVC.

    ``RFE`` is run down to a single remaining feature with
    ``LinearSVC(random_state=1)``.  Names are returned sorted by
    (rank, name), so the feature ranked 1 comes first.

    Args:
        X: Feature matrix passed to the selector.
        y: Target labels passed to the selector.
        features_name: Names paired positionally with ``ranking_``.

    Returns:
        List of feature names in ascending rank order.
    """
    svc = LinearSVC(random_state=1)
    eliminator = RFE(estimator=svc, n_features_to_select=1)
    eliminator.fit_transform(X, y)
    rounded_ranks = [round(rank, 4) for rank in eliminator.ranking_]
    ranked_pairs = sorted(zip(rounded_ranks, features_name[:]))
    return [pair[1] for pair in ranked_pairs]
def rfe(self, frame):
    """Select the feature subset that scores best under RFE + linear regression.

    Every feature count from 1 to the number of predictor columns is tried
    on a fixed 70/30 split; the count with the highest test score wins and
    RFE is then re-run on the full data with that count.

    Note: when ``frame[self.class_col]`` has dtype ``object`` it is replaced
    in place by its integer category codes, so the caller's frame is
    modified.

    Args:
        frame: DataFrame holding the predictors and the ``self.class_col``
            column.

    Returns:
        List of the selected column names (empty when there are no
        predictor columns).
    """
    if frame[self.class_col].dtype == "object":
        frame[self.class_col] = frame[self.class_col].astype('category')
        frame[self.class_col] = frame[self.class_col].cat.codes
    Y = frame[[self.class_col]].fillna(0)
    X = frame.drop(columns=[self.class_col])
    cols = list(X.columns)
    if not cols:
        return []

    # The split uses a fixed random_state, so it is identical on every
    # iteration; compute it once instead of once per feature count.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=0)

    # Start below any achievable score so a count is always chosen, even
    # when every R^2 is <= 0 (previously that left nof == 0 and the final
    # RFE call failed).
    high_score = float("-inf")
    nof = 0
    for n_features in range(1, len(cols) + 1):
        model = LinearRegression()
        selector = RFE(model, n_features_to_select=n_features)
        X_train_rfe = selector.fit_transform(X_train, y_train.values.ravel())
        X_test_rfe = selector.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features

    final_selector = RFE(LinearRegression(), n_features_to_select=nof)
    final_selector.fit(X, Y.values.ravel())
    return [col for col, keep in zip(cols, final_selector.support_) if keep]
def Recursive_Feature_Elimination(self, X_train, X_test, y_train, y_test, x, y, file_name = 'model.sav'):
    """Choose a feature count with RFE, refit on all data, and persist the result.

    Feature counts from 1 to ``len(x.columns) - 1`` are scored with a
    ``LinearRegression`` on the supplied train/test split.  RFE is then run
    on the full ``x``/``y`` with the best count, the final model is pickled
    to ``file_name`` and the selected column names are written, one per
    line, to ``parameters_selection.txt``.

    Args:
        X_train, X_test, y_train, y_test: Pre-made split used for scoring.
        x: Full feature DataFrame (column names are taken from it).
        y: Full target.
        file_name: Path of the pickle file for the fitted model.

    Returns:
        pandas Index with the names of the selected columns.
    """
    # Start below any achievable score so that a count is chosen even when
    # all scores are <= 0 (otherwise nof stays 0 and the final RFE fails).
    high_score = float("-inf")
    nof = 0
    for n_features in range(1, len(x.columns)):
        model = LinearRegression()
        # n_features_to_select must be passed by keyword in current
        # scikit-learn releases.
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features
    print("Optimum number of features: %d with score: %f" % (nof, high_score))

    cols = list(x.columns)
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=nof)
    X_rfe = rfe.fit_transform(x, y)
    model.fit(X_rfe, y)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index

    # Context managers make sure both output files are closed.
    with open(file_name, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open('parameters_selection.txt', 'w') as f:
        for item in selected_features_rfe:
            f.write("%s\n" % item)
    return selected_features_rfe
def clustering_rfp(cluster_range, RFE_component_diabetes, dataset, dir):
    """Cluster RFE-reduced data with k-means and EM, then plot time and accuracy.

    For every feature count in ``RFE_component_diabetes`` the standardized
    features are reduced with ``RFE`` (linear ``SVR``).  For every cluster
    count in ``cluster_range`` k-means and a Gaussian mixture are fitted on
    the reduced data, and their accuracy and wall-clock time are recorded.
    ``nn_experiment`` is run once per feature count on the reduced dataset.

    Side effects:
        * Sets the module globals ``diabetes_rp`` (the fitted selector),
          ``x_rp`` (the reduced features) and ``diabetes_dataset_rp``.
        * ``diabetes_dataset_rp`` is the same object as ``dataset``, so the
          caller's ``dataset.x`` / ``dataset.y`` are overwritten.
        * Produces four plots through ``common_utils``.

    Args:
        cluster_range: Iterable of cluster counts to try.
        RFE_component_diabetes: Iterable of feature counts for RFE.
        dataset: Object whose ``.data`` is a DataFrame with the target in
            the last column.
        dir: Passed through to the plotting helpers.
    """
    global diabetes_rp, x_rp, diabetes_dataset_rp

    df = dataset.data
    x = df.iloc[:, 0:-1]
    y = df.iloc[:, -1].astype('int')
    x = StandardScaler().fit_transform(x)

    NN_RFE_accuracy = defaultdict(dict)
    estimator = SVR(kernel="linear")
    kmeans_accuracy_RFE = defaultdict(dict)
    kmeans_time_RFE = defaultdict(dict)
    em_accuracy_RFE = defaultdict(dict)
    em_time_RFE = defaultdict(dict)

    for RFE_comp in RFE_component_diabetes:
        # One fit serves both the clustering input and the globals; the
        # original fitted the same selector configuration twice.
        diabetes_rp = RFE(estimator, n_features_to_select=RFE_comp)
        x_rp = diabetes_rp.fit_transform(x, y)
        diabetes_data_RFE_df = pd.DataFrame(data=x_rp)

        diabetes_dataset_rp = dataset
        diabetes_dataset_rp.x = x_rp
        diabetes_dataset_rp.y = y

        for cluster in cluster_range:
            # k-means (the timed span includes the accuracy computation)
            start = datetime.now()
            myk_mean_RFE_prediction = KMeans(
                n_clusters=cluster, random_state=0).fit_predict(diabetes_data_RFE_df)
            kmeans_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, myk_mean_RFE_prediction)
            end = datetime.now()
            kmeans_accuracy_RFE[RFE_comp][cluster] = kmeans_accuracy_for_k
            kmeans_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()

            # EM
            start = datetime.now()
            em_pca_prediction_y = GaussianMixture(n_components=cluster).fit(
                diabetes_data_RFE_df).predict(diabetes_data_RFE_df)
            em_pca_accuracy_for_k = common_utils.get_cluster_accuracy(
                y, em_pca_prediction_y)
            end = datetime.now()
            em_accuracy_RFE[RFE_comp][cluster] = em_pca_accuracy_for_k
            em_time_RFE[RFE_comp][cluster] = (end - start).total_seconds()

        # NOTE(review): nesting inferred as once per feature count - confirm.
        NN_RFE_accuracy[RFE_comp] = nn_experiment(diabetes_dataset_rp)

    common_utils.plot_feature_transformation_time(
        kmeans_time_RFE, "k-means RFE clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        kmeans_accuracy_RFE, "k-means RFE clusters vs accuracy", dir)
    common_utils.plot_feature_transformation_time(
        em_time_RFE, "EM RFE clusters vs time", dir)
    common_utils.plot_feature_transformation_accuracy(
        em_accuracy_RFE, "EM RFE clusters vs accuracy", dir)
def RFE():
    """Demonstrate RFE with linear regression on the module-level ``X`` and ``y``.

    First runs RFE with 5 features and prints ``support_`` and ``ranking_``.
    Then scores feature counts 1..12 on a fixed 75/25 split and prints the
    count with the highest test score (0 if no score is positive).

    Note: this function shares its name with scikit-learn's ``RFE`` class;
    the local import below makes ``RFE`` refer to the class inside the body.
    """
    from sklearn.feature_selection import RFE

    # LinearRegression takes no data in its constructor; X and y belong to
    # fit().  The original passed them as constructor arguments.
    model = LinearRegression()
    rfe = RFE(model, n_features_to_select=5)
    X_rfe = rfe.fit_transform(X, y)
    model.fit(X_rfe, y)
    print(rfe.support_)
    print(rfe.ranking_)

    # Fixed random_state: the split is the same for every count, so it is
    # computed once.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=100)

    high_score = 0
    nof = 0  # feature count with the best score so far
    for n_features in range(1, 13):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features
    print("Optimum number of features: %d" % nof)
    print("Score with %d features: %f" % (nof, high_score))
def rfe(predictors, target, number_of_features):
    """Return the names of the features kept by RFE with linear regression.

    RFE fits a linear regression repeatedly, dropping the weakest feature(s)
    each round until ``number_of_features`` remain.

    Args:
        predictors: DataFrame of candidate features.
        target: Target variable.
        number_of_features: How many features to keep.

    Returns:
        List of the selected column names, in their original column order.
    """
    lm = LinearRegression()
    # n_features_to_select must be passed by keyword in current
    # scikit-learn releases.
    selector = RFE(lm, n_features_to_select=number_of_features)
    selector.fit(predictors, target)
    # support_ is a boolean mask over the columns: True for kept features.
    return predictors.columns[selector.support_].tolist()
def FeatureSelectTune():
    """Try decreasing feature counts with RFE + AdaBoost and report results.

    Uses the module-level ``features_list``, ``features_train``,
    ``labels_train`` and ``my_dataset``.  For every count from
    ``len(features_list) - 1`` down to 2 it runs RFE, prints the selected
    feature list (prefixed with ``'poi'``), evaluates the fitted estimator
    with ``test_classifier`` and collects ``(count, pred[2])`` for truthy
    results.  All collected results are pretty-printed at the end.
    """
    f_size = len(features_list)
    results = []
    for i in range(f_size - 1, 1, -1):
        print(i)
        clf = AdaBoostClassifier(learning_rate=0.3,
                                 n_estimators=100,
                                 random_state=22)
        scan = RFE(estimator=clf, n_features_to_select=i)
        scan.fit_transform(features_train, labels_train)
        # Materialize the pairs: on Python 3 a bare zip is a one-shot
        # iterator, so it would be exhausted by the comprehension below and
        # the later truthiness check / pprint would see nothing useful.
        new_feature_zip = list(zip(features_list[1:], scan.ranking_))
        new_features_list = ['poi'] + \
            [name for name, rank in new_feature_zip if rank == 1]
        pprint.pprint(new_features_list)
        clf = scan.estimator_
        pred = test_classifier(clf, my_dataset, new_features_list)
        if pred:
            results.append((i, pred[2]))
        if new_feature_zip:
            pprint.pprint(new_feature_zip)
    pprint.pprint(results)
def rfe_function(self, df):
    """Keep the ``self.k`` features selected by RFE with a Ridge estimator.

    The last column of ``df`` is treated as the target and all other
    columns as candidate features.

    Args:
        df: DataFrame with the target in its last column.

    Returns:
        ``df`` restricted to the selected feature columns, in their original
        column order.
    """
    k = self.k
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # NOTE(review): Ridge's ``normalize`` argument is rejected by newer
    # scikit-learn releases - confirm the pinned version.
    lr = Ridge(alpha=100000, fit_intercept=True, normalize=True,
               copy_X=True, max_iter=1500, tol=1e-4, solver='auto')
    rfe = RFE(estimator=lr, n_features_to_select=k)
    rfe.fit(X, y)
    # Selected features have ranking_ == 1 (support_ == True).  Sorting the
    # ranking in descending order, as was done before, picked the features
    # RFE eliminated first, i.e. the ones it judged least useful.
    co_list = X.columns[rfe.support_].to_list()
    print('Columns after selections are', co_list)
    return df[co_list]
def select_rfe(X, y, k):
    """Return the names of the ``k`` features kept by RFE with linear regression.

    Args:
        X: DataFrame of candidate features.
        y: Target variable.
        k: Number of features to keep.

    Returns:
        List of the selected column names, in their original column order.
    """
    lm = LinearRegression()
    # n_features_to_select must be passed by keyword in current
    # scikit-learn releases.
    rfe = RFE(lm, n_features_to_select=k)
    rfe.fit(X, y)
    return X.columns[rfe.support_].tolist()
def ref5(X, y, features_name):
    """Order feature names by their RFE rank under Complement Naive Bayes.

    ``X`` is min-max scaled first, then ``RFE`` is run down to a single
    remaining feature.  Names are returned sorted by (rank, name), so the
    feature ranked 1 comes first.

    Args:
        X: Feature matrix.
        y: Target labels.
        features_name: Names paired positionally with ``ranking_``.

    Returns:
        List of feature names in ascending rank order.
    """
    eliminator = RFE(estimator=ComplementNB(), n_features_to_select=1)
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(X)
    eliminator.fit_transform(scaled, y)
    rounded_ranks = [round(rank, 4) for rank in eliminator.ranking_]
    ranked_pairs = sorted(zip(rounded_ranks, features_name[:]))
    return [pair[1] for pair in ranked_pairs]
def feature_selection(df, target):
    """Score how many numeric features are worth keeping for predicting ``target``.

    Column types are inferred with ``Ptype``; columns predicted as ``'int'``
    or ``'float'`` are standardized.  Feature counts from 1 to
    ``len(features) - 1`` are then scored with RFE + linear regression on a
    fixed 70/30 split.

    Args:
        df: Input DataFrame.
        target: Name of the target column; it must be among the columns
            detected as numeric.

    Returns:
        ``best_count / len(features)``, where ``best_count`` is the feature
        count with the highest test score, or 0 when no count reaches a
        positive score.
    """
    ptype = Ptype()
    ptype.run_inference(df)
    predicted = ptype.predicted_types
    # NOTE(review): an unused type table that used to sit in this function
    # spelled the integer type 'integer'; confirm that 'int' is really what
    # Ptype reports.
    features = [key for key in predicted
                if predicted[key] == 'int' or predicted[key] == 'float']

    x = pd.DataFrame(StandardScaler().fit_transform(df.loc[:, features].values))
    x.columns = features
    # Keyword form: the positional ``axis`` argument of drop() is not
    # accepted by current pandas.
    X = x.drop(columns=target)  # feature matrix
    y = x[target]  # target variable

    # Fixed random_state: same split on every iteration, so compute it once.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    high_score = 0
    nof = 0  # feature count with the best score so far
    for n_features in range(1, len(features)):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features

    # The previous final RFE fit on the full data only produced a value that
    # was never used, so it has been dropped.
    return nof / len(features)
def extract_feature():
    """Run RFE (linear regression, 2 features) on a fixed spreadsheet and print the result.

    Reads the Excel file, uses a hard-coded list of predictor column labels
    and column ``1`` as the target, then prints the selector's ``support_``
    mask followed by its ``ranking_``.
    """
    predictor_columns = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 18]
    sheet = pd.read_excel('fuck1')
    predictors = sheet[predictor_columns]
    response = sheet[[1]]
    selector = RFE(estimator=LinearRegression(), n_features_to_select=2)
    selector.fit_transform(predictors, response)
    for fitted_attribute in (selector.support_, selector.ranking_):
        print(fitted_attribute)
def lin_svc_dir(finC_x, finC_y, finT_x, finT_y):
    """Cross-validate a LinearSVC on RFE-selected features from two feature sets.

    For every ``num`` in the module-level ``features_num``, ``num`` features
    are selected from each of the C and T feature sets, the two reduced
    matrices are placed side by side, and a 10-fold CV is run.  In each fold
    the best accuracy over the module-level ``cost_range`` (``num_costs``
    entries) is kept.

    Assumes the C and T inputs are row-aligned (same samples in the same
    order) - the column-wise concatenation only makes sense in that case.

    Args:
        finC_x, finC_y: Features and labels of the C set.
        finT_x, finT_y: Features and labels of the T set.

    Returns:
        List with the mean CV accuracy for each entry of ``features_num``.
    """
    K = 10
    kf = KFold(n_splits=K)
    best_acc = []
    for num in features_num:
        print('selected num of features: ', num)
        # Split all four inputs in ONE call so the same rows are drawn from
        # C and T.  Two independent random splits paired unrelated rows when
        # the reduced matrices were concatenated.
        (dataC_x_train, _, dataC_y_train, _,
         dataT_x_train, _, dataT_y_train, _) = train_test_split(
            finC_x, finC_y, finT_x, finT_y, test_size=0.1)

        selector_c = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_c = selector_c.fit_transform(dataC_x_train, np.ravel(dataC_y_train))
        selector_t = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_t = selector_t.fit_transform(dataT_x_train, np.ravel(dataT_y_train))
        new_x = pd.concat([pd.DataFrame(new_x_c), pd.DataFrame(new_x_t)], axis=1)

        # Labels must come from the same shuffled subset as the rows of
        # new_x; indexing the unsplit finC_y attached the wrong labels.
        labels = np.ravel(dataC_y_train)

        accur_list = []
        for train_index, test_index in kf.split(new_x):
            data_x_train, data_x_test = new_x.values[train_index], new_x.values[test_index]
            data_y_train, data_y_test = labels[train_index], labels[test_index]
            accur = np.zeros(num_costs)
            for i in range(num_costs):
                model = LinearSVC(C=cost_range[i])
                model.fit(data_x_train, data_y_train)
                pred = model.predict(data_x_test)
                accur[i] = accuracy_score(data_y_test, pred)
            accur_list.append(np.max(accur))

        cv_accur = np.sum(accur_list) / K
        cv_sd = np.std(accur_list)
        print('Accuracy = ', cv_accur, 'std = ', cv_sd)
        best_acc.append(cv_accur)
    return best_acc
def rfe_select(self):
    """Return the names of the features chosen by RFE with a linear SVC.

    Fits ``RFE`` on ``self.X_std`` / ``self.y``, maps each entry of
    ``self.feature_names`` to its rank, and returns the first
    ``self.select_feature_num`` names in ascending rank order.
    """
    # Recursive feature elimination.  A linear-kernel SVC is used here; any
    # other linear classifier works as well, and a regressor is needed for
    # regression problems.
    selector = RFE(estimator=LinearSVC(),
                   n_features_to_select=self.select_feature_num)
    selector.fit_transform(self.X_std, self.y)
    # Alternatively, selector.get_support() returns the selection directly.
    rank_by_name = dict(zip(self.feature_names, selector.ranking_))
    by_rank = sorted(rank_by_name.items(), key=lambda item: item[1])
    ordered_names = [name for name, _ in by_rank]
    return ordered_names[:self.select_feature_num]
def ref_(file):
    """Rank the features of a CSV file with RFE under Complement Naive Bayes.

    Columns containing missing values are dropped.  The first remaining
    column is the target and the rest are min-max scaled features.

    Only the ComplementNB ranking is computed.  The first four slots of the
    result are kept as empty lists so the return shape stays a 5-tuple; the
    estimators that once filled them (LinearSVC, LogisticRegression,
    RandomForestClassifier, GradientBoostingClassifier) were disabled.

    Args:
        file: Path (or buffer) accepted by ``pd.read_csv``.

    Returns:
        Tuple of five lists; the fifth holds the feature names in ascending
        RFE rank order, the other four are empty.
    """
    dataset = pd.read_csv(file, engine='python').dropna(axis=1)
    features_name = dataset.columns.values.tolist()
    dataset = np.array(dataset)
    X = dataset[:, 1:]
    y = dataset[:, 0]
    X = MinMaxScaler().fit_transform(X)

    selector = RFE(estimator=ComplementNB(), n_features_to_select=1)
    selector.fit(X, y)
    # ranking_ holds integer ranks, so no rounding is needed before sorting.
    result5 = sorted(zip(selector.ranking_, features_name[1:]))
    return ([], [], [], [], [name for _, name in result5])
def choice_feature_nums(data_x, data_y, col_name):
    """Plot and return the RFE score for each feature count from 3 upward.

    For every count ``i`` in ``3 .. len(col_name)`` RFE with linear
    regression is fitted and scored on the same data it was fitted on.

    Args:
        data_x: Feature matrix.
        data_y: Target values.
        col_name: Sequence whose length gives the number of features.

    Returns:
        Dict mapping feature count to score.
    """
    n = len(col_name)
    dic = {}
    for i in range(3, n + 1):
        # RFE needs an estimator *instance*; the class object itself was
        # passed before, which cannot be cloned or fitted.
        rfe = RFE(estimator=LinearRegression(), n_features_to_select=i)
        rfe.fit(data_x, data_y)
        # score() requires the data to evaluate on.
        dic[i] = rfe.score(data_x, data_y)
    plt.xlabel('feature_num')
    plt.ylabel('score')
    # Lists rather than dict views, which matplotlib cannot plot directly.
    plt.plot(list(dic.keys()), list(dic.values()))
    plt.show()
    return dic
def fit(self, X, y):
    '''
    Find the best-scoring feature subset with RFE + linear regression and
    store it in ``self.relevant_features``.

    Feature counts from 1 to ``X.shape[1] - 1`` are scored on a single
    random train/test split of size ``self.test_size``; RFE is then re-run
    on the full data with the winning count.

    Inputs:
    -------
    X: a dataframe
    y: a series
    '''
    # One split shared by all candidate counts.  Drawing a fresh random
    # split per count made the scores incomparable with each other.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=self.test_size)

    # Start below any achievable score so a count is always chosen, even
    # when every score is <= 0 (otherwise nof stays 0 and the final RFE
    # call fails).
    high_score = float("-inf")
    nof = 0  # feature count with the best score so far
    for n_features in range(1, X.shape[1]):
        model = LinearRegression()
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features
    print("Optimum number of features: %d" % nof)
    print("Score with %d features: %f" % (nof, high_score))

    cols = list(X.columns)
    rfe = RFE(LinearRegression(), n_features_to_select=nof)
    rfe.fit(X, y)
    temp = pd.Series(rfe.support_, index=cols)
    self.relevant_features = temp[temp].index.values
def rfe_function(data, y_col, k):
    """Return the names of the ``k`` features selected by RFE with Ridge.

    Args:
        data: DataFrame containing the features and the target column.
        y_col: Name of the target column (cast to float before fitting).
        k: Number of features to keep.

    Returns:
        List of the selected column names, in the order of
        ``data.columns.difference([y_col])``.
    """
    X = data[data.columns.difference([y_col])]
    y = data[y_col].astype('float')
    # NOTE(review): Ridge's ``normalize`` argument is rejected by newer
    # scikit-learn releases - confirm the pinned version.
    lr = Ridge(alpha=100000, fit_intercept=True, normalize=True,
               copy_X=True, max_iter=1500, tol=1e-4, solver='auto')
    rfe = RFE(estimator=lr, n_features_to_select=k)
    rfe.fit(X, y)
    # Selected features have ranking_ == 1 (support_ == True).  Sorting the
    # ranking in descending order, as was done before, returned the features
    # RFE eliminated first instead of the ones it kept.
    return X.columns[rfe.support_].to_list()
def get_selected_features(data, target, method="rfe", n_components=5, threshold=0.1):
    """Reduce ``data`` with one of three feature-selection methods.

    Args:
        data: Feature matrix.
        target: Target labels (unused for ``method="vt"``).
        method: ``"rfe"`` for recursive feature elimination with a decision
            tree, ``"vt"`` for a variance threshold; any other value uses
            ``SelectKBest`` with the chi-squared score.
        n_components: Number of features to keep (RFE / SelectKBest).
        threshold: Variance threshold (``"vt"`` only).

    Returns:
        ``pd.DataFrame`` wrapping the transformed feature matrix.
    """
    if method == "rfe":
        # n_features_to_select must be passed by keyword in current
        # scikit-learn releases.
        selector = RFE(DecisionTreeClassifier(),
                       n_features_to_select=n_components, step=1)
        result = selector.fit_transform(data, target)
    elif method == "vt":
        result = VarianceThreshold(threshold=threshold).fit_transform(data)
    else:
        result = SelectKBest(chi2, k=n_components).fit_transform(data, target)
    return pd.DataFrame(result)
def mlp_dir2(finC_x, finC_y, finT_x, finT_y):
    """Evaluate an MLP on RFE-selected features from two feature sets.

    For every ``num`` in the module-level ``features_num``, ``num`` features
    are selected from each of the C and T feature sets with RFE (LinearSVC),
    the reduced matrices are placed side by side, and an MLP is trained and
    scored on a 90/10 split of the result.  A classification report is
    printed for each ``num``.

    Assumes the C and T inputs are row-aligned (same samples in the same
    order) - the column-wise concatenation only makes sense in that case.

    Args:
        finC_x, finC_y: Features and labels of the C set.
        finT_x, finT_y: Features and labels of the T set.

    Returns:
        List with the test accuracy for each entry of ``features_num``.
    """
    accur = []
    for num in features_num:
        # Split all four inputs in ONE call so the same rows are drawn from
        # C and T.  Two independent random splits paired unrelated rows when
        # the reduced matrices were concatenated.
        (dataC_x_train, _, dataC_y_train, _,
         dataT_x_train, _, dataT_y_train, _) = train_test_split(
            finC_x, finC_y, finT_x, finT_y, test_size=0.1)

        selector_c = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_c = selector_c.fit_transform(dataC_x_train, np.ravel(dataC_y_train))
        selector_t = RFE(LinearSVC(), n_features_to_select=num, step=0.1)
        new_x_t = selector_t.fit_transform(dataT_x_train, np.ravel(dataT_y_train))
        new_x = pd.concat([pd.DataFrame(new_x_c), pd.DataFrame(new_x_t)], axis=1)

        print('selected num of features: ', num)
        data_x_train, data_x_test, data_y_train, data_y_test = train_test_split(
            new_x, dataC_y_train, test_size=0.1)
        mlp = MLPClassifier(hidden_layer_sizes=(60, ),
                            activation='logistic',
                            solver='lbfgs',
                            learning_rate_init=0.0001,
                            max_iter=1500,
                            alpha=0.001)
        mlp.fit(data_x_train, np.ravel(data_y_train))
        y_pred = mlp.predict(data_x_test)
        accur.append(accuracy_score(data_y_test, y_pred))
        print(classification_report(data_y_test, y_pred))
    return accur
def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    """Fit RFE on the training data and apply the same selection to the test data.

    Args:
        classifier_model: Estimator used by RFE to rank the features.
        train_data: Training feature matrix.
        train_labels: Training labels.
        test_data: Test feature matrix with the same columns as
            ``train_data``.
        number_of_features: Number of features to keep.

    Returns:
        Tuple ``(transformed_train_data, transformed_test_data)``.
    """
    # n_features_to_select must be passed by keyword in current
    # scikit-learn releases.
    rfe = RFE(classifier_model, n_features_to_select=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data
def run_once(df_feature, df_label):
    """Train, evaluate and persist a logistic regression on RFE-selected features.

    The features are one-hot encoded, split 70/30 (``random_state=99``), the
    training part is over-sampled, and RFE keeps the module-level
    ``best_nof_feature`` columns.  Class-1 probabilities on the test part
    are thresholded at the module-level ``lg_threshold`` and reported via
    ``get_accuracy``.  The fitted model and the selected column names are
    dumped with joblib under ``root_folder``.

    Args:
        df_feature: Raw feature DataFrame.
        df_label: Label Series.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    lg_regression = linear_model.LogisticRegression(solver='lbfgs')
    # n_features_to_select must be passed by keyword in current
    # scikit-learn releases.
    rfe = RFE(lg_regression, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    lg_regression.fit(rfe_train_x, train_y)
    labels = df_label.unique()

    # predict probs: column 1 holds the probability of the second class
    test_y_predict_prob = lg_regression.predict_proba(rfe_test_x)[:, 1]
    prob_df = pd.DataFrame(test_y_predict_prob)
    prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
    get_accuracy("logistic regression predict_probs", test_y, prob_df['predict'], labels)

    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)

    # dump model
    joblib.dump(lg_regression, root_folder + "lg_regression.pkl")
    save_print("lg_regression Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "lg_regression_cols.pkl")
    save_print("lg_regression models columns dumped!")
def run_once(df_feature, df_label):
    """Train, evaluate and persist a decision tree on RFE-selected features.

    The features are one-hot encoded, split 70/30 (``random_state=99``), the
    training part is over-sampled, and RFE keeps the module-level
    ``best_nof_feature`` columns.  Test predictions are reported via
    ``get_accuracy``.  The fitted tree and the selected column names are
    dumped with joblib under ``root_folder``.

    Args:
        df_feature: Raw feature DataFrame.
        df_label: Label Series.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    # build model
    dc_tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=20, random_state=99)
    # n_features_to_select must be passed by keyword in current
    # scikit-learn releases.
    rfe = RFE(dc_tree, n_features_to_select=best_nof_feature)
    rfe_train_x = rfe.fit_transform(train_x, train_y)
    rfe_test_x = rfe.transform(test_x)
    dc_tree.fit(rfe_train_x, train_y)
    labels = df_label.unique()

    # predict
    test_y_predict = dc_tree.predict(rfe_test_x)
    get_accuracy("decision tree", test_y, test_y_predict, labels)

    # print features
    cols = list(df_ohe.columns)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index
    save_print("Top " + str(best_nof_feature) + " features are: ")
    save_print(selected_features_rfe)

    # dump model
    joblib.dump(dc_tree, root_folder + "dc_tree.pkl")
    save_print("dc_tree Model dumped!")
    joblib.dump(selected_features_rfe, root_folder + "dc_tree_cols.pkl")
    save_print("dc_tree models columns dumped!")
def optimal_number_of_features(X_train, y_train, X_test, y_test):
    '''
    optimal_number_of_features(X_train, y_train, X_test, y_test)
    RETURNS: number_of_features

    Discover the best number of features, n, using recursive feature
    elimination with linear regression.  Every count from 1 to
    ``X_train.shape[1] - 1`` is fitted on the training data and scored on
    the test data; the count with the highest score is returned (0 when no
    count reaches a positive score).

    The result is meant as input to optimal_features, which runs RFE again
    to find the n best features.

    Shamelessly stolen from David Espinola
    '''
    number_of_attributes = X_train.shape[1]
    high_score = 0
    number_of_features = 0  # feature count with the best score so far
    for n_features in range(1, number_of_attributes):
        model = LinearRegression()
        # n_features_to_select must be passed by keyword in current
        # scikit-learn releases.
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            number_of_features = n_features
    return number_of_features
def run_rfe(df_feature, df_label):
    """Plot class-1 precision/recall of a decision tree against the RFE feature count.

    The features are one-hot encoded, split 70/30 (``random_state=99``) and
    the training part is over-sampled.  For every feature count from 1 to
    the module-level ``max_feature_try_numbers`` a decision tree is trained
    on the RFE-selected columns and evaluated with ``get_accuracy``; the
    collected precision and recall values are passed to ``plot_pre_recall``.

    Args:
        df_feature: Raw feature DataFrame.
        df_label: Label Series.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    labels = df_label.unique()  # does not depend on the feature count
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        dc_tree = DecisionTreeClassifier(criterion='entropy', min_samples_split=20, random_state=99)
        # n_features_to_select must be passed by keyword in current
        # scikit-learn releases.
        rfe = RFE(dc_tree, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        dc_tree.fit(rfe_train_x, train_y)

        # predict
        test_y_predict = dc_tree.predict(rfe_test_x)
        class_1_precision, class_1_recall = get_accuracy(
            "decision tree", test_y, test_y_predict, labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list, 'decision tree')
def optimal_features(X_train, y_train, number_of_features):
    '''
    optimal_features(X_train, y_train, number_of_features)
    RETURNS: selected_features_rfe

    Run recursive feature elimination with linear regression to find the
    ``number_of_features`` best features (typically the output of
    optimal_number_of_features).

    Returns a pandas Index with the names of the selected columns.

    Shamelessly stolen from David Espinola
    '''
    cols = list(X_train.columns)
    # n_features_to_select must be passed by keyword in current
    # scikit-learn releases.
    rfe = RFE(LinearRegression(), n_features_to_select=number_of_features)
    # Only the support mask is needed, so the extra model fit on the reduced
    # data (whose result was never used) has been dropped.
    rfe.fit(X_train, y_train)
    temp = pd.Series(rfe.support_, index=cols)
    selected_features_rfe = temp[temp].index
    return selected_features_rfe
def run_rfe(df_feature, df_label):
    """Plot class-1 precision/recall of a logistic regression against the RFE feature count.

    The features are one-hot encoded, split 70/30 (``random_state=99``) and
    the training part is over-sampled.  For every feature count from 1 to
    the module-level ``max_feature_try_numbers`` a logistic regression is
    trained on the RFE-selected columns; its class-1 probabilities are
    thresholded at the module-level ``lg_threshold`` and evaluated with
    ``get_accuracy``.  The collected precision and recall values are passed
    to ``plot_pre_recall``.

    Args:
        df_feature: Raw feature DataFrame.
        df_label: Label Series.
    """
    df_ohe = dynamic_get_dummies(df_feature)
    train_x, test_x, train_y, test_y = train_test_split(
        df_ohe, df_label, test_size=0.3, random_state=99)
    # over sample
    train_x, train_y = over_sample(train_x, train_y)

    nof_list = np.arange(1, (max_feature_try_numbers + 1))
    labels = df_label.unique()  # does not depend on the feature count
    class_1_precision_list = []
    class_1_recall_list = []
    for nof in nof_list:
        save_print("********Current nof features are: " + str(nof))
        lg_regression = linear_model.LogisticRegression(solver='lbfgs')
        # n_features_to_select must be passed by keyword in current
        # scikit-learn releases.
        rfe = RFE(lg_regression, n_features_to_select=nof)
        rfe_train_x = rfe.fit_transform(train_x, train_y)
        rfe_test_x = rfe.transform(test_x)
        lg_regression.fit(rfe_train_x, train_y)

        # predict probs: column 1 holds the probability of the second class
        test_y_predict_prob = lg_regression.predict_proba(rfe_test_x)[:, 1]
        prob_df = pd.DataFrame(test_y_predict_prob)
        prob_df['predict'] = np.where(prob_df[0] >= lg_threshold, 1, 0)
        class_1_precision, class_1_recall = get_accuracy(
            "logistic regression predict_probs", test_y, prob_df['predict'], labels)
        class_1_precision_list.append(class_1_precision)
        class_1_recall_list.append(class_1_recall)
    plot_pre_recall(nof_list, class_1_precision_list, class_1_recall_list, 'logistic regression')
def selectKBestFeatures( num_of_features, features_array, class_assignment):
    """Reduce a feature array to ``num_of_features`` columns with RFE (linear SVR).

    Both inputs are converted to float NumPy arrays before fitting.  The
    selector, the width of the reduced array and the feature ranking are
    printed for inspection.

    Args:
        num_of_features: Number of features to keep.
        features_array: 2-D array-like of feature values.
        class_assignment: 1-D array-like of targets.

    Returns:
        NumPy array holding only the selected feature columns.
    """
    features_array = np.array(features_array).astype(float)
    class_assignment = np.array(class_assignment).astype(float)
    estimator = SVR(kernel="linear")
    # n_features_to_select must be passed by keyword in current
    # scikit-learn releases.
    selectorRFE = RFE(estimator, n_features_to_select=num_of_features, step=1)
    selected_features = selectorRFE.fit_transform(features_array, class_assignment)
    print("first sel", selectorRFE)
    print("RFE selector", len(selected_features[0]))
    print("selected features", selectorRFE.ranking_)
    return selected_features
def optimal_number_of_features(X, y):
    '''Discover the best number of features, n, using recursive feature
    elimination with linear regression.

    Every count from 1 to ``X_train.shape[1] - 1`` is fitted on the training
    data and scored on the test data; the count with the highest score is
    returned (0 when no count reaches a positive score).  The result is
    meant as input to optimal_features, which runs RFE again to find the n
    best features.

    NOTE(review): the ``X`` and ``y`` parameters are not used.  The data is
    read from the module-level names ``X_train``, ``y_train``, ``X_test``
    and ``y_test`` - confirm whether that is intended.
    '''
    number_of_attributes = X_train.shape[1]
    high_score = 0
    number_of_features = 0  # feature count with the best score so far
    for n_features in range(1, number_of_attributes):
        model = LinearRegression()
        # n_features_to_select must be passed by keyword in current
        # scikit-learn releases.
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            number_of_features = n_features
    return number_of_features
def RFE_nof(df, target, normalize):
    """Return the feature count that scores best under RFE + linear regression.

    Counts from 1 to ``number_of_predictors - 1`` are scored on a fixed
    70/30 split (``random_state=0``).

    Args:
        df: DataFrame containing the predictors and the target column.
        target: Name of the target column.
        normalize: Forwarded to ``LinearRegression(normalize=...)``.
            NOTE(review): newer scikit-learn releases reject this argument -
            confirm the pinned version.

    Returns:
        The best feature count, or 0 when no count reaches a positive score.
    """
    y = df[target]
    # Keyword form: the positional ``axis`` argument of drop() is not
    # accepted by current pandas.
    X = df.drop(columns=target)

    # Fixed random_state: same split on every iteration, so compute it once.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0)

    high_score = 0
    nof = 0  # feature count with the best score so far
    for n_features in range(1, len(X.columns)):
        model = LinearRegression(copy_X=True,
                                 fit_intercept=True,
                                 n_jobs=None,
                                 normalize=normalize)
        rfe = RFE(model, n_features_to_select=n_features)
        X_train_rfe = rfe.fit_transform(X_train, y_train)
        X_test_rfe = rfe.transform(X_test)
        model.fit(X_train_rfe, y_train)
        score = model.score(X_test_rfe, y_test)
        if score > high_score:
            high_score = score
            nof = n_features
    return nof
class LogReg:
    """
    Text classifier: TF-IDF features, reduced with RFE, fed to a model.

    Initialization sets the object's model, vectorizer, selector, labels,
    and corpus attributes.  It also fits the vectorizer, the RFE selector
    and the model on the given reviews.

    Each review is expected to be indexable as ``review[0]`` (label) and
    ``review[1]["text"]`` (document text).
    """

    def __init__(self, reviews, vectorizer=None, model=None):
        # Build the default estimators per instance.  Estimator objects used
        # as default argument values are created once and then shared (and
        # refitted) by every LogReg that relies on the defaults.
        if vectorizer is None:
            # NOTE(review): max_df=1 is an absolute document count, not a
            # proportion - confirm 1.0 was not intended.
            vectorizer = TfidfVectorizer(stop_words='english',
                                         max_df=1,
                                         ngram_range=(1, 2))
        if model is None:
            model = LogisticRegression()

        self.model = model
        self.vectorizer = vectorizer
        self.selector = RFE(self.model, step=100, verbose=100)

        # setting variables for the object
        self.corpus = [review[1]["text"] for review in reviews]
        self.labels = [review[0] for review in reviews]
        self.reviews = reviews

        X = self.vectorizer.fit_transform(self.corpus)
        # get_feature_names() is absent from newer scikit-learn releases;
        # prefer its replacement when available.
        if hasattr(self.vectorizer, "get_feature_names_out"):
            self.feature_names = list(self.vectorizer.get_feature_names_out())
        else:
            self.feature_names = self.vectorizer.get_feature_names()
        for string in self.feature_names:
            print(string.encode("ascii", 'ignore'))

        # Training the model
        X_new = self.selector.fit_transform(X, self.labels)
        self.model.fit(X_new, self.labels)

        # Names of the columns that survived RFE, in the order of X_new's
        # columns (and therefore of the model's coefficients).
        self.selected_feature_names = [
            name
            for name, keep in zip(self.feature_names, self.selector.get_support())
            if keep
        ]

    def classify_all(self, all_test_data):
        """Predict labels for ``all_test_data`` and print top features per season.

        Assumes the model was trained on four classes, one coefficient row
        per entry of the hard-coded category list.

        Returns the array of predictions from ``self.model.predict``.
        """
        test_corpus = [review[1]['text'] for review in all_test_data]

        # Used transform instead of fit_transform
        # for test data so number of features will match
        X = self.vectorizer.transform(test_corpus)
        X_new = self.selector.transform(X)
        results = self.model.predict(X_new)

        categories = ["spring", "summer", "fall", "winter"]
        for i, category in enumerate(categories):
            top_indices = np.argsort(self.model.coef_[i])[-20:]
            for j in top_indices:
                # The model was fitted on the RFE-reduced matrix, so j
                # indexes the selected features, not the full vocabulary.
                print("%s: %s" % (category, "".join(self.selected_feature_names[j])))
        return results
def feat3(matrix):
    """Return the indices of the 5 features selected by RFE with a linear SVC.

    The last column of ``matrix`` is the class label (converted with
    ``transform_to_int``); all other columns are features and are converted
    to float.

    Args:
        matrix: Sequence of equally long rows (lists or tuples).

    Returns:
        List of the selected feature column indices.
    """
    last_index = len(matrix[0]) - 1
    last_column = [row[last_index] for row in matrix]
    data_class = transform_to_int(last_column, matrix[0][last_index])

    # Build the feature rows eagerly: on Python 3 ``map`` returns an
    # iterator that NumPy cannot turn into a 2-D array.  The builtin
    # ``float`` replaces ``np.float``, which NumPy no longer provides.
    feature_rows = [list(row[:last_index]) for row in matrix]
    data = np.asarray(feature_rows).astype(float)

    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=5, step=1)
    rfe.fit(data, data_class)
    # indices=True asks for integer positions instead of a boolean mask; the
    # original relied on passing an arbitrary truthy object here.
    return list(rfe.get_support(indices=True))
def train_logistic_regression(
        feats = None,
        labels = [],
        feature_selector=SelectFpr(chi2, alpha=0.05), # Ignored: always replaced below
        cv=5, # Number of folds used in cross-validation
        priorlims=np.arange(.1, 3.1, .1), # regularization priors to explore (we expect something around 1)
        feature_elim = True): # Run RFE feature elimination before training
    """Vectorize, optionally run RFE, grid-search and train a logistic regression.

    Args:
        feats: Iterable of feature-count dictionaries.
        labels: Labels aligned with ``feats``.
        feature_selector: Accepted for interface compatibility only; the
            value passed in is discarded.
        cv: Number of cross-validation folds.
        priorlims: Values of ``C`` to explore.
        feature_elim: When True, reduce the features with RFE first.

    Returns:
        Tuple ``(mod, vectorizer, feature_selector)``: the trained model,
        the fitted ``DictVectorizer`` and the fitted RFE selector (``None``
        when ``feature_elim`` is False).
    """
    # Map the count dictionaries to a dense feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    feats = vectorizer.fit_transform(feats)

    ##### FEATURE SELECTION
    feat_matrix = feats
    feature_selector = None
    if feature_elim == True:
        feature_selector = RFE(estimator=LogisticRegression(), n_features_to_select=None, step=1, verbose=0)
        feat_matrix = feature_selector.fit_transform(feats, labels)

    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = LogisticRegression(fit_intercept=True, intercept_scaling=1, verbose=1, solver='lbfgs', max_iter=2000)
    # Parameters to grid-search over:
    # NOTE(review): 'l1' together with solver='lbfgs' looks unsupported by
    # scikit-learn - confirm how these grid points are handled.
    parameters = {'C':priorlims, 'penalty':['l1', 'l2'], 'multi_class':['multinomial', 'ovr']}
    # Cross-validation grid search to find the best hyper-parameters:
    clf = GridSearchCV(searchmod, parameters, cv=cv)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = LogisticRegression(fit_intercept=True, intercept_scaling=1, C=params['C'], penalty=params['penalty'], multi_class=params['multi_class'], solver='lbfgs', verbose=1, max_iter=200)

    ##### ASSESSMENT
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
    # Single-argument print calls produce the same output on Python 2 and 3.
    print('Best model %s' % (mod,))
    print('%s features selected out of %s total' % (feat_matrix.shape[1], feats.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)

    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector)
def train_NB(
        feats=None,
        labels=None,
        feature_selector=SelectFpr(chi2, alpha=0.05),  # Ignored, see docstring
        cv=5,  # Number of folds used in cross-validation
        priorlims=np.arange(.1, 2.0, .5)):  # alphas to explore (we expect something around 1)
    """Train a multinomial naive-Bayes classifier on RFE-selected features.

    Args:
        feats: Iterable of feature-count dictionaries, one per example.
        labels: Class labels aligned with ``feats``.
        feature_selector: Accepted for backward compatibility only. The value
            is always replaced by an ``RFE`` selector.
        cv: Number of cross-validation folds.
        priorlims: Candidate values for the smoothing parameter ``alpha``.

    Returns:
        tuple: ``(mod, vectorizer, feature_selector)`` - the fitted model, the
        fitted ``DictVectorizer`` and the fitted ``RFE`` selector.
    """
    if labels is None:
        labels = []

    # Map the count dictionaries to a dense feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    feats = vectorizer.fit_transform(feats)

    ##### FEATURE SELECTION
    # NOTE(review): RFE ranks features through the estimator's coefficients
    # or importances - confirm the installed scikit-learn's MultinomialNB
    # still exposes one of them.
    feature_selector = RFE(estimator=MultinomialNB(),
                           n_features_to_select=None, step=1, verbose=0)
    feat_matrix = feature_selector.fit_transform(feats, labels)

    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = MultinomialNB()
    # Parameters to grid-search over:
    parameters = {'alpha': priorlims}
    # Cross-validation grid search to find the best hyper-parameters:
    clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = MultinomialNB(alpha=params['alpha'])

    ##### ASSESSMENT
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Best model %s' % (mod,))
    print('%s features selected out of %s total' % (feat_matrix.shape[1], feats.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)

    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector)
def train_DT(
        feats=None,
        labels=None,
        feature_selector=SelectFpr(chi2, alpha=0.05),  # Ignored, see docstring
        cv=5):  # Number of folds used in cross-validation
    """Train a decision-tree classifier on RFE-selected features.

    Args:
        feats: Iterable of feature-count dictionaries, one per example.
        labels: Class labels aligned with ``feats``.
        feature_selector: Accepted for backward compatibility only. The value
            is always replaced by an ``RFE`` selector.
        cv: Number of cross-validation folds.

    Returns:
        tuple: ``(mod, vectorizer, feature_selector)`` - the fitted model, the
        fitted ``DictVectorizer`` and the fitted ``RFE`` selector.
    """
    if labels is None:
        labels = []

    # Map the count dictionaries to a dense feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    feats = vectorizer.fit_transform(feats)

    ##### FEATURE SELECTION
    # The elimination is driven by MultinomialNB, not by the decision tree.
    # NOTE(review): possibly carried over from train_NB - confirm intent.
    feature_selector = RFE(estimator=MultinomialNB(),
                           n_features_to_select=None, step=1, verbose=0)
    feat_matrix = feature_selector.fit_transform(feats, labels)

    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = DecisionTreeClassifier()
    # Parameters to grid-search over:
    parameters = {
        'splitter': ['best', 'random'],
        'max_features': ['sqrt', 0.25, 'log2'],
        'min_samples_split': [2, 5, 10],
    }
    # Cross-validation grid search to find the best hyper-parameters:
    clf = GridSearchCV(searchmod, parameters, cv=cv, n_jobs=-1)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = DecisionTreeClassifier(splitter=params['splitter'],
                                 max_features=params['max_features'],
                                 min_samples_split=params['min_samples_split'])

    ##### ASSESSMENT
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('Best model %s' % (mod,))
    print('%s features selected out of %s total' % (feat_matrix.shape[1], feats.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)

    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector)
# -*- coding: utf-8 -*- import pandas from sklearn.feature_selection import RFE from sklearn.linear_model import LinearRegression data = pandas.read_csv('D:\\PDM\\6.2\\data2.csv') feature = data[['月份', '季度', '广告费用', '客流量']] rfe = RFE( estimator=LinearRegression(), n_features_to_select=2 ) sFeature = rfe.fit_transform( feature, data['销售额'] ) rfe.get_support()
def featureSelection(parameter,numberOfFeatures):
    """Run RFE on the ASM or BYTE data set and write the reduced data to disk.

    Reads the training and test CSV files chosen by ``parameter``, fits an
    ``RFE`` selector driven by logistic regression on the training rows,
    keeps the same columns of the test rows and writes the results to
    ``RFE_*.ASM`` or ``RFE_*.BYTE`` files in the working directory.

    The module-level lists ``train``, ``label``, ``test``, ``testFileList``
    and ``newTest`` are appended to, and the globals ``newTrain`` and ``l``
    are rebound.

    Args:
        parameter: ``'ASM'`` (case-insensitive) selects the ASM files; any
            other value selects the BYTE files.
        numberOfFeatures: Number of features to keep; converted with ``int``.
    """
    global l,newTrain,newTest,explained_train_var_ratio,explained_test_var_ratio,label,testFileList

    is_asm = parameter.upper() == 'ASM'
    if is_asm:
        trainFile = trainFileASM
        testFile = testFileASM
    else:
        trainFile = trainFileBYTE
        testFile = testFileBYTE

    # Training file layout: first column is skipped, last column is the label.
    with open(trainFile) as f:
        header = f.readline().split(',')
        length = len(header)
        for line in f:
            token = line.split(',')
            l = len(token)
            token = [w.replace('\n', '') for w in token]
            if len(token) > length:
                # Rows wider than the header are skipped.
                continue
            train.append(token[1:-1])
            label.append(token[-1])

    # Test file layout: first column is the file name, the rest are features.
    with open(testFile) as f:
        f.readline()
        for line in f:
            token = [w.replace('\n', '') for w in line.split(',')]
            testFileList.append(token[0])
            test.append(token[1:])

    model = LogisticRegression()
    # n_features_to_select is passed by keyword; recent scikit-learn
    # releases reject it as a positional argument.
    rfe = RFE(model, n_features_to_select=int(numberOfFeatures))
    X = np.array(train)
    Y = np.array(label)
    newTrain = rfe.fit_transform(X, Y)
    print("New Train : ")
    print(newTrain)
    print("get_support")
    support = rfe.support_
    print(support)

    # Apply the same column selection to the raw test rows.
    indexList = [i for i, keep in enumerate(support) if keep]
    for sublist in test:
        newTest.append([sublist[index] for index in indexList])
    print("New Test")
    print(newTest)

    if is_asm:
        writeTrainFile = 'RFE_TRAIN.ASM'
        writeTestFile = 'RFE_TEST.ASM'
        writeTrainLabel = 'RFE_LABEL.ASM'
        writeTestFileList = 'RFE_TESTFILELIST.ASM'
    else:
        writeTrainFile = 'RFE_TRAIN.BYTE'
        writeTestFile = 'RFE_TEST.BYTE'
        writeTrainLabel = 'RFE_LABEL.BYTE'
        writeTestFileList = 'RFE_TESTFILELIST.BYTE'

    print("***** Transormed train data *****")
    with open(writeTrainFile, 'w') as fp:
        csv.writer(fp, delimiter=',').writerows(newTrain)

    print("***** Transormed test data *****")
    with open(writeTestFile, 'w') as fp:
        csv.writer(fp, delimiter=',').writerows(newTest)

    print("******* Writting Train Labels *********")
    with open(writeTrainLabel, 'w') as fp:
        # Wrap every label in its own row; handing the bare strings to
        # writerows() would split a multi-character label into one column
        # per character.
        csv.writer(fp, delimiter=',').writerows([entry] for entry in label)

    print("****** Writting Test File Names ******")
    with open(writeTestFileList, 'w') as fp:
        for entry in testFileList:
            fp.write(entry+"\n")
def RFE(self,estimator,k):
    """Fit recursive feature elimination on ``self.X`` and ``self.Y``.

    Args:
        estimator: Estimator handed to the selector to rank the features.
        k: Number of features to keep.

    Returns:
        tuple: ``(selector, reduced)`` - the fitted selector and the
        transformed feature matrix.
    """
    # NOTE(review): presumably this is a method of a class, in which case the
    # ``RFE`` called below resolves to the module-level name rather than to
    # this method - confirm against the enclosing scope.
    features, target = self.X, self.Y
    selector = RFE(estimator, n_features_to_select=k)
    reduced = selector.fit_transform(features, target)
    return selector, reduced
def featureSelection():
    """Run RFE (1000 features) on the module-level train and test files.

    Reads ``trainFile`` and ``testFile``, fits an ``RFE`` selector driven by
    logistic regression on the training rows and keeps the same columns of
    the test rows.

    The module-level lists ``train``, ``label``, ``test``, ``testFileList``
    and ``newTest`` are appended to, and the globals ``newTrain`` and ``l``
    are rebound.
    """
    global l,newTrain,newTest,explained_train_var_ratio,explained_test_var_ratio,label,testFileList

    # Training file layout: first column is skipped, last column is the label.
    with open(trainFile) as f:
        header = f.readline().split(',')
        length = len(header)
        for line in f:
            token = line.split(',')
            l = len(token)
            token = [w.replace('\n', '') for w in token]
            if len(token) > length:
                # Rows wider than the header are skipped.
                continue
            train.append(token[1:-1])
            label.append(token[-1])

    # Test file layout: first column is the file name, the rest are features.
    with open(testFile) as f:
        f.readline()
        for line in f:
            token = [w.replace('\n', '') for w in line.split(',')]
            testFileList.append(token[0])
            test.append(token[1:])

    model = LogisticRegression()
    # n_features_to_select is passed by keyword; recent scikit-learn
    # releases reject it as a positional argument.
    rfe = RFE(model, n_features_to_select=1000)
    X = np.array(train)
    Y = np.array(label)
    newTrain = rfe.fit_transform(X, Y)
    print("New Train : ")
    print(newTrain)
    print("get_support")
    support = rfe.support_
    print(support)

    # Apply the same column selection to the raw test rows.
    indexList = [i for i, keep in enumerate(support) if keep]
    for sublist in test:
        newTest.append([sublist[index] for index in indexList])
    print("New Test")
    print(newTest)
def train_classifier(self, src_filename,
        feature_function=None,
        feature_selector=None,  # e.g. SelectFpr(chi2, alpha=0.05); None disables feature selection
        cv=8,  # Number of folds used in cross-validation
        priorlims=np.arange(.1, 4.0, .3),  # regularization priors to explore (TODO: these are arbitrary numbers!)
        use_rfe=False,
        param_search=False,
        print_model=False,
        aux_data=None,
        retrain=True):
    """Train a logistic-regression classifier and store it on ``self.model``.

    Note: this differs from the class implementation in that you pass in the
    filename to read, not the reader itself. That is more convenient, but
    less general if you want to use different file types.

    Args:
        src_filename: Path handed to ``util.binarized_transcript_reader``.
        feature_function: Function used to featurize the data; defaults to
            ``self._unigram_ft_fn``.
        feature_selector: Optional selector fitted on the training matrix.
            Replaced by an ``RFE`` selector when ``use_rfe`` is true.
        cv: Number of cross-validation folds.
        priorlims: Candidate values of ``C`` for the grid search.
        use_rfe: Select features with ``RFE`` around a logistic regression.
        param_search: Grid-search the hyper-parameters instead of using the
            fixed values recorded from an earlier search.
        print_model: Cross-validate the final model and print a summary.
        aux_data: Unused.
        retrain: When true, ``self.model`` is cleared before training.

    Returns:
        None. The result is stored in ``self.model`` as the tuple
        ``(mod, vectorizer, feature_selector, feature_function)``:
        the trained model, the fitted ``DictVectorizer``, the feature
        selector used on the training data (to reuse on test data) and the
        function used to featurize the training data.

    TODO: The following error arises from using too many cv folds:
        ValueError: zero-size array to reduction operation maximum which
        has no identity
    """
    if feature_function is None:
        feature_function = self._unigram_ft_fn

    if retrain:
        self.model = None

    reader = util.binarized_transcript_reader(src_filename)
    # Featurize the data:
    feats, labels = self._featurizer(reader=reader, feature_function=feature_function)

    # Map the count dictionaries to a sparse feature matrix:
    vectorizer = DictVectorizer(sparse=True)  # TODO this was false in the 224u code. No idea why.
    X = vectorizer.fit_transform(feats)

    # Define the basic model to use for parameter search:
    searchmod = LogisticRegression(fit_intercept=True, intercept_scaling=1, solver='lbfgs')

    ##### FEATURE SELECTION
    # (An optional step; not always productive.) No selection happens unless
    # a feature_selector is passed in or use_rfe is set.
    # sklearn.feature_selection has other methods that are worth trying.
    if use_rfe:
        feature_selector = RFE(estimator=searchmod, n_features_to_select=None, step=1, verbose=0)
    if feature_selector is not None:
        feat_matrix = feature_selector.fit_transform(X, labels)
    else:
        feat_matrix = X

    if param_search:
        ##### HYPER-PARAMETER SEARCH
        # Parameters to grid-search over. 'lbfgs' only supports the l2
        # penalty, so the l1 half of the grid is paired with 'saga', which
        # supports l1 for both 'ovr' and 'multinomial'.
        # TODO: actually take the time to search for good params
        parameters = [
            {'C': priorlims, 'penalty': ['l2'], 'solver': ['lbfgs'],
             'multi_class': ['ovr', 'multinomial']},
            {'C': priorlims, 'penalty': ['l1'], 'solver': ['saga'],
             'multi_class': ['ovr', 'multinomial']},
        ]
        # Cross-validation grid search to find the best hyper-parameters:
        clf = GridSearchCV(searchmod, parameters, cv=cv)
        print("searching for optimal hyperparameters...")
        clf.fit(feat_matrix, labels)
        print("whew, done with that grid search")
        params = clf.best_params_
    else:
        # Best model recorded from an earlier search:
        #   LogisticRegression(C=3.7000000000000006, class_weight=None,
        #       dual=False, fit_intercept=True, intercept_scaling=1,
        #       max_iter=100, multi_class='ovr', penalty='l1',
        #       random_state=None, solver='liblinear', tol=0.0001, verbose=0)
        # The solver is taken from that record: 'lbfgs' cannot fit an l1
        # penalty, so the previous hard-coded solver raised on this path.
        params = {'C': 3.7, 'penalty': 'l1', 'multi_class': 'ovr', 'solver': 'liblinear'}

    # Establish the model we want using the parameters obtained from the search:
    mod = LogisticRegression(fit_intercept=True, intercept_scaling=1,
                             C=params['C'], penalty=params['penalty'],
                             multi_class=params['multi_class'],
                             solver=params['solver'])

    ##### ASSESSMENT
    # Cross-validation of our favored model; for other summaries, use different
    # values for scoring: http://scikit-learn.org/dev/modules/model_evaluation.html
    if print_model:
        scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
        # Single-argument print calls behave the same on Python 2 and 3.
        print('Best model %s' % (mod,))
        print('%s features selected out of %s total' % (feat_matrix.shape[1], X.shape[1]))
        print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

    # TRAIN OUR MODEL:
    print("training model...")
    mod.fit(feat_matrix, labels)
    print("done with training, yeah")

    # Keep the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    self.model = (mod, vectorizer, feature_selector, feature_function)
def _progress_tick(done, total):
    """Write a progress dot roughly every tenth of ``total`` finished items."""
    import sys

    # Floor division with a minimum of 1: fewer than ten items must not
    # produce a modulo by zero, and the stride must stay an integer.
    stride = max(1, total // 10)
    if done % stride == 0:
        sys.stdout.write('. ')
        sys.stdout.flush()


def select_train_predict(X, Y, Z, feature_list, selection_method, estimator_method, n_features, selection_args, estimator_args):
    """Fit one model per target in ``Y`` and predict on ``Z``.

    Args:
        X: Training feature matrix.
        Y: Iterable of target vectors; one model is fitted per entry.
        Z: Feature matrix to predict on (same columns as ``X``).
        feature_list: Feature names; indexed with boolean masks, so it is
            expected to support NumPy-style mask indexing.
        selection_method: ``None``, ``'cluster'``, ``'rfe'``, ``'myrfe'`` or
            ``'kbest'``. Any other value yields empty results.
        estimator_method: Key into ``ESTIMATORS``.
        n_features: Number of features (or clusters) to keep. Capped at
            ``len(feature_list)`` unless the method is ``'2step_kbest'``.
        selection_args: Extra keyword arguments for the selector.
        estimator_args: Keyword arguments for the estimator. Not modified.

    Returns:
        tuple: ``(W, features)`` - the list of prediction vectors and, for
        the selecting methods, the list of selected feature names per target.
    """
    W = []
    features = []

    if selection_method != '2step_kbest':
        n_features = min(n_features, len(feature_list))

    if estimator_method == 'svm' and selection_method == 'rfe':
        # RFE needs linear coefficients. Work on a copy so the caller's
        # dictionary is left untouched.
        estimator_args = dict(estimator_args, kernel='linear')
    estimator = ESTIMATORS[estimator_method](**estimator_args)

    if selection_method == 'cluster':
        # NOTE(review): newer scikit-learn releases name this parameter
        # ``metric`` instead of ``affinity`` - confirm against the installed
        # version.
        agglom = FeatureAgglomeration(n_clusters=n_features, affinity='cosine', linkage='average')
        # FeatureAgglomeration does not provide fit_predict; the per-feature
        # cluster assignment is available as ``labels_`` after fitting.
        clusters = agglom.fit(X).labels_.tolist()
        # Keep the first feature of every cluster as its representative.
        sample = [clusters.index(i) for i in range(n_features)]
        X = X[:,sample]
        Z = Z[:,sample]
        selection_method = None

    total = len(Y)

    if selection_method is None:
        for i, y in enumerate(Y):
            estimator.fit(X, y)
            W.append(estimator.predict(Z))
            _progress_tick(i + 1, total)
    elif selection_method == 'rfe':
        selector = RFE(estimator=estimator, n_features_to_select=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector = selector.fit(X, y)
            features.append(feature_list[selector.support_])
            W.append(selector.predict(Z))
            _progress_tick(i + 1, total)
    elif selection_method == 'myrfe':
        selector = MyRFE(estimator=estimator, n_features=n_features, **selection_args)
        for i, y in enumerate(Y):
            selector.fit(X, y)
            features.append(feature_list[selector.support])
            W.append(selector.predict(Z))
            _progress_tick(i + 1, total)
    elif selection_method == 'kbest':
        selector = SelectKBest(f_regression, k=n_features, **selection_args)
        for i, y in enumerate(Y):
            X2 = selector.fit_transform(X, y)
            Z2 = selector.transform(Z)
            features.append(feature_list[selector.get_support()])
            estimator.fit(X2, y)
            W.append(estimator.predict(Z2))
            _progress_tick(i + 1, total)

    # Terminate the line of progress dots.
    print('')
    return W, features
# scores_f1 = cross_val_score(rf,X,y,n_jobs=-1,cv=StratifiedShuffleSplit(y,n_iter=10,test_size=0.22),scoring='f1') # print("X RF f1: %0.3f (+- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2)) # In[ ]: svc = LinearSVC(C=20, penalty='l1', dual=False) svc.fit(X, y) selected_feature_names = feature_cols[[list(set(np.where(svc.coef_ != 0)[-1]))]] X_svm = svc.transform(X) print("X_svm L1 transformed:", X_svm.shape) X=X_svm # In[ ]: rfeSelect = RFE(estimator=rf,n_features_to_select=10, step=0.15) X_RFE = rfeSelect.fit_transform(X,y) print(X_RFE.shape) # In[ ]: RFE_FeatureNames = feature_cols[rfeSelect.get_support()] print("RFE_FeatureNames: \n",RFE_FeatureNames) # In[ ]: "http://stackoverflow.com/questions/21548750/plotting-histograms-against-classes-in-pandas-matplotlib" for featName in RFE_FeatureNames: df.groupby("class").feature.hist(alpha=0.4) df.groupby("classname")[featName].plot(kind='kde')