def remove_one_feature(X, Y, names):
    """Return the name of the feature that RFE ranks worst.

    A linear regression is wrapped in recursive feature elimination down to
    a single feature. The sorted ``(rank, name)`` pairs are printed and the
    name of the last pair (highest rank number) is returned.
    """
    eliminator = RFE(LinearRegression(), n_features_to_select=1)
    eliminator.fit(X, Y)
    rounded_ranks = [round(value, 4) for value in eliminator.ranking_]
    rank = sorted(zip(rounded_ranks, names))
    print(rank)
    _, worst_name = rank[-1]
    return worst_name
def recursive_feature_elimination(config_learning, config_data, number_features):
    """Run RFE on the configured data, log feature ranks and predict the test set.

    The rank of every feature is written (tab separated, one per line) to
    ``feature_ranks.txt`` inside the directory named by the ``Learner/models``
    entry of ``config_data`` and echoed to stdout.

    :param config_learning: mapping providing ``x_train``, ``y_train``,
        ``x_test`` and optionally ``scale`` (defaults to True).
    :param config_data: config object read with ``get("Learner", "models")``.
    :param number_features: number of features RFE should keep.
    :return: predictions of the fitted RFE model on the test features.
    """
    ranks_path = os.path.join(
        os.path.expanduser(config_data.get("Learner", "models")),
        "feature_ranks.txt",
    )
    # Opened first (as before) so a bad output directory fails fast; the
    # context manager guarantees the handle is closed even if fitting raises.
    with open(ranks_path, "w") as output:
        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        if config_learning.get("scale", True):
            x_train, x_test = scale_datasets(x_train, x_test)

        # Passed by keyword: newer scikit-learn releases no longer accept the
        # feature count positionally.
        rfe = RFE(estimator, n_features_to_select=number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            line = name + "\t" + str(rfe.ranking_[i])
            output.write(line + "\n")
            print(line)

        predictions = rfe.predict(x_test)

    return predictions
def doRFE(self, X, y):
    """Rank all features with a linear-SVC RFE and store the importances.

    Records the column count of ``X`` in ``self.numFeatures`` and sets
    ``self.feature_importances_`` to ``self._getImportances`` applied to the
    RFE ranking.
    """
    self.numFeatures = X.shape[1]
    eliminator = RFE(
        estimator=SVC(kernel="linear", C=self.C),
        n_features_to_select=1,
        step=1,
    )
    eliminator.fit(X, y)
    ranking = eliminator.ranking_
    self.feature_importances_ = self._getImportances(ranking)
def get_model_RFE_top_features(self, expression_file, ic50_file, target_features, drug):
    """Return the index labels of the features RFE keeps for ``drug``.

    :param expression_file: passed through to the ``dfm`` loader.
    :param ic50_file: passed through to the ``dfm`` loader.
    :param target_features: number of features to keep (converted with ``int``).
    :param drug: drug identifier passed through to the ``dfm`` loader.
    :return: list of ``expression_frame.index`` labels whose RFE support flag
        is set.
    """
    expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)

    # Eliminate roughly 1% of the columns per iteration; ``//`` keeps the
    # integer arithmetic identical on Python 2 and 3.
    step_length = len(scikit_data.tolist()[0]) // 100 + 1

    selector = RFE(self.model, n_features_to_select=int(target_features), step=step_length)
    selector.fit(scikit_data, scikit_target)

    # enumerate() replaces the Python-2-only xrange() index loop.
    return [
        label
        for i, label in enumerate(expression_frame.index)
        if selector.support_[i]
    ]
def recursiveFeatureSelection():
    """Fit a linear-SVC RFE on the module-level training data and print it.

    Reads the globals ``trainingData`` and ``trainingDataLabels``.
    """
    X = np.array(trainingData, dtype=float)
    y = np.array(trainingDataLabels, dtype=float)
    # SVC's first positional parameter is C, so the former SVC("linear", 1)
    # bound C="linear" and kernel=1; name the arguments explicitly.
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=1, step=1)
    rfe.fit(X, y)
    print(rfe)
def featSelect(label,trainSet,trainObs,cv,numFeat=5,SEED=34,name=''):
    """Greedy feature search seeded by RFE and scored by cross-validated AUC.

    Starts from the columns RFE ranks 1, then (1) tries adding each further
    rank group, (2) tries dropping each kept column, (3) tries adding each
    column not yet kept. A change is retained whenever ``roc_auc_score`` of
    the out-of-fold probabilities exceeds the best AUC so far.

    Returns ``{label: vars}`` where ``vars`` is the final column list, and
    prints the final AUC.

    NOTE(review): SOURCE was whitespace-collapsed; the nesting below (AUC
    check after each inner ``cv`` loop) is reconstructed - confirm.
    NOTE(review): ``cv`` is iterated many times, so it presumably must be a
    re-iterable sequence of (train, validation) index pairs, not a one-shot
    generator - verify against callers.
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from numpy import zeros
    model = LogisticRegression(random_state=SEED)
    predCv = zeros(len(trainObs))
    rfe = RFE(model, numFeat, step=1)
    rfe.fit(trainSet,trainObs)
    # NOTE(review): ``vars`` shadows the builtin of the same name.
    vars = list(trainSet.columns[rfe.ranking_ == 1])
    auc = 0
    # Phase 1: try each rank group in turn.
    # NOTE(review): the range starts at 1, so the rank-1 columns already in
    # ``vars`` are offered again, and it stops before the largest rank.
    for i in range(1,max(rfe.ranking_)):
        for tr, vl in cv:
            model.fit(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[tr],trainObs[tr])
            predCv[vl] = model.predict_proba(trainSet[vars + list(trainSet.columns[rfe.ranking_ == i])].ix[vl])[:,1]
        if roc_auc_score(trainObs,predCv) > auc:
            auc = roc_auc_score(trainObs,predCv)
            vars += list(trainSet.columns[rfe.ranking_ == i])
    # Phase 2: try dropping each kept column.
    # NOTE(review): ``vars`` is mutated while being iterated, so the element
    # following a removed one is skipped.
    for v in vars:
        for tr, vl in cv:
            model.fit(trainSet[[x for x in vars if x != v]].ix[tr],trainObs[tr])
            predCv[vl] = model.predict_proba(trainSet[[x for x in vars if x != v]].ix[vl])[:,1]
        if roc_auc_score(trainObs,predCv) > auc:
            auc = roc_auc_score(trainObs,predCv)
            vars.remove(v)
    # Phase 3: try adding each column that is not currently kept (the
    # candidate list is built once, before this loop starts).
    for v in [x for x in trainSet.columns if x not in vars]:
        for tr, vl in cv:
            model.fit(trainSet[vars + [v]].ix[tr],trainObs[tr])
            predCv[vl] = model.predict_proba(trainSet[vars + [v]].ix[vl])[:,1]
        if roc_auc_score(trainObs,predCv) > auc:
            auc = roc_auc_score(trainObs,predCv)
            vars += [v]
    print name,"Final AUC: ",auc
    return {label: vars}
def build_model(x, y, no_features):
    """
    Fit recursive feature elimination around a linear regression.

    The regression is created with ``normalize=True`` and
    ``fit_intercept=True``; RFE keeps ``no_features`` features. The fitted
    RFE object is returned.
    """
    regressor = LinearRegression(fit_intercept=True, normalize=True)
    selector = RFE(n_features_to_select=no_features, estimator=regressor)
    selector.fit(x, y)
    return selector
def quick_rfe(estimator, X, y):
    """Rank the columns of ``X`` with RFE down to a single feature.

    Returns ``(sorted_features, ranking)``: the column names ordered by
    ascending RFE rank, and the raw ``ranking_`` array.
    """
    eliminator = RFE(n_features_to_select=1, estimator=estimator)
    eliminator.fit(X, y)
    column_names = X.columns.tolist()
    ranked_pairs = sorted(zip(eliminator.ranking_, column_names))
    sorted_features = [pair[1] for pair in ranked_pairs]
    return sorted_features, eliminator.ranking_
def recurrciveFE(self, data):
    """
    Use Recursive Feature Elimination to find the right number of features
    before adding more leads to overfitting.

    It works by recursively removing attributes and building a model on the
    attributes that remain, using cross-validated accuracy to pick how many
    features to keep.

    Parameters
    ----------
    data : DataFrame
        Input data; the response is read from column 0 and the predictors
        from the remaining columns.

    Returns
    -------
    None
        Shows a plot of cross-validation score against number of features,
        then prints the eliminated features (worst rank first) followed by
        the support mask and the ranking.
    """
    # Local import: cross-validated selection needs RFECV; plain RFE accepts
    # neither ``cv`` nor ``scoring``.
    from sklearn.feature_selection import RFECV

    features_list = data.columns.values[1::]
    predictors = np.asarray(data.values[:, 1::])
    response = np.asarray(data.values[:, 0])
    estimator = SVC(kernel="linear")

    # Use cross validation to determine the number of features.
    rfecv = RFECV(estimator, step=1, cv=StratifiedKFold(response, 2), scoring='accuracy')
    rfecv.fit(predictors, response)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    # Label as optimal number of features.
    noffeatures = rfecv.n_features_

    # Use RFE to determine the top features.
    selector = RFE(estimator, n_features_to_select=noffeatures, step=1)
    selector = selector.fit(predictors, response)

    # Map the eliminated features back to their original column positions
    # before looking up names; the old code indexed the name array with
    # positions relative to the eliminated subset (and did so twice).
    eliminated = np.where(~selector.support_)[0]
    order = eliminated[np.argsort(selector.ranking_[eliminated])[::-1]]
    for position, column in enumerate(order):
        print("%d. feature %d (%s)" % (position + 1, column, features_list[column]))
    print(selector.support_)
    print(selector.ranking_)
def feature_selection(estimator, x, y):
    """
    Print the RFE support flag and ranking of every feature.

    The rows of the printed table are labelled with the columns of the
    module-level ``pig_three_feature`` frame, skipping its first column.
    """
    rfe_selector = RFE(estimator)
    rfe_selector.fit(x, y)
    print('RFE selection')
    row_labels = pig_three_feature.columns[1:]
    report = pd.DataFrame(
        {'support': rfe_selector.support_, 'ranking': rfe_selector.ranking_},
        index=row_labels)
    print(report)
def feature_selection_RFE_draft(fn ,ax=None, sel="all", goal="Linebreak", isclass=True, verbosity=0, nf=7):
    """Select features with RFECV, then order them with a full RFE ranking.

    Data comes from ``data_prepare(fn, ...)``. The cross-validation curve is
    drawn on ``ax`` (which is used unconditionally). ``nf`` is not used.

    Returns ``(best, ranks)``: the names RFECV ranks 1, reordered by the RFE
    ranking, and the sorted list of ``(rank, name)`` pairs.
    """
    # Developer switches preserved from the original (both hard-wired).
    use_random_forest = False
    use_plain_rfecv = True

    X, y, names = data_prepare(fn, sel=sel, goal=goal, verbosity=verbosity-1)
    if verbosity > 1:
        print("names: " + ",".join(names))

    # Pick the estimator, metric and CV scheme for the task type.
    if isclass:
        #estimator = svm.SVC(kernel="linear",C=1.0)
        estimator = get_clf('svm')
        scoring = 'f1'
        cv = cross_validation.StratifiedKFold(y, 2)
    else:
        if use_random_forest:
            from sklearn.ensemble import RandomForestRegressor
            if not hasattr(RandomForestRegressor,'coef_'):
                RandomForestRegressor.coef_ = property(lambda self:self.feature_importances_)
            estimator = RandomForestRegressor(n_estimators=100, max_depth=2, min_samples_leaf=2)
        else:
            estimator = linear_model.RidgeCV()
        scoring = 'mean_squared_error'
        cv = 3

    # Create the RFE object and compute a cross-validated score.
    if use_plain_rfecv:
        rfecv = RFECV(estimator=estimator, step=1, cv=cv, scoring=scoring)
    else:
        from kgml.rfecv import RFECVp
        f_estimator = get_clf('svm')
        rfecv = RFECVp(estimator=estimator,f_estimator=f_estimator, step=1, cv=cv, scoring=scoring)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rfecv.fit(X, y)

    # Plot number of features VS. cross-validation scores
    score_curve = rfecv.grid_scores_
    ax.set_xlabel("Number of features selected")
    ax.set_ylabel("Cross validation score ({})".format(scoring))
    ax.plot(range(1, len(score_curve) + 1), score_curve)

    best = names[rfecv.ranking_==1]

    full_rfe = RFE(estimator, n_features_to_select=1)
    full_rfe.fit(X,y)
    rounded = [round(value, 4) for value in full_rfe.ranking_]
    ranks = sorted(zip(rounded, names))

    # reorder best using ranks
    best_set = set(best)
    best = [feature for (_, feature) in ranks if feature in best_set]
    assert len(best) == len(best_set)
    return best, ranks
def subtest(model, XL, YL, XT, YT, feature_names):
    """Drop one feature with RFE and report performance before and after.

    ``model`` is fitted on all features and scored with
    ``print_performance``; then an RFE keeping all but one feature is fitted
    and scored the same way, and the removed feature is printed.

    :return: ``(XL reduced, XT reduced, feature_names[support])``.
    """
    nfeatures = XL.shape[1]
    # Passed by keyword: newer scikit-learn releases no longer accept the
    # feature count positionally.
    rfe = RFE(model, n_features_to_select=nfeatures - 1)

    print("BEFORE")
    model.fit(XL, YL)
    print_performance(YT, model.predict(XT))

    print("AFTER")
    rfe.fit(XL, YL)
    print_performance(YT, rfe.predict(XT))

    removed = np.where(rfe.support_ == False)[0][0]
    print("REMOVED FEATURE %s" % (feature_names[removed]))
    print("")
    return rfe.transform(XL), rfe.transform(XT), feature_names[rfe.support_]
def test_rfe_2():
    """Ensure that the TPOT RFE outputs the same result as the sklearn rfe when num_features>no. of features in the dataframe

    NOTE(review): sklearn is asked for 100 features while ``_rfe`` is called
    with 64 - presumably both exceed the column count; confirm.
    """
    tpot_obj = TPOT()

    non_feature_columns = ['class', 'group', 'guess']
    training_features = training_testing_data.loc[
        training_testing_data['group'] == 'training'].drop(non_feature_columns, axis=1)

    estimator = LinearSVC()
    # Passed by keyword: newer scikit-learn releases no longer accept the
    # feature count positionally.
    rfe = RFE(estimator, n_features_to_select=100, step=0.1)
    # ``training_classes`` is not defined in this function; presumably a
    # module-level fixture - verify.
    rfe.fit(training_features, training_classes)
    mask = rfe.get_support(True)
    mask_cols = list(training_features.iloc[:, mask].columns) + non_feature_columns

    assert np.array_equal(training_testing_data[mask_cols],
                          tpot_obj._rfe(training_testing_data, 64, 0.1))
def get_patient_predictions_rfe(self, expression_file, ic50_file, patient_directory, target_features, drug):
    """Fit RFE on cell-line data and predict for the patient samples.

    :return: ``(p_identifiers, predictions, top_features)`` where
        ``top_features`` are the entries of the cell-line/patient gene
        intersection whose RFE support flag is set.
    """
    e_data, e_target, p_identifiers, p_data = dfm.get_cell_line_and_patient_expression_data_target_for_drug(
        expression_file, ic50_file, patient_directory, 1.0, drug)

    # Eliminate roughly 1% of the columns per iteration; ``//`` keeps the
    # integer arithmetic identical on Python 2 and 3.
    step_length = len(e_data.tolist()[0]) // 100 + 1

    model = RFE(self.model, n_features_to_select=target_features, step=step_length)
    model.fit(e_data, e_target)
    predictions = model.predict(p_data)

    all_features = dfm.get_cell_line_and_patient_expression_gene_intersection(
        dfm.get_cell_line_expression_frame(expression_file),
        dfm.get_patients_expression_frame(patient_directory))[0]
    # range() replaces the Python-2-only xrange().
    top_features = [all_features[i] for i in range(len(all_features)) if model.support_[i]]
    return p_identifiers, predictions, top_features
def show_most_informative_features(self, samples):
    """Return the features ordered by RFE rank as ``feature<TAB>rank`` lines.

    The classifier ``self._clf`` is ranked with RFE down to one feature and
    then refitted on the full data before the report is built.
    """
    X, y = self._fsets2dataset(samples)
    rfe = RFE(self._clf, n_features_to_select=1)
    rfe.fit(X, y)
    ranking = rfe.ranking_
    if len(ranking) != len(self._fx):
        # Trailing space added: the two literals are concatenated and used
        # to read "samelength".
        logging.error("Both feature ranking and features should have the same "
                      "length %d != %d", len(ranking), len(self._fx))
    # Index loop kept so a length mismatch still fails loudly, as before.
    fx_ranking = [(ranking[i], self._fx[i]) for i in range(len(self._fx))]
    self._clf.fit(X, y)
    return '\n'.join(
        '\t'.join([str(feature), str(rank)])
        for rank, feature in sorted(fx_ranking)
    )
def RecursiveFeatureElimination(self, nfeat=None, step=1, inplace=False):
    """Run RFE with ``self.alg`` on the training data.

    Returns a Series of RFE ranks (indexed by predictor name) restricted to
    the supported predictors. With ``inplace=True`` the kept names are also
    passed to ``self.set_predictors``.
    """
    eliminator = RFE(self.alg, step=step, n_features_to_select=nfeat)
    train_X = self.data_train[self.predictors]
    train_y = self.data_train[self.target]
    eliminator.fit(train_X, train_y)

    all_ranks = pd.Series(eliminator.ranking_, index=self.predictors)
    selected = all_ranks.loc[eliminator.support_]

    if inplace:
        kept_names = selected.index.tolist()
        self.set_predictors(kept_names)
    return selected
def rank_features_rfe(X, y, featureset):
    """Rank features by their importance using recursive feature elimination.

    :param X: A training set of features.
    :param y: A target set (aka class labels for the training set)
    :param featureset: An instance of a featureset (such as Basic9Extractor())
    :rtype: A list of ``(feature, rank)`` tuples sorted by ascending RFE
            rank (built from an OrderedDict keyed by feature, so a repeated
            feature keeps only its last rank).
    """
    # FIXME: Use an RBF SVC to rank features. It is likely that the "importance"
    # rankings derived from a LinearSVC are similar as an RBF kernel SVM, but,
    # for safety's sake, it is best to assume they are not.
    # The classifier is no longer fitted up front: RFE fits its own clone,
    # so that extra fit was discarded work.
    ranker = RFE(LinearSVC(), n_features_to_select=1, step=1)
    ranker = ranker.fit(X, y)

    # Get the names of the feature columns.
    # FIXME: Duplicate code from rank_features. Make this its own function.
    feat_importance = OrderedDict()
    for index, func in enumerate(featureset.features):
        feat_importance[func] = ranker.ranking_[index]

    return sorted(feat_importance.items(), key=lambda x: x[1])
def get_best_cols(df):
    """Select the best columns with RFE around a linear SVR.

    Nine numeric columns are standardised and concatenated with the dummy
    encodings of the categorical columns; ``df.X1`` is the target.

    :return: ``pd.Index`` of the column labels RFE keeps (40 requested).
    """
    # factors
    # NOTE(review): X12 and X14 are listed twice, so their dummy columns are
    # duplicated in the design matrix - left as found; confirm intent.
    factor_names = ["X7", "X8", "X9", "X11", "X12", "X14", "X12", "X14", "X32"]
    cols_to_factor = [pd.get_dummies(df[name]) for name in factor_names]
    # dataframe with factors blown out
    df_f = pd.concat(cols_to_factor, axis=1)

    # numerics
    RFE_col_list = ["X4", "X5", "X6", "X13", "X21", "X22", "X29", "X30", "X31"]
    # dataframe with numerics (.loc replaces the removed .ix indexer)
    df_n = df.loc[:, RFE_col_list]
    X = np.asarray(df_n)
    X = StandardScaler().fit_transform(X)
    # add in factors
    X = np.concatenate([X, np.asarray(df_f)], axis=1)

    # leave y alone
    y = df.X1

    # I don't like to guess yes this is only linear relationships
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, n_features_to_select=40, step=2)
    selector = selector.fit(X, y)

    # Labels in the same order as the columns of X. ``Index + Index`` adds
    # element-wise rather than concatenating, so build the list explicitly.
    df_index = pd.Index(list(df_n.columns) + list(df_f.columns))
    best_cols = df_index[selector.support_]
    return best_cols
def LogReg(X_train, X_test, y_train, y_test, Min_N_Feat, Max_N_Feat, mask='None',weights='auto'):
    """Train one logistic regression per feature count and predict with each.

    For every ``ift`` in ``Min_N_Feat..Max_N_Feat`` a model is fitted on the
    columns flagged in row ``ift - Min_N_Feat`` of ``mask``. When no mask is
    supplied (the string ``'None'`` or ``None``) each row is first chosen by
    RFE keeping ``ift`` features.

    :return: ``(Pred_Train, Pred_Test, mask)``; each prediction matrix has
        one column per feature count, in the same order as the mask rows.
    """
    from sklearn.feature_selection import RFE  # rank features with recursive feature elimination
    from sklearn.linear_model import LogisticRegression as LogR

    n_sets = Max_N_Feat - Min_N_Feat + 1

    # Decide once whether RFE has to build the mask. Re-testing
    # ``mask == 'None'`` inside the loop compared the freshly created array
    # with a string, so the RFE branch was never taken.
    select_features = mask is None or (isinstance(mask, str) and mask == 'None')
    if select_features:
        mask = np.zeros((n_sets, int(X_train.shape[1])), dtype='bool')

    # Each prediction set is stored in a different column.
    Pred_Train = np.zeros((int(max(y_train.shape)), n_sets), dtype='int')
    Pred_Test = np.zeros((int(max(y_test.shape)), n_sets), dtype='int')

    print('Logistic Regression: Training...')
    for ift in range(Min_N_Feat, Max_N_Feat + 1):
        # Row/column for this feature count. The predictions used to be
        # written to column ``ift - 1``, which only matches when
        # Min_N_Feat == 1 and overruns the matrix otherwise.
        slot = ift - Min_N_Feat
        LogReg_obj = LogR(C=1e3, class_weight=weights)
        if select_features:
            rfe = RFE(LogReg_obj, n_features_to_select=ift)
            rfe = rfe.fit(X_train, y_train)
            mask[slot, :] = rfe.support_
        LogReg_obj.fit(X_train[:, mask[slot, :]], y_train)
        Pred_Train[:, slot] = LogReg_obj.predict(X_train[:, mask[slot, :]])
        Pred_Test[:, slot] = LogReg_obj.predict(X_test[:, mask[slot, :]])
        print('Logistic Regression: Predicting... %d %%' % (100 * (slot + 1) // n_sets))
    print('Logistic Regression: Completed!')
    return Pred_Train, Pred_Test, mask
def ref(X, y, n_features_to_select=1, kernel='linear'):
    """Fit and return an RFE selector built on a class-balanced SVC.

    ``n_features_to_select`` is the desired number of features; the support
    mask and ranking are available on the returned selector.
    """
    svc = SVC(class_weight='balanced', kernel=kernel)
    rfe = RFE(svc, step=1, n_features_to_select=n_features_to_select)
    return rfe.fit(X, y)
def test_main():
    """Fit a two-feature RFE (linear SVR) on iris and print the support mask."""
    iris = load_iris()
    x, y = iris.data, iris.target
    estimator = SVR(kernel="linear")
    # Passed by keyword: newer scikit-learn releases no longer accept the
    # feature count positionally.
    selector = RFE(estimator, n_features_to_select=2, step=1)
    selector = selector.fit(x, y)
    print(selector.support_)
def rank_features(clf, x_train, y_train, columns,step=1, numFeatures=1):
    """
    rank features with rfe

    Prints the column names ordered from best (rank 1) to worst rank.

    :param clf: estimator
    :param x_train: training features
    :param y_train: training targets
    :param columns: names of the columns of ``x_train``, in order
    :param step: features removed per RFE iteration
    :param numFeatures: number of features RFE keeps
    :return: the fitted rfe object
    """
    print('========== rank_features ===========')
    rfe = RFE(estimator=clf, n_features_to_select=numFeatures, verbose=2, step=step)
    rfe.fit(x_train, y_train)
    # ranking_[i] is the rank OF column i, so ordering the names by rank
    # needs argsort; using ``ranking_ - 1`` as an index applies the inverse
    # permutation. A stable sort keeps tied ranks in column order.
    order = np.argsort(rfe.ranking_, kind='mergesort')
    pprint(np.array(columns)[order])
    return rfe
def select_features(X, y, random_state, kernel='linear', C=1.0, num_attributes=3):
    """
    Uses Support Vector Classifier as the estimator to rank features
    with Recursive Feature Elimination.

    Parameters
    ----------
    X: A pandas.DataFrame. Attributes.
    y: A pandas.DataFrame. Labels.
    random_state: A RandomState instance. Used in SVC().
    kernel: A string. Used in SVC(). Default: "linear".
    C: A float. Used in SVC(). Default: 1.0.
    num_attributes: An int. The number of features to select in RFE. Default: 3.

    Returns
    -------
    A 3-tuple of (RFE, list, np.ndarray)
    model: An RFE instance.
    columns: Selected features (column labels of X).
    ranking: The feature ranking. Selected features are assigned rank 1.
    """
    # Hyper-parameters are named explicitly: newer scikit-learn releases
    # make them keyword-only for both SVC and RFE.
    estimator = svm.SVC(C=C, kernel=kernel, random_state=random_state)
    rfe = RFE(estimator, n_features_to_select=num_attributes)
    model = rfe.fit(X, y.values.ravel())

    # Iterating a DataFrame yields its column labels.
    columns = [label for idx, label in enumerate(X) if rfe.support_[idx]]
    ranking = rfe.ranking_

    return model, columns, ranking
def _count_up(value):
    """Count how many times 1 can be subtracted from ``value`` while it is > 0."""
    count = 0
    while value > 0:
        count += 1
        value -= 1
    return count


def feature_sorting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
    """Return the RFE ranking (as a list) of the features of a flattened matrix.

    ``features_values_temp`` is reshaped with ``convert_list_to_matrix`` to
    ``rows`` x ``columns`` and ranked with an SVR of the given ``kernel``
    against ``prediction_values_temp``.

    ``threshold`` is converted to float but otherwise unused.
    """
    # Sizes are re-counted rather than cast; presumably they may arrive as
    # non-int numerics from the caller - TODO confirm.
    rows = _count_up(rows_temp)
    columns = _count_up(columns_temp)

    features_values = list(features_values_temp)
    prediction_values = list(prediction_values_temp)

    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)
    # Kept so a non-numeric threshold still fails loudly, as before.
    threshold = float(threshold)

    # try to change to the model for which the test is gonna run (lasso, ridge, etc.)
    estimator = SVR(kernel=kernel)
    # NOTE(review): 0 features to keep is preserved from the original, but a
    # full ranking is normally requested with 1 - confirm the intended value.
    selector = RFE(estimator, n_features_to_select=0, step=1)
    selector = selector.fit(rotated, scores)
    return selector.ranking_.tolist()
def select_features(X_train, y_train):
    """Return the union of features chosen by four selection methods.

    The methods are: a variance threshold, random-forest importance (top
    35), chi-squared score on min-max scaled data (top 35), and RFE around
    a logistic regression (20 features).

    :return: sorted unique array of the selected column names.
    """
    threshold = 0.90
    vt = VarianceThreshold().fit(X_train)
    feat_var_threshold = X_train.columns[vt.variances_ > threshold * (1 - threshold)]

    # Random Forest feature importance
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    feature_imp = pd.DataFrame(model.feature_importances_,
                               index=X_train.columns, columns=["importance"])
    # NOTE: the *_20 names are historical; the first two lists keep 35 entries.
    feat_imp_20 = feature_imp.sort_values("importance", ascending=False).head(35).index

    X_minmax = MinMaxScaler(feature_range=(0, 1)).fit_transform(X_train)
    X_scored = SelectKBest(score_func=chi2, k='all').fit(X_minmax, y_train)
    feature_scoring = pd.DataFrame({
        'feature': X_train.columns,
        'score': X_scored.scores_
    })
    feat_scored_20 = feature_scoring.sort_values('score', ascending=False).head(35)['feature'].values

    # Passed by keyword: newer scikit-learn releases no longer accept the
    # feature count positionally.
    rfe = RFE(LogisticRegression(), n_features_to_select=20)
    rfe.fit(X_train, y_train)
    feature_rfe_scoring = pd.DataFrame({
        'feature': X_train.columns,
        'score': rfe.ranking_
    })
    feat_rfe_20 = feature_rfe_scoring[feature_rfe_scoring['score'] == 1]['feature'].values

    features = np.hstack([
        feat_var_threshold,
        feat_imp_20,
        feat_scored_20,
        feat_rfe_20
    ])
    features = np.unique(features)
    return features
def buildTree(self,depth):
    """Create the entropy decision tree and run a two-feature RFE on it.

    The tree is stored in ``self.tree``. The fitted selector is not stored
    or returned.

    NOTE(review): the selection result is discarded - presumably it was
    only inspected interactively; confirm whether it should be kept.
    """
    # Here, we define the parameters of our tree and use a feature selection
    # algorithm (RFE) to pick out the strongest features.
    self.tree = DecisionTreeClassifier(criterion = 'entropy', max_depth=depth, random_state=0)
    # Passed by keyword: newer scikit-learn releases no longer accept the
    # feature count positionally.
    selector = RFE(self.tree, n_features_to_select=2, step=1)
    selector.fit(self.X_train, self.Y_train)
def selectFeaturesFromSubsetRecursive(self,subset,numFeatures):
    """Pick ``numFeatures`` columns out of ``subset`` with a linear-SVC RFE.

    :param subset: column indices of ``self.instances`` to consider.
    :param numFeatures: number of features to keep.
    :return: indices of the kept features, relative to the columns selected
        by ``subset`` (the matrix the selector was fitted on).
    """
    # The parameter is spelled ``class_weight``; ``class_weights`` is not
    # accepted. 'balanced' is the current name for automatic weighting.
    model = svm.LinearSVC(class_weight='balanced')
    rfe = RFE(model, n_features_to_select=numFeatures)
    rfe = rfe.fit(self.instances[:,subset], self.classes)
    return rfe.get_support(indices=True)
def feature_selection(self, **kwargs):
    """Log the support/ranking tables from RFE and then from RFECV.

    Expects ``x`` and ``y`` in ``kwargs``; the estimator comes from
    ``self.get_fiter()``. Rows are labelled with ``self.df.columns[1:]``.
    """
    x, y = kwargs['x'], kwargs['y']
    fiter = self.get_fiter()

    def _log_table(title, fitted):
        # Emit the heading, then the per-feature table.
        ZLog.info(title)
        ZLog.info(pd.DataFrame({'support': fitted.support_, 'ranking': fitted.ranking_},
                               index=self.df.columns[1:]))

    plain = RFE(fiter)
    plain.fit(x, y)
    _log_table('RFE selection', plain)

    cross_validated = RFECV(fiter, cv=3, scoring='mean_squared_error')
    cross_validated.fit(x, y)
    ZLog.newline()
    _log_table('RFECV selection', cross_validated)
def rec_feature_elim(data,num_features=17700):
    """Run a linear-SVC RFE on the expression matrix and report kept genes.

    The 0/1 support mask is handed to ``print_genes_nonzero_coeff``.
    """
    X = data.get_gene_exp_matrix()
    y = data.get_labels()
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=num_features, step=1)
    selector = rfe.fit(X, y)
    # Truthiness instead of ``is True``: the elements are numpy booleans,
    # which are never identical to the Python singleton, so the old mask was
    # all zeros. A list (not a lazy map) so it can be indexed and re-read.
    mask = [1 if flag else 0 for flag in selector.support_]
    print_genes_nonzero_coeff(data,mask)
def recursive_feature_elimination(X, y):
    """Select 3 attributes with a logistic-regression RFE and print the result."""
    model = LogisticRegression()
    # create the RFE model and select 3 attributes (by keyword: newer
    # scikit-learn releases no longer accept the count positionally)
    rfe = RFE(model, n_features_to_select=3)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)
# selecting x and y variables
cr.shape
cr_x = cr.iloc[:, 0:11]
cr_y = cr.iloc[:, -1]

# feature selection using rfe
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

svm = LinearSVC()
# Keep 5 features; passed by keyword because newer scikit-learn releases
# no longer accept the count positionally.
rfe = RFE(svm, n_features_to_select=5)
rfe.fit(cr_x, cr_y)
rfe.transform(cr_x)
rfe.get_support()

# One row per column of cr_x with its selected/not-selected flag.
imp_variables = pd.DataFrame({
    "Important": list(rfe.get_support()),
    "Feature_Name": list(cr_x.columns)
})
imp_variables

# feature selection using variance threshold
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
mean_squared_error(y_test, pred)

"""#### selecting feature for logistic regression"""

from sklearn.feature_selection import RFE

model_log = LogisticRegression()
# Keep 3 features; passed by keyword because newer scikit-learn releases
# no longer accept the count positionally.
selector = RFE(model_log, n_features_to_select=3)
x = df_voice.loc[:, :'modindx']
y = df_voice['label']
x.shape
selector = selector.fit(x, y)
selector.support_

"""#### 6,9,13 using feature selection #### selecting feature for svm """

# NOTE(review): RFE needs an estimator exposing coef_ or
# feature_importances_; an RBF-kernel SVC provides neither, so this fit is
# expected to fail - confirm whether kernel="linear" was intended.
model_svc = SVC(kernel="rbf")
selector2 = RFE(model_svc, n_features_to_select=3)
x = df_voice.loc[:, :'modindx']
y = df_voice['label']
selector2 = selector2.fit(x, y)
def TTest_mRMR_svmRFE_selector(originData):
    """Filter features by t-test, then mutual information (JMIM), then SVM-RFE.

    Columns 2..7 of ``originData`` are zero-filled, standardised and
    oversampled with SMOTE together with the ``label`` column. A column
    survives the first stage when the two classes differ at p < 0.05, using
    Student's t-test if Levene's test reports equal variances (p > 0.05)
    and Welch's t-test otherwise.

    :return: ``(X_RFE, y)`` - the frame restricted to the finally selected
        columns, and the matching labels.
    """
    selectedFeatutesList = []
    label = originData['label']
    colNames = originData[originData.columns[2:8]].columns
    data = originData[originData.columns[2:8]].fillna(0)
    data = data.astype(np.float64)
    data = StandardScaler().fit_transform(data)
    data = pd.DataFrame(data)
    data.columns = colNames
    data['label'] = label

    # balanced Data
    # fit_resample is the current name of the removed fit_sample API.
    smo = SMOTE(random_state=3)
    X_smote, y_smote = smo.fit_resample(data, data['label'])

    # Split by class once instead of re-slicing for every statistical call.
    negatives = X_smote[X_smote['label'] == 0]
    positives = X_smote[X_smote['label'] == 1]

    for colName in X_smote.columns[0:-1]:
        group0 = negatives[colName]
        group1 = positives[colName]
        variance_p = levene(group0, group1)[1]
        if variance_p > 0.05:
            significant = ttest_ind(group0, group1)[1] < 0.05
        elif variance_p <= 0.05:
            significant = ttest_ind(group0, group1, equal_var=False)[1] < 0.05
        else:
            # NaN p-value: neither original branch applied.
            significant = False
        if significant:
            selectedFeatutesList.append(colName)

    if 'label' not in selectedFeatutesList:
        selectedFeatutesList = ['label'] + selectedFeatutesList

    data1 = negatives[selectedFeatutesList]
    data2 = positives[selectedFeatutesList]
    trainData = pd.concat([data1, data2])
    # 'label' was put first above, so everything after it is a feature.
    X = trainData[trainData.columns[1:]]
    y = trainData['label']

    # define MI_FS feature selection method
    feat_selector = mifs.MutualInformationFeatureSelector(method='JMIM')
    feat_selector.fit(X, y)
    X_mRMR = X.loc[:, feat_selector._support_mask]
    colNames = X_mRMR.columns

    clf = LinearSVC()
    model = RFE(clf, n_features_to_select=len(feat_selector.ranking_))
    model.fit(X_mRMR, y)
    feats = list(np.array(colNames)[model.support_])
    for featureNames in feats:
        print(featureNames)
    print(len(feats))
    X_RFE = X_mRMR[feats]
    return X_RFE, y
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Extra-trees baseline.
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier

extra_trees_model = ExtraTreesClassifier()
extra_trees_model.fit(X_train, y_train)
extra_trees_predicted = extra_trees_model.predict(X_test)

# Logistic regression restricted to 3 RFE-selected features.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression()
# Passed by keyword: newer scikit-learn releases no longer accept the
# feature count positionally.
rfe_model = RFE(model1, n_features_to_select=3)
rfe_model = rfe_model.fit(X_train, y_train)
rfe_predicted = rfe_model.predict(X_test)

# Logistic regression on all features.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
logistic_model_predicted = logistic_model.predict(X_test)

# Gaussian naive Bayes.
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

gaussian_model = GaussianNB()
gaussian_model.fit(X_train, y_train)
gaussian_model_predicted = gaussian_model.predict(X_test)

from sklearn import metrics
def multipleregress():
    """Fit revenue regressions on the TMDB data and print coefficient statistics.

    Reads ``tmdb_5000_train.csv``, ``tmdb_5000_test.csv`` and
    ``tmdb_5000.csv`` from the working directory, prints the RFE selection
    over the numeric columns, then fits an OLS model on
    rotten/IMDB/vote_average and prints its coefficients, standard errors,
    t values and p values before showing a scatter plot of fitted against
    actual training revenue.
    """
    #IDEAL IS FOR THE INDEPENDENT VARIABLE TO BE CORRELATED WITH THE DEPENDENT VARIABLE BUT NOT
    #WITH EACH OTHER
    #Select the Columns that ONLY Use NUMBERS
    dataTrain = pd.read_csv('./tmdb_5000_train.csv')
    dataTest = pd.read_csv('./tmdb_5000_test.csv')

    x_train = dataTrain[['budget', 'popularity', 'vote_count']].values.reshape(-1, 3)
    y_train = dataTrain['revenue']
    x_test = dataTest[['budget', 'popularity', 'vote_count']].values.reshape(-1, 3)
    y_test = dataTest['revenue']

    # NOTE(review): this first model is overwritten below before being used.
    ols = LinearRegression()
    model = ols.fit(x_train, y_train)

    numeric_columns = [
        'budget', 'popularity', 'runtime', 'vote_average', 'vote_count',
        'IMDB', 'rotten', 'metaC', 'revenue'
    ]
    dataTrain = pd.read_csv('./tmdb_5000.csv', usecols=numeric_columns)
    dataTest = pd.read_csv('./tmdb_5000_test.csv', usecols=numeric_columns)

    names = dataTrain.columns
    array = dataTrain.values
    X = array[:, 0:8]
    # NOTE(review): column 2 is also inside X (columns 0-7), so the target
    # is one of the predictors - presumably the revenue column was meant;
    # confirm against the CSV column order.
    Y = array[:, 2]

    # feature extraction
    model = LinearRegression()
    # Passed by keyword: newer scikit-learn releases no longer accept the
    # feature count positionally.
    rfe = RFE(model, n_features_to_select=4)
    fit = rfe.fit(X, Y)
    print(fit.n_features_)
    print(fit.support_)
    print(fit.ranking_)

    # Translate the support mask into column names.
    ranks = fit.support_
    fields = np.where(ranks == True)
    ranks = list()
    for ind in np.nditer(fields):
        ranks.append(names[ind])
    print(ranks)

    x_train = dataTrain[['rotten', 'IMDB', 'vote_average']].values.reshape(-1, 3)
    y_train = dataTrain['revenue']
    x_test = dataTest[['rotten', 'IMDB', 'vote_average']].values.reshape(-1, 3)
    y_test = dataTest['revenue']

    ols = LinearRegression()
    model = ols.fit(x_train, y_train)
    params = np.append(model.intercept_, model.coef_)
    predictions = model.predict(x_train)
    print(predictions)

    # NOTE(review): the design matrix is built from x_test while the
    # residuals come from the training set - confirm this mix is intended.
    newX = pd.DataFrame({
        "Constant": np.ones(len(x_test))
    }).join(pd.DataFrame(x_test))
    MSE = (sum((y_train - predictions)**2)) / (len(newX) - len(newX.columns))

    # Note if you don't want to use a DataFrame replace the two lines above with
    # newX = np.append(np.ones((len(X),1)), X, axis=1)
    # MSE = (sum((y-predictions)**2))/(len(newX)-len(newX[0]))

    var_b = MSE * (np.linalg.inv(np.dot(newX.T, newX)).diagonal())
    sd_b = np.sqrt(var_b)
    ts_b = params / sd_b

    p_values = [
        2 * (1 - stats.t.cdf(np.abs(i), (len(newX) - 1))) for i in ts_b
    ]

    sd_b = np.round(sd_b, 3)
    ts_b = np.round(ts_b, 3)
    p_values = np.round(p_values, 3)
    params = np.round(params, 4)

    myDF3 = pd.DataFrame()
    myDF3["Coefficients"], myDF3["Standard Errors"], myDF3["t values"], myDF3[
        "Probabilites"] = [params, sd_b, ts_b, p_values]
    print(myDF3)

    y_predicted = model.predict(x_train)
    plt.scatter(y_train, y_predicted)
    plt.plot(y_train, y_predicted, 'o')
    plt.show()
selection.sort()
for i in selection[:20]:
    print(i)

### variable list from RFECV
selected_val0 = X_standardized_train.columns[selector.support_]
print(selected_val0)

# In[19]:

# further reduce by RFE, starting from the columns RFECV kept
X_standardized_train1 = X_standardized_train[selected_val0]
selector1 = RFE(estimator, step=1, n_features_to_select=17)
selector1 = selector1.fit(X_standardized_train1, y_train)

# (rank, kept?, column) triples, best rank first
selection1 = sorted(
    zip(selector1.ranking_, selector1.support_, X_standardized_train1.columns))
for i in selection1[:20]:
    print(i)

selected_val1 = X_standardized_train1.columns[selector1.support_]
print(selected_val1)

# In[201]:

### clean dataset for modeling
#selected_val = selected_val0
def extractWavFeats():
    """Extract features from the cough / non-cough recordings.

    Files ``cough<i>.wav`` and ``neg<i>.wav`` (i in 0..42) are read from
    ``Regen_coofs/``; cough clips are labelled 0 and non-cough clips 1.

    NOTE(review): the second loop appends ``neg0``..``neg29`` a second time,
    so those clips are duplicated - confirm whether that is intended.

    :return: ``(featureArr, labelsArr)`` as parallel lists.
    """
    featureArr = []
    labelsArr = []
    for i in range(43):
        coughfeat = extract_features("Regen_coofs/cough" + str(i) + ".wav")
        nonCough = extract_features("Regen_coofs/neg" + str(i) + ".wav")
        featureArr.append(coughfeat)
        featureArr.append(nonCough)
        labelsArr.append(0)
        labelsArr.append(1)
    for i in range(30):
        nonCough = extract_features("Regen_coofs/neg" + str(i) + ".wav")
        featureArr.append(nonCough)
        labelsArr.append(1)
    return featureArr, labelsArr


#Optional Load
xArr = np.genfromtxt("x.csv", delimiter=",")
yArr = np.genfromtxt("y.csv", delimiter=",")

x_train, x_test, y_train, y_test = train_test_split(xArr[:], yArr[:], test_size=0.2, random_state=42)
print("Done Splitting")

forest = rf(max_depth=10)
# Keep 10 features, dropping one per iteration; passed by keyword because
# newer scikit-learn releases no longer accept these positionally.
selector = RFE(forest, n_features_to_select=10, step=1)
fit = selector.fit(x_train, y_train)
# NOTE(review): scored on the first 80 rows of the full data rather than on
# x_test/y_test, so training rows are presumably included - confirm.
print(fit.score(xArr[0:80], yArr[0:80]))
# But as I have wrangled the data to provide a column that does acknowledge whether or not the user is adopted,
# I am more willing to fit a model on it and conduct feature elimination via that.
# My initial thoughts are to use RFE, or Decision Trees after thinking this through.

# The following features have been removed as they seem redundant in inclusion, or provide no empirical predictive value:
# 'object_id', 'name', and 'email'

# Data selection for model (trailing numbers are the RFE ranks obtained below)
X = user_df[[
    'creation_time',  #1
    'creation_source',  #3
    'last_session_creation_time',  #1
    'opted_in_to_mailing_list',  #4
    'enabled_for_marketing_drip',  #5
    'org_id',  #1
    'invited_by_user_id'  #2
]]
y = user_df['adopted']

# Decision Tree Model
clf = DecisionTreeClassifier(random_state=0)

# Recursive feature selection (feature count passed by keyword: newer
# scikit-learn releases no longer accept it positionally)
estimator = clf
selector = RFE(estimator, n_features_to_select=3, step=1)
selector = selector.fit(X, y)

# Ranking of features
print("Feature Ranking: ", selector.ranking_)  # [1 3 1 4 5 1 2]

# The features that seems to be most important are the 'creation_time', 'last_session_creation_time', and 'org_id'
# The feature that seemed to follow close behind was 'invited_by_user_id'
# These features seem to be the most important when predicting future adoption of users
def fit(self, X, y):
    """Preprocess the data, select features and fit the inner model.

    Steps, each gated by the matching attribute: NaN filling
    (``fill_nan``), outlier removal with LocalOutlierFactor
    (``contamination > 0``), the 'RFE' feature-selection chain
    (zero-variance, uniform, correlated, RFE), scaling (``scale``), and
    finally ``self.model.fit``. Selected column indices are appended to the
    corresponding ``self.*_features`` lists.

    NOTE(review): those lists are extended, not reset, so calling ``fit``
    twice on one instance presumably accumulates indices - confirm.
    NOTE(review): SOURCE was whitespace-collapsed; the extent of the
    ``feature_selection == 'RFE'`` block is reconstructed - confirm.

    :return: ``self``.
    """
    X_t = X.copy()
    y_t = y.copy()

    print('Filling Nans')
    if self.fill_nan:
        X_t = self.filler.fit_transform(X_t)

    print('Removing outliers')
    if self.contamination > 0:
        method = LocalOutlierFactor(n_neighbors=max(
            50, int(0.1 * X_t.shape[1])), contamination=self.contamination)
        outlier = method.fit_predict(X_t)
        indices = np.where(outlier == 1)
        # Plain row selection; equivalent to the former 3-D index-and-squeeze.
        X_t = X_t[indices[0], :]
        y_t = y_t[indices]

    if self.feature_selection == 'RFE':
        print('Removing features with zero variance')
        sel = VarianceThreshold()
        sel.fit(X_t)
        self.not_constant_features.extend(sel.get_support(indices=True))
        X_t = X_t[:, self.not_constant_features]

        print('Removing uniform features')
        if self.bootstrap:
            # Reuse the cached result of an earlier run. ``sep`` is passed by
            # keyword: newer pandas releases reject it positionally.
            self.not_uniform_features.extend(
                pd.read_csv('task1/results/not_uniform.csv',
                            sep=',').to_numpy().flatten())
        else:
            result = self.find_uniform_features(X_t, y_t)
            pd.DataFrame(result).to_csv('task1/results/not_uniform.csv',
                                        sep=',',
                                        index=False)
            self.not_uniform_features.extend(result)
        X_t = X_t[:, self.not_uniform_features]

        print('Removing highly correlated features')
        self.correlated_features.extend(
            self.find_correlated_features(0.9, 0.03, X_t, y_t))
        X_t = X_t[:, self.correlated_features]

        print('Running RFE')
        selector = RFE(estimator=self.model,
                       n_features_to_select=self.features_to_select,
                       step=100)
        selector = selector.fit(X_t, y_t)
        support = selector.get_support(indices=True)
        self.RFE_features.extend(support)
        X_t = X_t[:, self.RFE_features]

    print('Final training matrix shape is ' + str(X_t.shape))

    print('Scaling matrix')
    if self.scale:
        X_t = self.scaler.fit_transform(X_t)

    print('Fitting inner model')
    self.model.fit(X_t, y_t)
    print('Finished fitting')
    print()
    return self
def train_model(classifier):
    """Build, fit and return the model selected by ``classifier``.

    Works on the module-level ``X_train``, ``y_train``, ``X_test``,
    ``y_test`` and ``seed``.

    Recognised keys: 'LR', 'KNN', 'SV', 'NB', 'DT', 'RF', 'RG', 'GBC' and
    'MLP'. Any other value falls through and returns ``None``.

    Returns:
        The fitted estimator. For 'GBC' this is the fitted ``RFE`` wrapper
        around the gradient-boosting model; for 'MLP' it is the Keras model
        built in the last fold.
    """
    if classifier == 'LR':
        logistic = LogisticRegression(random_state=seed)
        logistic.fit(X_train, y_train)
        return logistic

    elif classifier == 'KNN':
        # Prints a train/test accuracy table for odd k in 1..129.
        # NOTE(review): the model returned is the one from the last k tried,
        # not the best-scoring one -- confirm this is intended.
        print("\n K TREINO TESTE")
        print(" -- ------ ------")
        for k in range(1, 130, 2):
            knn = KNeighborsClassifier(n_neighbors=k,
                                       weights='uniform',
                                       metric='minkowski',
                                       p=2)
            knn = knn.fit(X_train, y_train)
            train_pred = knn.predict(X_train)
            test_pred = knn.predict(X_test)
            train_acc = sum(train_pred == y_train) / len(y_train)
            test_acc = sum(test_pred == y_test) / len(y_test)
            print("%3d" % k,
                  "%6.1f" % (100 * train_acc),
                  "%6.1f" % (100 * test_acc))
        return knn

    elif classifier == 'SV':
        svm = SVC(kernel='linear', random_state=seed)  # kernel = 'rbf'
        svm.fit(X_train, y_train)
        return svm

    elif classifier == 'NB':
        bayes = GaussianNB()
        bayes.fit(X_train, y_train)
        return bayes

    elif classifier == 'DT':
        tree = DecisionTreeClassifier(criterion='entropy', random_state=seed)
        tree.fit(X_train, y_train)
        return tree

    elif classifier == 'RF':
        # Hyper-parameters selected after the search:
        forest = RandomForestClassifier(n_estimators=1600,
                                        min_samples_split=2,
                                        min_samples_leaf=4,
                                        max_features='sqrt',
                                        max_depth=10,
                                        bootstrap=True,
                                        random_state=seed)
        forest.fit(X_train, y_train)
        print(forest.feature_importances_)
        return forest

    elif classifier == 'RG':
        ridge = RidgeClassifier(alpha=1,
                                class_weight='balanced',
                                solver='auto')
        ridge.fit(X_train, y_train)
        return ridge

    elif classifier == 'GBC':
        # Hyper-parameters selected after the search:
        boosting = GradientBoostingClassifier(
            random_state=seed,
            n_estimators=200,
            min_samples_split=5,
            min_samples_leaf=1,
            max_features='sqrt',
            max_depth=10,
        )
        rfe = RFE(boosting)
        rfe = rfe.fit(X_train, y_train)
        return rfe

    elif classifier == 'MLP':
        kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
        cvscores = []
        # NOTE(review): the fold indices are not used; every iteration
        # trains on all of X_train and evaluates on X_test -- confirm.
        for _fold_train, _fold_test in kfold.split(X_train, y_train):
            model = tf.keras.models.Sequential()
            model.add(tf.keras.layers.Dense(units=20, activation='relu'))
            model.add(tf.keras.layers.Dense(units=10, activation='relu'))
            model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
            model.compile(optimizer='adam',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])
            model.fit(X_train, y_train, batch_size=32, epochs=100, verbose=0)
            scores = model.evaluate(X_test, y_test, verbose=0)
            print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100))
            cvscores.append(scores[1] * 100)
        print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))
        model.summary()
        return model
# Ridge: refit at the cross-validated alpha, score features by |coefficient|
ridge = Ridge(alpha=ridgecv.alpha_)
ridge.fit(data, mark)
algorithm["Ridge"] = rank_to_dict(np.abs(ridge.coef_), names)

# Lasso: choose alpha by cross-validation, refit, score by |coefficient|
lassocv = LassoCV()
lassocv.fit(data, mark)
#print(lassocv.alpha_)
lasso = Lasso(alpha=lassocv.alpha_)
lasso.fit(data, mark)
algorithm["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

# RFE: keep 10 features; rankings are passed on as floats with order=-1
log = LogisticRegression()
rfe = RFE(log, n_features_to_select=10)
rfe.fit(data, mark)
rfe_ranks = [float(rank) for rank in rfe.ranking_]
algorithm["RFE"] = rank_to_dict(rfe_ranks, names, order=-1)

# The F-test ranking below is disabled by wrapping it in a string literal.
''' #f值检验 f, pval = f_classif(data, mark) algorithm["Corr"] = rank_to_dict(f, names) '''

# Average each feature's score over every method collected so far
r = {}
for name in names:
    per_method = [algorithm[method][name] for method in algorithm]
    r[name] = round(np.mean(per_method), 4)

# "Mean" is appended after sorting so it stays the last column
methods = sorted(algorithm)
algorithm["Mean"] = r
methods.append("Mean")
a digit classification task. .. note:: See also :ref:`example_feature_selection_plot_rfe_with_cross_validation.py` """ print(__doc__) from sklearn.svm import SVC from sklearn.datasets import load_digits from sklearn.feature_selection import RFE # Load the digits dataset digits = load_digits() X = digits.images.reshape((len(digits.images), -1)) y = digits.target # Create the RFE object and rank each pixel svc = SVC(kernel="linear", C=1) rfe = RFE(estimator=svc, n_features_to_select=1, step=1) rfe.fit(X, y) ranking = rfe.ranking_.reshape(digits.images[0].shape) # Plot pixel ranking import matplotlib.pyplot as plt plt.matshow(ranking) plt.colorbar() plt.title("Ranking of pixels with RFE") plt.show()
X_train = X_train_scaled

# Transform the test set with sc_X and restore the original column and
# index labels, which the transform output does not carry.
X_test_scaled = pd.DataFrame(sc_X.transform(X_test))
X_test_scaled.columns = X_test.columns.values
X_test_scaled.index = X_test.index.values
X_test = X_test_scaled

# Feature Selection by using Recursive Feature Elimination

# Model to Test
model = LogisticRegression(random_state = 0)

# Select the best 20 features. n_features_to_select is passed by keyword
# because current scikit-learn releases no longer accept it positionally.
rfe = RFE(model, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train)

# summarize the selection of the attributes
# (bare expressions: they only display a value in an interactive session)
# selected features are assigned True value
rfe.support_
# selected features are assigned rank 1
rfe.ranking_
X_train.columns[rfe.support_]

# Correlation Matrix
sn.set(style="white")

# Compute the correlation matrix of the selected features
corr = X_train[X_train.columns[rfe.support_]].corr()
array = dataframe.values #Split the data into input and target #There are 73 features in Writeprints Dataset X = array[:,0:73] Y = array[:,73] #np.random.seed(20) #model = LogisticRegression(np.random.seed(20)) #model=pickle.load(open('model_feature_selection', 'rb')) #pickle.dump(model, open('model_feature_selection', 'wb')) model = LogisticRegression(random_state=20) # create the RFE model and select 3 attributes rfe = RFE(model, 50) rfe = rfe.fit(X, Y) rankings= list(rfe.ranking_) #print rankings.count(1) #print rfe.support_ # summarize the selection of the attributes np.set_printoptions(precision=3) selected_feature_names=[] for i in range(0,len(rfe.support_)): if rfe.support_[i]==True:
print(data.describe())

## plotting
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
#data.boxplot()
#data.hist()
#data.groupby('class').hist()
#data.groupby('class').plas.hist(alpha=0.4)

# scatter_matrix moved from pandas.tools.plotting to pandas.plotting; try the
# current location first and fall back for old pandas installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
#scatter_matrix(data, alpha=0.2, figsize=(16.0, 16.0), diagonal='kde')
#plt.savefig(r"scatter_matrix_pima.png")

# Recursive Feature Elimination
from sklearn import datasets
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# load the iris datasets
dataset = datasets.load_iris()

# create a base classifier used to evaluate a subset of attributes
model = LogisticRegression()

# create the RFE model and select 3 attributes
# (n_features_to_select is passed by keyword because current scikit-learn
# releases no longer accept it positionally)
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(dataset.data, dataset.target)

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)
features = fit.transform(X)

# In[28]:

# Notebook cell: displays the first 20 transformed rows
features[0:20, :]

# In[29]:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# In[30]:

# Recursive feature elimination down to 3 features with logistic regression.
# n_features_to_select is passed by keyword because current scikit-learn
# releases no longer accept it positionally.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)  # rebinds `fit` to the fitted RFE selector
result = fit.transform(X)
print("Num Features: ", fit.n_features_)
print("Selected Features: ", fit.support_)
print("Feature Ranking: ", fit.ranking_)
print("\n\n\n", result[:20, :])

# In[31]:

from sklearn.decomposition import PCA

# In[32]:

pca = PCA(n_components=3)
# Applying models
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
import sklearn.feature_selection
import matplotlib.pyplot as plt

model = ExtraTreesClassifier()
model2 = RandomForestClassifier()
model.fit(X_train_res, y_train_res)
model2.fit(X_train_res, y_train_res)

# Recursive Feature Elimination
# create one RFE model per classifier and select 4 attributes each
from sklearn.feature_selection import RFE

# n_features_to_select is passed by keyword because current scikit-learn
# releases no longer accept it positionally.
rfe = RFE(model, n_features_to_select=4)
rfe = rfe.fit(X_train_res, y_train_res)

# The second selector must wrap the random forest (model2) and be fitted
# itself; it previously reused `model` and re-fitted `rfe`.
rfe2 = RFE(model2, n_features_to_select=4)
rfe2 = rfe2.fit(X_train_res, y_train_res)

# summarize the selection of the attributes
print("ForExtraTreesClassifier:By RFE")
print(rfe.support_)
print(rfe.ranking_)

print("RandomForestClassifier by RFE:")
print(rfe2.support_)
print(rfe2.ranking_)

# use inbuilt class feature_importances of tree based classifiers
print("ForExtraTreesClassifier by FE:")
print(model.feature_importances_)

print("RandomForestClassifier by FE:")
print(model2.feature_importances_)
def run(self):
    """Rank loan features against ``int_rate`` with several methods and save the table.

    Reads the CSV produced by ``processData``, keeps a fixed list of columns,
    expands them with ``createDummies`` and scores every numeric column with
    linear regression, ridge, lasso, randomized lasso, a random forest, RFE
    and the univariate F-test. The per-method scores plus their mean are
    written to ``testing.txt`` (tab separated, also echoed to stdout) and
    then re-saved as CSV at ``self.output().path``.

    NOTE(review): ``LinearRegression(normalize=True)`` and
    ``RandomizedLasso`` are not available in recent scikit-learn releases --
    confirm the version this task is pinned to.
    """
    loanfreature_df = pd.read_csv(
        processData(loginemail=self.loginemail,
                    loginpassword=self.loginpassword).output().path,
        low_memory=False,
        encoding='ISO-8859-1')

    # Target is the interest rate; remove it from the predictors
    Y = loanfreature_df.int_rate
    loanfreature_df.drop('int_rate', axis=1, inplace=True)

    # 'addr_state' used to be listed twice, which selected the column twice
    cols_to_keep = [
        'loan_amnt', 'term', 'emp_length', 'home_ownership_category',
        'annual_inc', 'verification_status_category', 'purpose',
        'addr_state', 'dti', 'delinq_2yrs', 'last_meanfico',
        'inq_last_6mths', 'open_acc', 'revol_bal', 'revol_util', 'total_acc',
        'mths_since_last_major_derog', 'funded_amnt_inv', 'installment',
        'application_type', 'pub_rec'
    ]
    loanfreature_df = loanfreature_df[cols_to_keep]
    loanfreature_df = createDummies(loanfreature_df)

    X = loanfreature_df._get_numeric_data()
    names = ["%s" % i for i in X]

    def _fit_quietly(estimator):
        """Fit ``estimator`` on (X, Y) with DeprecationWarnings silenced."""
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DeprecationWarning)
            estimator.fit(X, Y)
        return estimator

    ranks = {}

    lr = _fit_quietly(LinearRegression(normalize=True))
    ranks["Linear reg"] = rank_to_dict((lr.coef_), names)

    ridge = _fit_quietly(Ridge(alpha=7))
    ranks["Ridge"] = rank_to_dict((ridge.coef_), names)

    lasso = _fit_quietly(Lasso(alpha=.05))
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), names)

    rlasso = _fit_quietly(RandomizedLasso(alpha=0.00))
    ranks["Stability"] = rank_to_dict((rlasso.scores_), names)

    rf = _fit_quietly(RandomForestRegressor())
    ranks["RF"] = rank_to_dict(rf.feature_importances_, names)

    # stop the search when 15 features are left (they will get equal scores)
    rfe = _fit_quietly(RFE(lr, n_features_to_select=15))
    ranks["RFE"] = rank_to_dict(rfe.ranking_, X.columns, order=-1)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(f, names)

    # Mean score per feature across all methods collected so far
    r = {}
    for name in names:
        r[name] = round(
            np.mean([ranks[method][name] for method in ranks.keys()]), 2)

    # "Mean" is appended after sorting so it stays the last column
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    # f_rank = pd.DataFrame()
    print("\t%s" % "\t".join(methods))

    # The header row has one field fewer than the data rows, so the
    # read_csv call below ends up using the feature name as the index.
    # The context manager closes the file even if a row fails.
    with open("testing.txt", 'w') as out:
        out.write("\t".join(methods))
        out.write("\n")
        for name in names:
            scores = [ranks[method][name] for method in methods]
            out.write(name + "\t" + " \t".join(map(str, scores)))
            out.write("\n")
            print("%s\t%s" % (name, "\t".join(map(str, scores))))

    feature = pd.read_csv('testing.txt', sep='\t')
    feature.to_csv(self.output().path)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Time : 2018/9/29 17:36
@Author : LI Zhe
"""
import pandas as pd
from sklearn.svm import SVR
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt

# Last column is the target, everything before it is a feature
data_train = pd.read_csv('../data/new_train_feature.csv',
                         low_memory=False,
                         encoding='gbk')
train_x = data_train.iloc[:, :-1]
train_y = data_train.iloc[:, -1]

# Create the RFE object: linear SVR, keep 20 features, drop one per step
svr = SVR(kernel="linear", C=1)
rfe = RFE(estimator=svr, n_features_to_select=20, step=1)
rfe.fit(train_x, train_y)

# ranking_ holds one entry per feature. The former
# reshape(train_x[0].shape) looked up a column labelled 0 and used the
# row count as the shape, which cannot match. Show the ranking as a single
# row instead, since matshow needs a 2-D array.
ranking = rfe.ranking_.reshape(1, -1)

plt.matshow(ranking)
plt.colorbar()
plt.title("Ranking of pixels with RFE")
plt.show()
# Series.from_csv no longer exists in pandas. This read_csv call uses the
# same settings it applied (first column as a date-parsed index) and
# squeezes the single data column back into a Series.
series = pd.read_csv('./dataset/monthly-car-sales-in-quebec-1960.csv',
                     header=0,
                     index_col=0,
                     parse_dates=True).squeeze("columns")

# Make the series stationary with a 12-step (seasonal) difference
diff = series.diff(12)[12:]

# Autocorrelation plots
plot_acf(diff)
plot_pacf(diff)

# Build the lagged columns t-1 .. t-12
df = pd.DataFrame()
df['t'] = diff
for i in range(1, 13):
    df['t-{0}'.format(str(i))] = diff.shift(i)
# The first 12 rows contain NaNs introduced by shifting
df = df.iloc[12:, :]

# Feature importance from a random forest
X = df.values[:, 1:]
y = df.values[:, 0]
model = rfr(500, random_state=1)
model.fit(X, y)
fi = model.feature_importances_
plt.bar(np.arange(1, fi.size + 1), fi)

# Feature selection with RFE (keep 4 lags). n_features_to_select is passed
# by keyword because current scikit-learn releases no longer accept it
# positionally.
rfe = RFE(rfr(500, random_state=1), n_features_to_select=4)
fit = rfe.fit(X, y)
print(df.columns[1:][fit.support_])
plt.bar(np.arange(1, fit.support_.size + 1), fit.support_)
plt.bar(np.arange(1, fit.ranking_.size + 1), fit.ranking_)
# Shuffle the rows and, independently, the descriptor column order
data2 = data.sample(frac=1).reset_index(drop=True)
random.shuffle(headers2)

# split the truth data and the descriptors
y = data2['result']
finaldata = data2[headers2]

F1Scores = pd.DataFrame(columns=['F1'])

# custom recursive feature elimination: one RFE run per target feature count
for nfeat in range(10, 100):
    print('the number of features is ', nfeat)

    # RFE works over the entire dataset
    selector = RFE(estimator=logmodel, n_features_to_select=nfeat, step=10)
    selector = selector.fit(finaldata, y)
    rfe_fits = selector.ranking_

    columnNames = finaldata.columns
    rankedColumnns_Raw = pd.DataFrame(
        data={'Rank': selector.ranking_, 'Name': columnNames})

    # data to use: descriptors ranked 1, i.e. the ones RFE kept
    kept = rankedColumnns_Raw['Rank'] == 1
    evalData = rankedColumnns_Raw.loc[kept, 'Name'].tolist()

    # One column of kept descriptor names, labelled with this feature count
    tempSelDesc = rankedColumnns_Raw.loc[kept, ['Name']].reset_index(drop=True)
    tempSelDesc = tempSelDesc.rename(columns={'Name': nfeat})

    # add the data from this iteration to the existing data
    selectedDesc = pd.concat([selectedDesc, tempSelDesc],
                             ignore_index=True,
                             sort=False,
                             axis=1)
X = df_final.loc[:, df_final.columns != 'y']
y = df_final.loc[:, df_final.columns == 'y']

#%% start over-sampling by importing SMOTE (Synthetic Minority Oversampling Technique)
from sklearn.model_selection import train_test_split
#train_test_split on predictors X and target Y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns #columns is a list of the predictor labels

#%% Recursive feature selection for regression
from sklearn.linear_model import LogisticRegression as LR
df_final_vars=df_final.columns.values.tolist()
# NOTE: X and y are rebound to lists of column names here; the split frames
# above stay available as X_train / y_train.
y=['y']
X=[i for i in df_final_vars if i not in y]
from sklearn.feature_selection import RFE
logreg = LR(solver='liblinear', max_iter=200)
# Keep 20 predictors. n_features_to_select is passed by keyword because
# current scikit-learn releases no longer accept it positionally.
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train.values.ravel())

#%%
rfe_result= pd.DataFrame(list(zip(X_train.columns.values,rfe.support_,rfe.ranking_)),columns=['predictor', 'yes', 'rank'])
rfe_selected=rfe_result[rfe_result['yes']==1].predictor

#%%
# Drop these dummy columns from the RFE selection by hand
rfe_selected=[ele for ele in rfe_selected if ele not in {'marital_unknown', 'default_no', 'default_unknown', 'contact_cellular', 'contact_telephone', 'poutcome_failure', 'poutcome_success', 'poutcome_nonexistent'}]
X=X_train[rfe_selected]
y=y_train['y']

#%%
# NOTE(review): this re-splits the former training portion and overwrites
# the original X_test / y_test -- confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
logreg = LR(solver='lbfgs', max_iter=200)
logreg.fit(X_train, y_train)
y_pred_lr = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

#%% run Log regression
from sklearn.metrics import confusion_matrix
dataset1 = dataset.drop(['Unnamed: 0'], axis=1)
trg = dataset1[['Y']]
trn = dataset1.drop(['Y'], axis=1)
Y = np.array(trg, dtype=np.float32)
X = np.array(trn, dtype=np.float32)
normalized_X = preprocessing.normalize(trn)
standardized_X = preprocessing.scale(trn)

# Tree-based feature importances
model = ExtraTreesClassifier()
model.fit(trn, trg)
print(model.feature_importances_)

model = LinearRegression()
# create the RFE model and select 3 attributes
# (n_features_to_select is passed by keyword because current scikit-learn
# releases no longer accept it positionally)
rfe = RFE(model, n_features_to_select=3)
rfe = rfe.fit(trn, trg)
# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

model = LinearRegression()
model.fit(trn, trg)
print(model)
# make predictions
expected = trg
predicted = model.predict(trn)
# summarize the fit of the model
# LinearRegression predicts continuous values, which classification_report
# and confusion_matrix reject, so regression metrics are reported instead.
print(metrics.mean_squared_error(expected, predicted))
print(metrics.r2_score(expected, predicted))
#models = [LinearRegression(),  # ordinary least squares
# Hold out one subject: drop its samples from both index sets
sub_out = np.unique(sub_labels)[it]
# sub_out = np.unique(sub_labels)[np.random.randint(0, 20, 1)]
keep_train = np.logical_not(train_subs == sub_out)
keep_test = np.logical_not(test_subs == sub_out)
train_inds = org_train_inds[keep_train]
test_inds = org_test_inds[keep_test]

best_pvalue = 1
best_acc = 0

# The same train/test slices are used for every k
train_X, train_y = FS_mask[train_inds], labels[train_inds]
test_X, test_y = FS_mask[test_inds], labels[test_inds]

# for k in [80]:
# NOTE(review): the original nesting was lost; everything below is assumed
# to sit inside the loop over k -- confirm.
for k in np.arange(25, 1000, 25):
    print('-' * 80)
    print('k=%i' % k)

    # step=0.5 removes half of the remaining features per iteration
    selector = RFE(clf, n_features_to_select=k, step=0.5, verbose=0)
    selector = selector.fit(train_X, train_y)
    chosen = selector.support_

    clf.fit(train_X[:, chosen], train_y)
    acc = clf.score(test_X[:, chosen], test_y)
    print('Total acc: %.3f' % acc)

    # dump: coefficients scattered back into the full feature space
    meta_space = np.zeros(chosen.shape, dtype=np.float32)
    meta_space[chosen] = clf.coef_
    brain_coef_nii = meta_mask.inverse_transform(meta_space)
    brain_coef_nii.to_filename(
        'train_verbs_nomen_predict_hand_objects_0.54accuracy.nii.gz')
    meta_space[selector.support_] = clf.coef_
# Fixed row ranges for the validation and test partitions
x_validation = x_use.iloc[454:605, :]
y_validation = y[454:605]
x_test = x_use.iloc[605:757, :]
y_test = y[605:757]

###### normalization: fit on the training part only, apply to all three
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(x_train)
n_x_train = scaler.transform(x_train)
n_x_validation = scaler.transform(x_validation)
n_x_test = scaler.transform(x_test)

###### rfe
svc = SVC(kernel="linear")
#model = LogisticRegression()  # set the algorithm to logistic regression
# select the 100 best feature variables with RFE
rfe = RFE(svc, n_features_to_select=100)
# run the recursive elimination
selector = rfe.fit(n_x_train, y_train)
selector.support_
selector.ranking_

# Keep only the selected columns in every partition
keep = selector.support_
new_x_train = n_x_train[:, keep]
new_x_validation = n_x_validation[:, keep]
new_x_test = n_x_test[:, keep]
new_x_train.shape
new_x_validation.shape
new_x_test.shape

rfe_columns = selector.support_
if name == 'all_subset':
    # Slice the selection mask into fixed column ranges, one per group
    rfe_baseline = np.array(rfe_columns[1:23])
    rfe_time_frequency = np.array(rfe_columns[23:34])
    rfe_vocal_fold = np.array(rfe_columns[34:56])
# machine learning modeling
# feature engineering (find the variables that give the max adjusted R2)
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE

estimator = LinearRegression() #use regression model for regression problem
list_r2=[]
# Start below any reachable score so sel_features is always assigned, even
# when every adjusted R2 is <= 0 (starting at 0 left it undefined then).
max_r2 = float("-inf")
n_samples, n_features = X_scaled.shape
for i in range(1, n_features + 1):
    # n_features_to_select is passed by keyword because current
    # scikit-learn releases no longer accept it positionally.
    selector = RFE(estimator, n_features_to_select=i, step=1)
    selector = selector.fit(X_scaled, y_scaled)
    # Adjusted R2 penalises the plain R2 for the number of features kept.
    # NOTE(review): the denominator is zero when n_samples == i + 1.
    adj_r2 = 1 - ((n_samples-1)/(n_samples-i-1))*(1-selector.score(X_scaled, y_scaled))
    list_r2.append(adj_r2)# mse =
    if max_r2 < adj_r2:
        sel_features = selector.support_
        max_r2 = adj_r2

X_sub = X_scaled.iloc[:,sel_features]
X_sub.columns.tolist() #selected features

#split training and test set
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_sub,y,random_state=0)
clf = linear_model.Lasso(alpha=0.1)
res = clf.fit(train_features, train_labels)
score = res.score(test_features, test_labels)
print("LASSO regression has a score of {} out of sample".format(
    score.round(4)))

# #### let's be more strict about features - rank and remove

# In[72]:

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
# Keep the 20 best-ranked features. n_features_to_select is passed by
# keyword because current scikit-learn releases no longer accept it
# positionally.
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(features, labels.values.ravel())
print(rfe.support_)
print(rfe.ranking_)
# Notebook cell output: the value is only displayed interactively
rfe.score(test_features, test_labels)

# ### remove a few features according to rfe results

# In[73]:

labels = varSelection["defaulted"]
features = varSelection.drop(columns=[
    "defaulted", "loan_status", "application_type_JOINT",
    "home_ownership_ANY", "home_ownership_NONE"
])
features.info()
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

data = pd.read_csv("python-ml-course-master/datasets/ads/Advertising.csv")

features_cols = ["TV", "Radio", "Newspaper"]
x= data[features_cols]
y = data["Sales"]

estimator = SVR(kernel = "linear") # creates a linear model
# Ask Recursive Feature Elimination to reduce the model to 2 predictors.
# n_features_to_select is passed by keyword because current scikit-learn
# releases no longer accept it positionally.
selector = RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(x,y)
print(selector.support_)
print(selector.ranking_)

# The two predictors are written out by hand rather than taken from
# selector.support_.
X_pred = x[["TV","Radio"]]
lm = LinearRegression() # create the linear regression model
lm.fit(X_pred, y) # fit the model to our data
print(lm.intercept_) # intercept (alpha)
print(lm.coef_) # coefficients (betas)
print(lm.score(X_pred, y)) # R2
# Read contents of the file
dataframe = pandas.read_csv('https://modcom.co.ke/bigdata/datasets/pima.csv')
pandas.set_option('display.max_columns', 9)
print(dataframe)

# First 8 columns are the predictors, the 9th is the target
array = dataframe.values
print(array)
X = array[:, 0:8]
print(X)
y = array[:, 8]
print(y)

# Identify features that won't be good predictors
from sklearn.feature_selection import RFE
rfc = RandomForestClassifier(n_estimators=40)
# Keep 5 predictors. n_features_to_select is passed by keyword because
# current scikit-learn releases no longer accept it positionally.
rfe = RFE(rfc, n_features_to_select=5)
fitted = rfe.fit(X, y)
print('Selected columns: ', fitted.support_)

# Create a new dataset for the best predictors
# (the column names are listed by hand rather than taken from
# fitted.support_)
subset = dataframe[([
    'Glucose', 'BloodPressure', 'BMI', 'DiabetesPedigreeFunction', 'Age'
])]
print(subset)

# Obtain the values of the new dataset
subsetArray = subset.values
Xnew = subsetArray[:, 0:5]
print(Xnew)

# Establish the training and testing sets
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    Xnew, y, test_size=0.10, random_state=7)

# Pick an algorithm
Labels = URLS['Result'] Training_Data, Testing_Data = train_test_split(URLS_Without_Labels, test_size=0.25, random_state=150) Training_Labels, Testing_Labels = train_test_split(Labels, test_size=0.25, random_state=150) Model = LogisticRegression(random_state=0) Rfe = RFE(Model, 15) Fit = Rfe.fit(Training_Data, Training_Labels) Prediction_Labels = Rfe.predict(Testing_Data) New_Data = Rfe.transform(URLS_Without_Labels) # print(label.shape) df = pd.DataFrame(New_Data) df.to_csv('RFElogreg.csv') Confusion_Matrix = confusion_matrix(Testing_Labels, Prediction_Labels) print("\nNumber Of Features: %d\n" % Fit.n_features_) print("Selected Features: \n%s\n" % Fit.support_) print("Feature Ranking: \n%s\n" % Fit.ranking_) print("Training Accuracy Score Obtained is: {0:.2f}%".format(