def rank_features_rfe(X, y, featureset):
    """Rank features by importance using recursive feature elimination.

    :param X: A training set of features.
    :param y: A target set (aka class labels for the training set).
    :param featureset: An instance of a featureset (such as
        Basic9Extractor()). Its ``features`` attribute is iterated in column
        order to label each rank.
    :rtype: A list of ``(feature, rank)`` tuples sorted by ascending RFE
        rank, i.e. the most important feature (rank 1) comes first.
    """
    # FIXME: Use an RBF SVC to rank features. It is likely that the
    # "importance" rankings derived from a LinearSVC are similar as an RBF
    # kernel SVM, but, for safety's sake, it is best to assume they are not.
    #
    # RFE fits the estimator itself, so no separate pre-fit is required.
    # n_features_to_select is passed by keyword because recent scikit-learn
    # releases no longer accept it positionally.
    ranker = RFE(LinearSVC(), n_features_to_select=1, step=1)
    ranker.fit(X, y)

    # Get the names of the feature columns.
    # FIXME: Duplicate code from rank_features. Make this its own function.
    feat_importance = OrderedDict(
        (func, ranker.ranking_[index])
        for index, func in enumerate(featureset.features)
    )
    return sorted(feat_importance.items(), key=lambda item: item[1])
def select_features(X, y, random_state, kernel='linear', C=1.0, num_attributes=3):
    """
    Uses Support Vector Classifier as the estimator to rank features
    with Recursive Feature Elimination.

    Parameters
    ----------
    X: A pandas.DataFrame. Attributes.
    y: A pandas.DataFrame. Labels.
    random_state: A RandomState instance. Used in SVC().
    kernel: A string. Used in SVC(). Default: "linear".
    C: A float. Used in SVC(). Default: 1.0.
    num_attributes: An int. The number of features to select in RFE. Default: 3.

    Returns
    -------
    A 3-tuple of (RFE, list, np.ndarray)
    model: The fitted RFE instance.
    columns: Labels of the selected columns of X, in column order.
    ranking: The feature ranking. Selected features are assigned rank 1.
    """
    # Keyword arguments: recent scikit-learn releases reject positional
    # hyper-parameters for both SVC and RFE.
    estimator = svm.SVC(C=C, kernel=kernel, random_state=random_state)
    rfe = RFE(estimator, n_features_to_select=num_attributes)
    model = rfe.fit(X, y.values.ravel())

    # Iterating a DataFrame yields its column labels.
    columns = [label for label, keep in zip(X, rfe.support_) if keep]
    return model, columns, rfe.ranking_
def get_best_cols(df):
    """Select the best columns with RFE.

    The categorical columns are one-hot encoded, the numeric columns are
    standardised, and a linear SVR drives RFE down to 40 features, removing
    two per step.

    :param df: DataFrame holding the target ``X1`` and the predictor columns
        named below.
    :returns: pandas Index with the labels of the selected columns (numeric
        names first, then dummy-column labels).
    """
    # factors -- each column listed once; the original list repeated X12 and
    # X14, which fed exact duplicate dummy columns into the model.
    factor_cols = ["X7", "X8", "X9", "X11", "X12", "X14", "X32"]
    # dataframe with factors blown out
    df_f = pd.concat([pd.get_dummies(df[col]) for col in factor_cols], axis=1)

    # numerics
    RFE_col_list = ["X4", "X5", "X6", "X13", "X21", "X22", "X29", "X30", "X31"]
    # dataframe with numerics (.loc replaces the removed .ix indexer)
    df_n = df.loc[:, RFE_col_list]
    X = StandardScaler().fit_transform(np.asarray(df_n))

    # add in factors
    X = np.concatenate([X, np.asarray(df_f)], axis=1)
    # leave y alone
    y = df.X1

    # I don't like to guess yes this is only linear relationships
    estimator = SVR(kernel="linear")
    selector = RFE(estimator, n_features_to_select=40, step=2)
    selector = selector.fit(X, y)

    # Labels in the same order as the columns of X. Index.append concatenates;
    # `+` on two Index objects does not.
    df_index = df_n.columns.append(df_f.columns)
    best_cols = df_index[selector.support_]
    return best_cols
def recursive_feature_elimination(config_learning, config_data, number_features):
    """Run RFE with the configured learner and write the feature ranks.

    One ``name<TAB>rank`` line per feature is written to
    ``feature_ranks.txt`` inside the configured models directory and echoed
    to stdout.

    :param config_learning: mapping with the 'x_train', 'y_train', 'x_test'
        paths and an optional 'scale' flag (default True).
    :param config_data: config object providing the ("Learner", "models")
        directory and the feature combinations.
    :param number_features: number of features RFE should keep.
    :returns: predictions of the fitted RFE model on the test set.
    """
    ranks_path = os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt"

    # The context manager closes the file even if loading or fitting fails.
    with open(ranks_path, "w") as output:
        feature_names = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

        x_train = read_features_file(config_learning.get('x_train'), '\t')
        y_train = read_reference_file(config_learning.get('y_train'), '\t')
        x_test = read_features_file(config_learning.get('x_test'), '\t')

        estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

        if config_learning.get("scale", True):
            x_train, x_test = scale_datasets(x_train, x_test)

        rfe = RFE(estimator, n_features_to_select=number_features, step=1)
        rfe.fit(x_train, y_train)

        for i, name in enumerate(feature_names):
            line = name + "\t" + str(rfe.ranking_[i])
            output.write(line + "\n")
            print(line)

        predictions = rfe.predict(x_test)

    return predictions
def test_deeply_nested():
    """Check the repr of seven nested RFE estimators around a LogisticRegression."""
    # Render a deeply nested estimator
    rfe = RFE(RFE(RFE(RFE(RFE(RFE(RFE(LogisticRegression())))))))
    # NOTE(review): the expected text is compared character-for-character with
    # the repr. In this view the literal sits on a single line; its original
    # line breaks and alignment appear to have been lost -- restore them from
    # the upstream test before relying on this assertion.
    expected = """ RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, l1_ratio=None, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False), n_features_to_select=None, step=1, verbose=0), n_features_to_select=None, step=1, verbose=0), n_features_to_select=None, step=1, verbose=0), n_features_to_select=None, step=1, verbose=0), n_features_to_select=None, step=1, verbose=0), n_features_to_select=None, step=1, verbose=0), n_features_to_select=None, step=1, verbose=0)"""
    # Drops the first character of the literal (the original comment says this
    # is the leading newline).
    expected = expected[1:]  # remove first \n
    assert rfe.__repr__() == expected
def recursiveFeatureSelector(classifier_model,train_data,train_labels,test_data,number_of_features):
    """Reduce train and test data to the features chosen by RFE.

    RFE is fitted on the training data only; the same column selection is
    then applied to the test data.

    :param classifier_model: estimator driving the elimination.
    :param train_data: training feature matrix.
    :param train_labels: training targets.
    :param test_data: test feature matrix with the same columns as train_data.
    :param number_of_features: number of features to keep.
    :returns: tuple ``(transformed_train_data, transformed_test_data)``.
    """
    # Keyword form: recent scikit-learn rejects a positional feature count.
    rfe = RFE(classifier_model, n_features_to_select=number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data
def doRFE(self, X, y):
    """Rank every feature with a linear-SVC RFE and store the importances.

    Sets ``self.numFeatures`` to the column count of X and
    ``self.feature_importances_`` to ``self._getImportances`` applied to the
    full RFE ranking.
    """
    self.numFeatures = X.shape[1]
    # Eliminating down to one feature, one per step, ranks every feature.
    eliminator = RFE(
        estimator=SVC(kernel="linear", C=self.C),
        n_features_to_select=1,
        step=1,
    )
    eliminator.fit(X, y)
    self.feature_importances_ = self._getImportances(eliminator.ranking_)
def feature_sorting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
    """Rank features with SVR-driven recursive feature elimination.

    :param features_values_temp: flat iterable of feature values.
    :param rows_temp: number of rows; non-integral values are rounded up and
        non-positive values become 0 (same result as the original counting
        loop).
    :param columns_temp: number of columns; normalised like ``rows_temp``.
    :param prediction_values_temp: iterable of target scores.
    :param kernel: kernel name passed to SVR.
    :param threshold: must be convertible to float; otherwise unused.
    :returns: RFE ranking as a plain list with one entry per feature.
    """
    import math

    rows = max(0, int(math.ceil(float(rows_temp))))
    columns = max(0, int(math.ceil(float(columns_temp))))

    features_values = list(features_values_temp)
    prediction_values = list(prediction_values_temp)

    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)

    # The original converted threshold and never used the result; the call is
    # kept so a non-numeric threshold still raises.
    float(threshold)

    # try to change to the model for which the test is gonna run
    # (lasso, ridge, etc.)
    estimator = SVR(kernel=kernel)
    # NOTE(review): n_features_to_select=0 is kept from the original. Recent
    # scikit-learn releases may reject 0, and using 1 instead would shift
    # every returned rank by one -- confirm with the caller before changing.
    selector = RFE(estimator, n_features_to_select=0, step=1)
    selector = selector.fit(rotated, scores)
    return selector.ranking_.tolist()
def test_main():
    """Fit a two-feature RFE with a linear SVR on iris and print the support mask."""
    iris = load_iris()
    x, y = iris.data, iris.target
    estimator = SVR(kernel="linear")
    # Keyword form: recent scikit-learn rejects a positional feature count.
    selector = RFE(estimator, n_features_to_select=2, step=1)
    selector = selector.fit(x, y)
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(selector.support_)
def ref(X, y, n_features_to_select=1, kernel='linear'):
    """Fit RFE around a class-balanced SVC and return the fitted selector.

    :param X: feature matrix.
    :param y: class labels.
    :param n_features_to_select: desired number of features (default 1).
    :param kernel: kernel passed to SVC (default 'linear').
    :returns: the fitted RFE object; its ``support_`` and ``ranking_``
        attributes hold the mask and ranking of the features.
    """
    balanced_svc = SVC(kernel=kernel, class_weight='balanced')
    eliminator = RFE(
        balanced_svc,
        n_features_to_select=n_features_to_select,
        step=1,
    )
    # fit() returns the selector itself.
    return eliminator.fit(X, y)
def recursiveFeatureSelection():
    """Fit a rank-everything RFE on the shared training data and print it.

    Reads ``trainingData`` and ``trainingDataLabels``, which are resolved
    outside this function, and converts both to float arrays.
    """
    X = np.array(trainingData, dtype=float)
    y = np.array(trainingDataLabels, dtype=float)

    # Keywords are required for correctness here: SVC's first parameter is C,
    # so the original SVC("linear", 1) set C="linear" and kernel=1.
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(svc, n_features_to_select=1, step=1)
    rfe.fit(X, y)
    print(rfe)
def featSelect(label,trainSet,trainObs,cv,numFeat=5,SEED=34,name=''):
    """Greedy feature selection seeded by RFE and scored by cross-validated AUC.

    Steps:
      1. RFE with logistic regression keeps ``numFeat`` features (rank 1).
      2. Lower-ranked features are tried in rank order and kept when they
         raise the cross-validated AUC.
      3. Each kept feature is dropped if removing it raises the AUC.
      4. Every feature not yet kept is tried once more.

    :param label: key used in the returned dict.
    :param trainSet: pandas DataFrame of predictors.
    :param trainObs: binary targets aligned with the rows of trainSet.
    :param cv: iterable of ``(train_idx, valid_idx)`` positional index pairs.
    :param numFeat: number of features RFE keeps initially.
    :param SEED: random_state for the logistic regression.
    :param name: prefix for the final printed line.
    :returns: ``{label: list_of_selected_column_names}``.
    """
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    from numpy import zeros

    model = LogisticRegression(random_state=SEED)
    predCv = zeros(len(trainObs))
    # Materialise the folds: they are replayed many times, which would
    # silently exhaust a generator.
    folds = list(cv)

    def _rows(data, idx):
        # Positional row selection; replaces the removed .ix indexer.
        return data.iloc[idx] if hasattr(data, "iloc") else data[idx]

    def _cv_auc(columns):
        # Out-of-fold AUC of `model` restricted to `columns`.
        subset = trainSet[columns]
        for tr, vl in folds:
            model.fit(_rows(subset, tr), _rows(trainObs, tr))
            predCv[vl] = model.predict_proba(_rows(subset, vl))[:, 1]
        return roc_auc_score(trainObs, predCv)

    rfe = RFE(model, n_features_to_select=numFeat, step=1)
    rfe.fit(trainSet, trainObs)
    selected = list(trainSet.columns[rfe.ranking_ == 1])

    # Baseline on the RFE selection. The original got its baseline by
    # re-adding the rank-1 columns, which duplicated every selected column.
    auc = _cv_auc(selected)

    # Forward pass over the remaining ranks, including the last one.
    for rank in range(2, max(rfe.ranking_) + 1):
        candidates = list(trainSet.columns[rfe.ranking_ == rank])
        score = _cv_auc(selected + candidates)
        if score > auc:
            auc = score
            selected += candidates

    # Backward pass. Iterate over a copy: removing from the list being
    # iterated skips elements.
    for col in list(selected):
        remaining = [c for c in selected if c != col]
        if not remaining:
            continue
        score = _cv_auc(remaining)
        if score > auc:
            auc = score
            selected.remove(col)

    # Final forward pass over everything still excluded.
    for col in [c for c in trainSet.columns if c not in selected]:
        score = _cv_auc(selected + [col])
        if score > auc:
            auc = score
            selected.append(col)

    # Same text the Python 2 print statement produced.
    print("%s Final AUC:  %s" % (name, auc))
    return {label: selected}
def get_model_RFE_top_features(self,expression_file,ic50_file,target_features,drug):
    """Return the expression-frame index labels kept by RFE for one drug.

    :param expression_file: expression data source passed to ``dfm``.
    :param ic50_file: IC50 data source passed to ``dfm``.
    :param target_features: number of features to keep (converted with int()).
    :param drug: drug identifier passed to ``dfm``.
    :returns: list of ``expression_frame.index`` labels whose RFE support
        flag is set.
    """
    expression_frame, ic50_series = dfm.get_expression_frame_and_ic50_series_for_drug(
        expression_file, ic50_file, drug, normalized=True, trimmed=True, threshold=None)
    scikit_data, scikit_target = dfm.get_scikit_data_and_target(expression_frame, ic50_series)

    # Remove roughly 1% of the features per iteration, at least one.
    n_features = len(scikit_data.tolist()[0])
    step_length = n_features // 100 + 1

    selector = RFE(self.model, n_features_to_select=int(target_features), step=step_length)
    selector.fit(scikit_data, scikit_target)

    # enumerate replaces the Python-2-only xrange loop.
    return [
        feature
        for i, feature in enumerate(expression_frame.index)
        if selector.support_[i]
    ]
def LogReg(X_train, X_test, y_train, y_test, Min_N_Feat, Max_N_Feat, mask='None',weights='auto'):
    """Train one logistic regression per feature count, selecting features with RFE.

    For every feature count in ``[Min_N_Feat, Max_N_Feat]`` a model is fitted
    on the selected columns and used to predict the train and test sets.

    :param X_train: training feature array (indexed as ``X[:, bool_mask]``).
    :param X_test: test feature array.
    :param y_train: training labels.
    :param y_test: test labels (only the shape is used).
    :param Min_N_Feat: smallest number of features to try.
    :param Max_N_Feat: largest number of features to try.
    :param mask: ``'None'`` (or ``None``) to select features with RFE, or a
        boolean array with one row per feature count and one column per
        feature to reuse an earlier selection.
    :param weights: ``class_weight`` for the logistic regression. The legacy
        value ``'auto'`` is mapped to ``'balanced'``.
    :returns: ``(Pred_Train, Pred_Test, mask)``; the prediction matrices hold
        one column per feature count, in increasing order.
    """
    from sklearn.feature_selection import RFE  # recursive feature elimination
    from sklearn.linear_model import LogisticRegression as LogR

    n_models = Max_N_Feat - Min_N_Feat + 1

    # Decide once whether RFE must run. The original re-tested mask == 'None'
    # after mask had been replaced by an array, so RFE never ran.
    run_rfe = mask is None or (isinstance(mask, str) and mask == 'None')
    if run_rfe:
        mask = np.zeros((n_models, int(X_train.shape[1])), dtype='bool')

    # 'auto' was removed from scikit-learn; 'balanced' is its replacement.
    class_weight = 'balanced' if (isinstance(weights, str) and weights == 'auto') else weights

    # One prediction column per feature count.
    Pred_Train = np.zeros((int(max(y_train.shape)), n_models), dtype='int')
    Pred_Test = np.zeros((int(max(y_test.shape)), n_models), dtype='int')

    print('Logistic Regression: Training...')
    for ift in range(Min_N_Feat, Max_N_Feat + 1):
        # Row/column for this feature count; the original indexed the
        # prediction columns with ift-1, which is only right when
        # Min_N_Feat == 1.
        slot = ift - Min_N_Feat
        LogReg_obj = LogR(C=1e3, class_weight=class_weight)
        if run_rfe:
            rfe = RFE(LogReg_obj, n_features_to_select=ift)
            rfe = rfe.fit(X_train, y_train)
            mask[slot, :] = rfe.support_
        selected = mask[slot, :]
        LogReg_obj.fit(X_train[:, selected], y_train)
        Pred_Train[:, slot] = LogReg_obj.predict(X_train[:, selected])
        Pred_Test[:, slot] = LogReg_obj.predict(X_test[:, selected])
        print('Logistic Regression: Predicting... %d %%' % (100 * (slot + 1) // n_models))

    print('Logistic Regression: Completed!')
    return Pred_Train, Pred_Test, mask
def remove_one_feature(X, Y, names):
    """Return the name of the feature that linear-regression RFE ranks last.

    The sorted ``(rank, name)`` pairs are printed before returning.

    :param X: feature matrix.
    :param Y: regression targets.
    :param names: feature names in column order.
    :returns: name paired with the largest rank (ties resolved by name order).
    """
    eliminator = RFE(LinearRegression(), n_features_to_select=1)
    eliminator.fit(X, Y)

    rank = [
        (round(position, 4), feature)
        for position, feature in zip(eliminator.ranking_, names)
    ]
    rank.sort()
    print(rank)

    _, worst_feature = rank[-1]
    return worst_feature
def selectFeaturesFromSubsetRecursive(self,subset,numFeatures):
    """Select ``numFeatures`` columns out of ``subset`` with linear-SVC RFE.

    :param subset: column indices of ``self.instances`` to consider.
    :param numFeatures: number of features to keep.
    :returns: integer indices of the kept features, relative to the columns
        of ``self.instances[:, subset]`` (not to the full matrix).
    """
    # The parameter is `class_weight`; the original `class_weights=` keyword
    # is not accepted. 'balanced' replaces the removed 'auto' value.
    model = svm.LinearSVC(class_weight='balanced')
    rfe = RFE(model, n_features_to_select=numFeatures)
    rfe = rfe.fit(self.instances[:, subset], self.classes)
    # summarize the selection of the attributes
    # print(rfe.get_support(indices=True))
    # print(rfe.ranking_)
    return rfe.get_support(indices=True)
def buildTree(self,depth):
    """Create the decision tree and run RFE over the training data.

    Sets ``self.tree`` to an entropy-criterion DecisionTreeClassifier limited
    to ``depth`` levels, then fits a two-feature RFE on ``self.X_train`` /
    ``self.Y_train``.

    :param depth: maximum depth of the tree.
    """
    # Here, we define the parameters of our tree and use a feature selection
    # algorithm (RFE) to pick out the strongest features.
    self.tree = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)
    selector = RFE(self.tree, n_features_to_select=2, step=1)
    selector.fit(self.X_train, self.Y_train)
    # NOTE(review): the original ended with the bare expressions
    # `selector.support_` and `selector.ranking_`, which have no effect. The
    # fitted selector is discarded here, exactly as before; decide whether it
    # should be stored on the instance.
def rec_feature_elim(data,num_features=17700):
    """Select genes with linear-SVC RFE and report the selected ones.

    :param data: object providing ``get_gene_exp_matrix()`` and
        ``get_labels()``.
    :param num_features: number of features RFE keeps (default 17700).
    """
    X = data.get_gene_exp_matrix()
    y = data.get_labels()

    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=num_features, step=1)
    selector = rfe.fit(X, y)

    # Truthiness, not identity: the entries of support_ are numpy booleans,
    # for which `x is True` is always False. A list (rather than a lazy map
    # object) keeps the mask reusable on Python 3.
    mask = [1 if selected else 0 for selected in selector.support_]
    print_genes_nonzero_coeff(data, mask)
def build_model(x,y,no_features):
    """Fit and return an RFE selector wrapped around a linear regression.

    :param x: feature matrix.
    :param y: regression targets.
    :param no_features: number of features RFE keeps.
    :returns: the fitted RFE object.
    """
    # NOTE(review): `normalize` may not be accepted by recent scikit-learn
    # releases -- confirm against the pinned version.
    regressor = LinearRegression(fit_intercept=True, normalize=True)
    selector = RFE(n_features_to_select=no_features, estimator=regressor)
    # fit() returns the selector itself.
    return selector.fit(x, y)
def recursive_feature_elimination(X, y):
    """Select 3 attributes with logistic-regression RFE and print the result.

    Prints the boolean support mask followed by the feature ranking.

    :param X: feature matrix.
    :param y: class labels.
    """
    model = LogisticRegression()
    # create the RFE model and select 3 attributes (keyword form: recent
    # scikit-learn rejects a positional feature count)
    rfe = RFE(model, n_features_to_select=3)
    rfe = rfe.fit(X, y)
    # summarize the selection of the attributes
    print(rfe.support_)
    print(rfe.ranking_)
def feature_selection(X, y):
    """Keep the 10 best features according to RFE and return the reduced X.

    Prints the number of selected features, the support mask, the ranking
    and the score of the fitted selector on (X, y).

    :param X: feature matrix.
    :param y: targets.
    :returns: X restricted to the selected columns.
    """
    model = LR()
    rfe = RFE(model, n_features_to_select=10)
    fit = rfe.fit(X, y)
    # Format inside the call: `print(...) % value` applies % to print's
    # return value on Python 3. The one-element tuples keep an array
    # argument from being treated as a sequence of format values.
    print("Num Features: %d" % fit.n_features_)
    print("Selected Features: %s" % (fit.support_,))
    print("Feature Ranking: %s" % (fit.ranking_,))
    print(fit.score(X, y))
    return fit.transform(X)
def quick_rfe(estimator, X, y):
    """Rank all columns of X with RFE.

    :param estimator: estimator driving the elimination.
    :param X: DataFrame of features (its ``columns`` supply the names).
    :param y: targets.
    :returns: ``(names_sorted_by_rank, ranking)`` where ``ranking`` is the
        raw ``ranking_`` array in column order.
    """
    selector = RFE(estimator=estimator, n_features_to_select=1)
    selector.fit(X, y)

    ranked_pairs = list(zip(selector.ranking_, X.columns.tolist()))
    ranked_pairs.sort()
    ordered_names = [column for _, column in ranked_pairs]
    return ordered_names, selector.ranking_
class LogReg:
    """
    Initialization sets the objects model, vectorizer, labels, and corpus
    variables. Initialization also performs the initial training for the
    model and vectorizer using the given reviews.

    Each review is indexed as ``review[0]`` (label) and
    ``review[1]["text"]`` (document text).
    """

    def __init__(self, reviews, vectorizer=None, model=None):
        """Vectorize the reviews, select features with RFE and fit the model.

        :param reviews: iterable of reviews (see class docstring).
        :param vectorizer: text vectorizer; a fresh TfidfVectorizer when None.
        :param model: classifier; a fresh LogisticRegression when None.
        """
        # Defaults are created per instance. As default arguments the
        # estimator objects were built once and shared by every LogReg.
        if vectorizer is None:
            # NOTE(review): max_df = 1 is an integer, which the vectorizer
            # presumably treats as an absolute document count rather than a
            # proportion -- confirm 1.0 was not intended.
            vectorizer = TfidfVectorizer(stop_words='english', max_df=1, ngram_range=(1, 2))
        if model is None:
            model = LogisticRegression()

        self.model = model
        self.vectorizer = vectorizer
        self.selector = RFE(self.model, step=100, verbose=100)

        corpus = []
        labels = []
        for review in reviews:
            corpus.append(review[1]["text"])
            labels.append(review[0])

        # setting variables for the object
        self.corpus = corpus
        self.labels = labels
        self.reviews = reviews

        X = self.vectorizer.fit_transform(self.corpus)
        self.feature_names = self._vocabulary_names()

        for string in self.feature_names:
            print(string.encode("ascii", 'ignore'))

        # Training the model
        X_new = self.selector.fit_transform(X, self.labels)
        self.model.fit(X_new, self.labels)

    def _vocabulary_names(self):
        """Return the vectorizer's feature names as a list."""
        # get_feature_names was replaced by get_feature_names_out in newer
        # scikit-learn releases; support both.
        if hasattr(self.vectorizer, "get_feature_names_out"):
            return list(self.vectorizer.get_feature_names_out())
        return list(self.vectorizer.get_feature_names())

    def classify_all(self, all_test_data):
        """Predict labels for the test reviews and print top features per category.

        :param all_test_data: iterable of reviews (see class docstring).
        :returns: the model's predictions for the test reviews.
        """
        test_corpus = [review[1]['text'] for review in all_test_data]

        # Used transform instead of fit_transform
        # for test data so number of features will match
        X = self.vectorizer.transform(test_corpus)
        X_new = self.selector.transform(X)
        results = self.model.predict(X_new)

        # The model was trained on the RFE-selected columns, so coefficient
        # positions refer to the selected features, not the full vocabulary.
        selected_names = [
            feature
            for feature, keep in zip(self.feature_names, self.selector.support_)
            if keep
        ]

        categories = ["spring", "summer", "fall", "winter"]
        # zip stops at the shorter sequence, so a model with fewer than four
        # coefficient rows no longer raises IndexError.
        for category, weights in zip(categories, self.model.coef_):
            top10 = np.argsort(weights)[-20:]
            for j in top10:
                print("%s: %s" % (category, "".join(selected_names[j])))
        return results
def recurrciveFE(self, data):
    """
    Uses Recursive Feature Elimination to determine the right number of
    features before adding more leads to overfitting.

    It works by recursively removing attributes and building a model on
    those attributes that remain. It uses the model accuracy to identify
    which attributes (and combination of attributes) contribute the most to
    predicting the target attribute.

    Parameters
    ----------
    data : DataFrame
        Input data, for which categorical variables should be converted;
        response should be in column 0, predictors in the remaining columns.

    Returns
    -------
    None
        Shows a plot of cross-validation score against the number of
        selected features and prints the selected features, the support
        mask and the ranking to the console.
    """
    # The cross-validated variant is RFECV; plain RFE takes no cv/scoring.
    from sklearn.feature_selection import RFECV

    features_list = data.columns.values[1:]
    predictors = np.asarray(data.values[:, 1:])
    response = np.asarray(data.values[:, 0])

    estimator = SVC(kernel="linear")

    # using cross validation to determine the number of features; an integer
    # cv requests that many folds (stratified for classifiers) regardless of
    # the scikit-learn version's StratifiedKFold signature
    rfecv = RFECV(estimator, step=1, cv=2, scoring='accuracy')
    rfecv.fit(predictors, response)

    print("Optimal number of features : %d" % rfecv.n_features_)

    # grid_scores_ was replaced by cv_results_ in newer scikit-learn.
    if hasattr(rfecv, "cv_results_"):
        cv_scores = rfecv.cv_results_["mean_test_score"]
    else:
        cv_scores = rfecv.grid_scores_

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(cv_scores) + 1), cv_scores)
    plt.show()

    # label as optimal number of features
    noffeatures = rfecv.n_features_

    # use rfe to determine top features
    selector = RFE(estimator, n_features_to_select=noffeatures, step=1)
    selector = selector.fit(predictors, response)

    # Column positions of the selected features, used to look up their names.
    # The original indexed the non-selected features and then applied
    # subset-relative positions to the full name list.
    selected = np.where(selector.support_)[0]
    for position, column in enumerate(selected):
        print("%d. feature %d (%s)" % (position + 1, column, features_list[column]))

    print(selector.support_)
    print(selector.ranking_)
def recursive_fs(X, y, clf, num_features):
    """Fit RFE around ``clf``, print the fitting time and return the selector.

    :param X: feature matrix.
    :param y: targets.
    :param clf: estimator driving the elimination.
    :param num_features: number of features to keep.
    :returns: the fitted RFE object.
    """
    # create the RFE model and select num_features attributes (keyword form:
    # recent scikit-learn rejects a positional feature count)
    rfe = RFE(clf, n_features_to_select=num_features)
    start = time.time()
    rfe = rfe.fit(X, y)
    end = time.time()
    print("Training Time: " + str(end - start) + "s")
    return rfe
def feature_selection(estimator, x, y):
    """Support rating.

    Fits RFE with its default settings and prints a table with the support
    flag and ranking of each feature, indexed by
    ``pig_three_feature.columns[1:]`` (a name resolved outside this
    function).

    :param estimator: estimator driving the elimination.
    :param x: feature matrix.
    :param y: targets.
    """
    rfe = RFE(estimator)
    rfe.fit(x, y)

    print('RFE selection')
    summary = pd.DataFrame(
        {'support': rfe.support_, 'ranking': rfe.ranking_},
        index=pig_three_feature.columns[1:],
    )
    print(summary)
def trainDesicionTreeClassifier():
    """Print the RFE feature selection for a depth-5 decision tree, then fit it.

    Uses ``X_train`` and ``y_train``, which are resolved outside this
    function. The returned tree is fitted on all columns of ``X_train``; the
    RFE result is only printed.

    :returns: the fitted DecisionTreeClassifier.
    """
    modelDesicionTree = DecisionTreeClassifier(max_depth=5)
    # set the number of features to 10 (keyword form: recent scikit-learn
    # rejects a positional feature count)
    rfedecisiontree = RFE(modelDesicionTree, n_features_to_select=10)
    rfedecisiontree = rfedecisiontree.fit(X_train, y_train)
    print("Feature Importance of Decision Tree Model")
    print(rfedecisiontree.support_)
    print(rfedecisiontree.ranking_)
    modelDesicionTree.fit(X_train, y_train)
    return modelDesicionTree
def rank(training_set, paradigm_lengths, category_description):
    """Print the RFE rank of every column of the training matrix.

    One ``headline rank`` line is printed per headline.

    :param training_set: passed to DataTransformer.
    :param paradigm_lengths: passed to DataTransformer.
    :param category_description: passed to DataTransformer.
    """
    transfomer = DataTransformer(training_set, paradigm_lengths, category_description)
    headlines, matrix, targets = transfomer.get_training_data_matrix(normalize=True)
    matrix = matrix.toarray()

    estimator = svm.SVC(C=1, kernel='linear')
    # Eliminating down to one feature, one per step, ranks every feature.
    selector = RFE(estimator, n_features_to_select=1, step=1)
    selector = selector.fit(matrix, targets)

    # Same output as the Python 2 statement `print a, b`.
    for i, headline in enumerate(headlines):
        print("%s %s" % (headline, selector.ranking_[i]))
def trainLogisticRegression():
    """Print the RFE feature selection for logistic regression, then fit it.

    Uses ``X_train`` and ``y_train``, which are resolved outside this
    function. The returned model is fitted on all columns of ``X_train``;
    the RFE result is only printed.

    :returns: the fitted LogisticRegression.
    """
    modelLogisticRegression = LogisticRegression()
    # set the number of features to 10 (keyword form: recent scikit-learn
    # rejects a positional feature count)
    rfelogisticReg = RFE(modelLogisticRegression, n_features_to_select=10)
    rfelogisticReg = rfelogisticReg.fit(X_train, y_train)
    print("Feature Importance of Logistic Regression Model")
    print(rfelogisticReg.support_)
    print(rfelogisticReg.ranking_)
    modelLogisticRegression.fit(X_train, y_train)
    return modelLogisticRegression
def select_features(X, y, clf=None, n_features=10):
    """Return the column indices of the features selected by RFE.

    :param X: feature matrix.
    :param y: targets.
    :param clf: estimator driving the elimination; a LogisticRegression is
        created when None. It is fitted on (X, y) as a side effect.
    :param n_features: number of features to keep.
    :returns: integer array with the indices of the selected columns of X.
    """
    # `is None` instead of truthiness: evaluating an estimator object as a
    # boolean is not a reliable "was it supplied" test.
    if clf is None:
        clf = LogisticRegression()
    clf.fit(X, y)

    selector = RFE(clf, n_features_to_select=n_features)
    selector = selector.fit(X, y)
    # print selector.ranking_
    # print selector.support_
    # Positions of the True entries; the original hard-coded 57 columns and
    # so only worked for inputs of exactly that width.
    return np.flatnonzero(selector.support_)
#########################
# Candidate transformers and, per transformer name, the parameter grid to
# search.
transfomers = [DummyTransformer, Normalizer(), StandardScaler()]
transfomers_cfg = {
    DummyTransformer.func.__name__: {},
    Normalizer.__name__: {'transfomer__norm': ['l1', 'l2', 'max']},
    StandardScaler.__name__: {},
}

###########################
####Dim Reducer, Feat Sel.#
###########################
# Candidate dimensionality reducers / feature selectors and their grids.
# The empty lists are left as they are in the original.
reducers = [
    DummyTransformer,
    PCA(),
    GenericUnivariateSelect(),
    RFE(ExtraTreesRegressor()),
]
reducers_cfg = {
    DummyTransformer.func.__name__: {},
    PCA.__name__: {
        'reducer__n_components': [],
        # reducer__whiten = [True, False],
        'reducer__svd_solver': ['auto'],
    },
    GenericUnivariateSelect.__name__: {
        'reducer__score_func': [f_regression],
        'reducer__mode': ['k_best'],
        'reducer__param': [],
    },
    RFE.__name__: {
        'reducer__n_features_to_select': [],
        'reducer__step': [0.1],
    },
}

#########################
####### Models ##########
#                                    param_grid=param_grid,
#                                    scoring='accuracy',
#                                    cv=10,
#                                    n_jobs=-1)
# pred = estimators[k].predict(X_test)
# print("%s Score: %0.02f" % (k, estimators[k].score(X_test, y_test)))
# scores = cross_validation.cross_val_score(estimators[k], X, y, cv=5)
# print("%s Cross Avg. Score: %0.02f (+/- %0.02f)" % (k, scores.mean(), scores.std() * 2))
# end_time = datetime.datetime.now()
# time_spend = end_time - start_time
# print("%s Time: %0.02f" % (k, time_spend.total_seconds()))

# Keep the 41 best features of `clf` according to RFE and evaluate the
# reduced model on the training and test data.
from sklearn.feature_selection import RFE
rfe = RFE(clf, n_features_to_select=41)  # keyword form for recent scikit-learn
clf1 = rfe.fit(X, y)
clf1.score(X, y)
yhat_test = clf1.predict_proba(X_test)
clf1.score(X_test, y_test)

# conduct grid search for the models:
# logistic regression
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:  # older scikit-learn only ships sklearn.grid_search
    from sklearn.grid_search import GridSearchCV
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
tuned_parameters = [{'C': param_range}]
scores = ['precision', 'recall']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
# ### Fit a logistic regression model

# In[14]:

lgr = LogisticRegression(C=5)
lgr.fit(X,y)

# ### Select best features using RFE feature selection

# In[42]:

from sklearn.feature_selection import RFE
# Keyword form: recent scikit-learn rejects a positional feature count.
selector = RFE(lgr, n_features_to_select=20)
selector.fit_transform(X, y)
ranks = selector.ranking_
X_names = encoded_df.columns.drop('good_bad')
# print sorted(map(lambda x: round(x, 4), selector.ranking_), names)

# In[110]:

# Pair each column name with its rank and keep the names ranked below 10.
rfe_features = np.column_stack((X_names, ranks))
rfe_cols = rfe_features[np.where(rfe_features[:,1]<10),:2][0]
rfe_col1 = rfe_cols[:,:1]
print(rfe_col1)
#testFeatures, testLabels = transformDataset(test_sents)
# Split the (document, class) pairs and vectorize the documents.
corpus=[d for (d,c) in documents]
labels=[c for (d,c) in documents]
features=tfidf(corpus)
#print(features[1])
#features,labels=transformDataset(documents)
#vec = DictVectorizer()
#features_new=vec.fit_transform(features).toarray()
#print(features_new.shape)
print(len(features))
print(len(labels))

# Keep the 300 best features according to a linear-SVC RFE (keyword form:
# recent scikit-learn rejects a positional feature count).
svc = SVC(kernel="linear", C=1)
clf = RFE(svc, n_features_to_select=300, step=1)
fe = clf.fit_transform(features, labels)
#print(fit.scores_)
print(fe.shape)
# summarize selected features
trainFeatures, testFeatures, trainLabels, testLabels = train_test_split(fe,labels, test_size=0.33, random_state=42)
print("length of testLabels=",len(testLabels))
#for l in testLabels:
#    print("label=",l)
#print("features=",trainFeatures[1],"label=",trainLabels[1])
#featuresets = [(document_features(d), c) for (d,c) in documents]
var = 1
def classify_one_vs_many(df, model_name, model, feature_to_class, type_class, type_0_class=None):
    """Train a binary classifier separating ``type_class`` from another class.

    Rows whose ``feature_to_class`` equals ``type_class`` form class 1. Class
    0 is either every other row (labelled 'others') or, when ``type_0_class``
    is given, only the rows equal to it. Numeric columns are the predictors.
    The training split is randomly oversampled before fitting.

    :param df: source DataFrame (not modified).
    :param model_name: label stored in the result.
    :param model: classifier to fit.
    :param feature_to_class: column holding the class labels.
    :param type_class: positive class label.
    :param type_0_class: negative class label, or None for "all others".
    :returns: dict with the fitted model, the fitted 4-feature RFE selector
        ('feature_importance'), ROC data ('fpr', 'tpr', 'auc' -- None when
        the model offers no predict_proba), 'f1_score' and the class labels;
        None when the data cannot be fitted.
    """
    source = df.copy()

    if type_0_class is None:
        others_df = source[source[feature_to_class] != type_class].copy()
        type_0_class = 'others'
    else:
        others_df = source[source[feature_to_class] == type_0_class].copy()
    others_df.loc[:, 'ml_type'] = type_0_class

    category_df = source[source[feature_to_class] == type_class].copy()
    category_df.loc[:, 'ml_type'] = type_class

    df_merged = pd.concat([others_df, category_df], ignore_index=True)

    X = df_merged.select_dtypes(include=[np.number])
    y = df_merged.loc[:, 'ml_type']

    test_size = 0.2
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if len(y_train.unique()) < 2:
        print('cannot fit for {}'.format(type_class))
        return None
    if Counter(y)[type_class] == 1:
        print('cannot fit for {}'.format(type_class))
        return None

    # Balance the classes of the training split. The original computed the
    # resampled data and then fitted on the raw split, leaving it unused.
    ros = RandomOverSampler(random_state=0)
    X_resampled, Y_resampled = ros.fit_resample(X_train, y_train)

    model.fit(X_resampled, Y_resampled)

    rfe = RFE(model, n_features_to_select=4)
    fit = rfe.fit(X_resampled, Y_resampled)

    pred = model.predict(X_test)

    fpr = tpr = roc_auc = None
    try:
        probabilities = model.predict_proba(X_test)
    except (AttributeError, NotImplementedError):
        # The model exposes no probability estimates; skip the ROC data.
        probabilities = None
    if probabilities is not None:
        # Use the probability column that belongs to the positive class; it
        # is not necessarily column 1.
        positive_column = list(model.classes_).index(type_class)
        y_pred = probabilities[:, positive_column]
        fpr, tpr, _ = roc_curve(y_test, y_pred, pos_label=type_class)
        # auc takes (x, y): false-positive rate first.
        roc_auc = auc(fpr, tpr)

    f1 = f1_score(y_test, pred, pos_label=type_class)

    return {
        'model_name': model_name,
        # NOTE(review): agent_type is not defined in this function; it must
        # be resolvable from the enclosing module or this raises NameError.
        'agent_type': agent_type,
        'feature_importance': fit,
        'model': model,
        'fpr': fpr,
        'tpr': tpr,
        'auc': roc_auc,
        'f1_score': f1,
        'class 0': type_0_class,
        'class 1': type_class
    }
def _rfe_lr_errors(X_train, y_train, X_test, y_test):
    """Fit an RFE linear regression per feature count; return 1 - R^2 errors."""
    response = {
        'n_features_to_select': [],
        'train_error': [],
        'test_error': []
    }
    for n_feats in range(1, X_train.shape[1] + 1):
        rfe_lr = RFE(LinearRegression(), n_features_to_select=n_feats)
        rfe_lr.fit(X_train, y_train)
        response['n_features_to_select'].append(n_feats)
        response['train_error'].append(
            round(1 - rfe_lr.score(X_train, y_train), 3))
        response['test_error'].append(
            round(1 - rfe_lr.score(X_test, y_test), 3))
    return response


def _knn_errors(X_train, y_train, X_test, y_test):
    """Fit KNN regressors for k = 1..19; return errors and the last model's test predictions."""
    response = {'n_neighbours': [], 'train_error': [], 'test_error': []}
    predictions = None
    for k in range(1, 20):
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(X_train, y_train)
        response['n_neighbours'].append(k)
        response['train_error'].append(
            round(1 - knn.score(X_train, y_train), 3))
        response['test_error'].append(
            round(1 - knn.score(X_test, y_test), 3))
        # Overwritten every iteration: only the k = 19 predictions survive.
        predictions = knn.predict(X_test)
    return response, predictions


def _save_error_report(response, id_var, x_title, title, csv_path, png_path):
    """Write the error table to CSV and save its train/test line chart."""
    table = pd.DataFrame(response)
    table.to_csv(csv_path, index=False)
    data = table.melt(id_vars=id_var,
                      value_vars=['train_error', 'test_error'])
    plot = alt.Chart(data).mark_line().encode(
        x=alt.X(id_var + ':Q', title=x_title),
        y=alt.Y('value:Q', title="Error"),
        color=alt.Color('variable:N', title="Data Split")).properties(
            title=title, width=250, height=200)
    plot.save(png_path)


def _save_prediction_histogram(y_test, predictions, title, png_path):
    """Save overlaid histograms of actual and predicted median house values."""
    pred_estimates = pd.merge(pd.DataFrame(y_test),
                              pd.DataFrame(predictions),
                              left_index=True,
                              right_index=True).rename(columns={
                                  0: "prediction",
                                  "median_house_value": "actual"
                              })
    pred_estimates = pd.melt(pred_estimates,
                             value_vars=['actual', 'prediction'])
    plot = alt.Chart(pred_estimates).mark_bar(opacity=0.3).encode(
        alt.X('value:Q', bin=alt.Bin(maxbins=40), title="Median House Value"),
        alt.Y('count()', stack=None, title="Count"),
        alt.Color('variable', title="Value")).properties(title=title,
                                                         width=400,
                                                         height=200)
    plot.save(png_path)


def main(training_input_path, testing_input_path, output_path):
    """Train the housing-price models and write their result tables and plots.

    Reads the training and testing CSV files, label-encodes
    ``ocean_proximity``, standardises the features and then evaluates:

    * RFE + linear regression on all features, on the features without
      latitude/longitude, and on the features without latitude/longitude/
      total_bedrooms;
    * k-nearest-neighbour regression (k = 1..19) on all features and on the
      features without latitude/longitude;
    * a grid-searched random forest regressor on all features.

    Errors are reported as ``1 - score`` rounded to three decimals.

    :param training_input_path: Path of the training CSV.
    :param testing_input_path: Path of the testing CSV.
    :param output_path: Prefix prepended verbatim to every output file name
        (plain string concatenation, so a directory needs its trailing
        separator).
    :raises AssertionError: If an expected output file is missing afterwards.
    """
    # LOAD DATA
    train = pd.read_csv(training_input_path, header=0)
    test = pd.read_csv(testing_input_path, header=0)

    # PREPROCESSING
    le = LabelEncoder()
    train["ocean_proximity"] = le.fit_transform(train["ocean_proximity"])
    test["ocean_proximity"] = le.transform(test["ocean_proximity"])

    # SPLIT TRAINING AND TESTING DATA INTO X AND Y
    X_train = train.drop(columns="median_house_value")
    y_train = train['median_house_value']
    X_test = test.drop(columns="median_house_value")
    y_test = test['median_house_value']

    # FEATURE SUBSETS: without lat/long, and without lat/long/total_bedrooms
    geo_cols = ["latitude", "longitude"]
    geo_bed_cols = ["latitude", "longitude", "total_bedrooms"]
    X_train_featexc = X_train.drop(columns=geo_cols)
    X_test_featexc = X_test.drop(columns=geo_cols)
    X_train_featexc_2 = X_train.drop(columns=geo_bed_cols)
    X_test_featexc_2 = X_test.drop(columns=geo_bed_cols)

    # APPLY SCALER (refitted per subset; each test set is transformed right
    # after the matching fit)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    X_train_featexc = scaler.fit_transform(X_train_featexc)
    X_test_featexc = scaler.transform(X_test_featexc)
    X_train_featexc_2 = scaler.fit_transform(X_train_featexc_2)
    X_test_featexc_2 = scaler.transform(X_test_featexc_2)

    # LINEAR REGRESSION WITH FEATURE SELECTION - ALL FEATURES AVAILABLE
    _save_error_report(
        _rfe_lr_errors(X_train, y_train, X_test, y_test),
        'n_features_to_select', "Number of Features Selected",
        "Recursive Feature Elimination Linear Regression Error",
        output_path + 'lr_rfe_results_table.csv',
        output_path + 'LR_performace.png')

    # LINEAR REGRESSION WITH FEATURE SELECTION - EXCLUDING LATITUDE AND LONGITUDE
    _save_error_report(
        _rfe_lr_errors(X_train_featexc, y_train, X_test_featexc, y_test),
        'n_features_to_select', "Number of Features Selected",
        "Recursive Feature Elimination Linear Regression Error Excluding Latitude and Longitude",
        output_path + 'lr_rfe_results_table_exc_feats.csv',
        output_path + 'LR_performace_exc_feats.png')

    # LINEAR REGRESSION WITH FEATURE SELECTION - EXCLUDING LATITUDE, LONGITUDE, AND TOTAL BEDROOMS
    _save_error_report(
        _rfe_lr_errors(X_train_featexc_2, y_train, X_test_featexc_2, y_test),
        'n_features_to_select', "Number of Features Selected",
        "Recursive Feature Elimination Linear Regression Error Excluding Latitude, Longitude, and Total Bedrooms",
        output_path + 'lr_rfe_results_table_exc_feats_2.csv',
        output_path + 'LR_performace_exc_feats_2.png')

    # KNN WITH VARYING N_NEIGHBOR VALUES WITH FULL DATA INCLUSION
    knn_response, predictions = _knn_errors(X_train, y_train, X_test, y_test)
    _save_error_report(knn_response, 'n_neighbours',
                       "Number of Nearest Neighbours",
                       "K-Nearest Neighbour Error when Varying K",
                       output_path + 'knn_results_table.csv',
                       output_path + 'KNN_performace.png')
    _save_prediction_histogram(
        y_test, predictions,
        "Histogram of Actual and Predicted Median House Values",
        output_path + 'KNN_actual_vs_predicted.png')

    # KNN WITH VARYING N_NEIGHBOR VALUES WITH LATITUDE AND LONGITUDE EXCLUSION
    knn_response_exc, predictions = _knn_errors(X_train_featexc, y_train,
                                                X_test_featexc, y_test)
    _save_error_report(
        knn_response_exc, 'n_neighbours', "Number of Nearest Neighbours",
        "K-Nearest Neighbour Error when Varying K and Excluding Latitude and Longitude",
        output_path + 'knn_results_table_exc_feats.csv',
        output_path + 'KNN_performace_exc_feats.png')
    _save_prediction_histogram(
        y_test, predictions,
        "Histogram of Actual and Predicted Median House Values Excluding Latitude and Longitude",
        output_path + 'KNN_actual_vs_predicted_exc_feats.png')

    # RANDOM FOREST REGRESSOR
    rfr = RandomForestRegressor(random_state=522)
    gs = GridSearchCV(rfr,
                      param_grid={
                          "max_depth": np.arange(5, 10, 1),
                          "min_samples_leaf": np.arange(1, 4, 1)
                      })
    gs.fit(X_train, y_train)
    rfr = gs.best_estimator_
    rfr_response = {
        'type': ['Random Forest Regressor'],
        'train_error': [round(1 - rfr.score(X_train, y_train), 3)],
        'test_error': [round(1 - rfr.score(X_test, y_test), 3)]
    }
    pd.DataFrame(rfr_response).to_csv(output_path + 'rfr_results_table.csv',
                                      index=False)

    # TESTING: every artefact written above must exist.  The original list
    # checked rfr_results_table.csv twice and never knn_results_table.csv.
    expected_outputs = (
        'rfr_results_table.csv',
        'KNN_performace.png',
        'lr_rfe_results_table.csv',
        'LR_performace.png',
        'knn_results_table.csv',
        'knn_results_table_exc_feats.csv',
        'KNN_performace_exc_feats.png',
        'lr_rfe_results_table_exc_feats.csv',
        'LR_performace_exc_feats.png',
        'lr_rfe_results_table_exc_feats_2.csv',
        'LR_performace_exc_feats_2.png',
        'KNN_actual_vs_predicted.png',
        'KNN_actual_vs_predicted_exc_feats.png',
    )
    for file_name in expected_outputs:
        assert os.path.isfile(output_path + file_name), file_name
for o in range(0, 10): #split into test and train set F_Training_Train, F_Training_Test, Label_Training_Train, Label_Training_Test = train_test_split( features_training, label_training, test_size=0.33) F_Test_Train, F_Test_Test, Label_Test_Train, Label_Test_Test = train_test_split( features_test, label_test, test_size=0.70) #classification # clf = SVC(kernel='linear') # clf = LogisticRegression() # clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0) clf = GradientBoostingClassifier() #recursive feature elimination selector = RFE(clf, 1, step=1) Label_train = np.ravel(Label_Training_Train) Label_test = np.ravel(Label_Test_Test) selector = selector.fit(F_Training_Train, Label_train) rank = selector.ranking_ Rank.append(rank) rank = np.asarray(rank) #create a list that contains index numbe of ranked features rankedlist = np.zeros((7, 1)) #finding index of the ranked features and creating new training and test sets with respect to this ranking for m in range(1, 8): k = np.where(rank == m) rankedlist[m - 1] = k[0][0] F_Training_Train[:,
# acc = accuracy_score(y_test, y_pred)
# print("Accuracy: {:.4%}".format(acc))
# print(classification_report(y_test, y_pred, digits=4))
seeds = 1618  # Fix the seed of the pseudo-random number generator
confusion_matrixs = []

# FEATURE SELECTION. Method 1: impurity-based importances of extra trees
model = ExtraTreesClassifier(random_state=seeds)
model.fit(x_tr, y_tr)
print(model.feature_importances_)

# FEATURE SELECTION. Method 2: recursive feature elimination
model = LogisticRegression(random_state=seeds)
# create the RFE model and select 2 attributes
# (the feature count is passed by keyword: recent scikit-learn releases no
# longer accept it positionally)
rfe = RFE(model, n_features_to_select=2)
rfe = rfe.fit(x_tr, y_tr)
print(rfe.support_)
print(rfe.ranking_)
# Both feature-selection methods indicate that components 2 and 4 contribute
# the least.
# For the current stage (classifying the first column of the reference table)
# they could be dropped, but there are not many features to begin with, so
# keep them for now.
seeds = 1618  # Fix the seed of the pseudo-random number generator
''' ==> ЛОГИСТИЧЕСКАЯ РЕГРЕССИЯ '''
# Often used for binary classification, but multi-class classification via
# "one-vs-all" is also possible.
# An advantage of this algorithm is that it outputs, for every object, the
# probability of belonging to a class.
model = LogisticRegression(random_state=1618, solver='lbfgs')
model.fit(x_tr, y_tr)
# print(cor_feature)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the features; all three selectors below are fitted on the
# scaled matrix, while column names are read back from the original X.
X_norm = MinMaxScaler().fit_transform(X)

# --- Filter method: chi-squared scores, keep the num_feats best ---
chi_selector = SelectKBest(chi2, k=num_feats)
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.columns[chi_support].tolist()
print(len(chi_feature), 'selected features')

# --- Wrapper method: RFE around a logistic regression, 10 features/step ---
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

rfe_selector = RFE(
    estimator=LogisticRegression(max_iter=1000),
    n_features_to_select=num_feats,
    step=10,
    verbose=5,
)
rfe_selector.fit(X_norm, y)
rfe_support = rfe_selector.get_support()
rfe_feature = X.columns[rfe_support].tolist()
print(len(rfe_feature), 'selected features')

# --- Embedded method: L1-penalised logistic regression ---
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

embeded_lr_selector = SelectFromModel(
    LogisticRegression(solver='saga', penalty="l1", max_iter=1000),
    max_features=num_feats,
)
embeded_lr_selector.fit(X_norm, y)
embeded_lr_support = embeded_lr_selector.get_support()
embeded_lr_feature = X.columns[embeded_lr_support].tolist()
print(len(embeded_lr_feature), 'selected features')
''' print('--FEATURE SELECTION ON--', '\n') ##1) Run Feature Selection ####### if fs_type == 1: #Stepwise Recursive Backwards Feature removal if binning == 0: clf = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=3, min_samples_leaf=1, max_features=None, random_state=rand_st) sel = RFE(clf, n_features_to_select=k_cnt, step=.1) print('Stepwise Recursive Backwards - Random Forest: ') if binning == 1: rgr = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=3, min_samples_leaf=1, max_features=None, random_state=rand_st) sel = RFE(rgr, n_features_to_select=k_cnt, step=.1) print('Stepwise Recursive Backwards - Random Forest: ') fit_mod = sel.fit(data_np, target_np) print(sel.ranking_) sel_idx = fit_mod.get_support()
colname = loan_train.columns[:]
colname
from sklearn import tree

# Write the fitted tree in Graphviz DOT format.  The result is not assigned
# back to ``f``: rebinding the handle inside its own ``with`` block only
# replaced it with the call's return value.
with open(r"XYZCorp_LendingData.txt", "w") as f:
    tree.export_graphviz(model_Decision_tree,
                         feature_names=colname[:-1],
                         out_file=f)
# generate the file and upload the code in webgraphviz.com to plot the decision tree

# feature importance attribute of decision tree
print(list(zip(colname, model_Decision_tree.feature_importances_)))

from sklearn.feature_selection import RFE

# Keep 20 features; passed by keyword because recent scikit-learn releases
# no longer accept the count positionally.
rfe = RFE(classifier, n_features_to_select=20)
model_rfe = rfe.fit(X_train, Y_train)
print("Num Features: ", model_rfe.n_features_)
print("Selected Features: ")
print(list(zip(loan_train.columns, model_rfe.support_)))
print("Feature Ranking: ", model_rfe.ranking_)
Y_pred = model_rfe.predict(X_test)

# predicting using the Random_Forest_Classifier
from sklearn.ensemble import RandomForestClassifier

model_RandomForest = RandomForestClassifier(n_estimators=500)
###
for val in l[:-1]: j += 1 data[i][j] = float(val) X, y = data[:, :-1], data[:, -1] #y = np.array([y]) #y = np.reshape(y,(y.shape[1],y.shape[0])) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42) clf = SVC(gamma='auto', kernel='linear') selector = RFE(clf, 100, step=1) selector = selector.fit(X_train, y_train) y_pred = selector.estimator_.predict(X_test.compress(selector.support_, axis=1)) curr_pos = curr_neg = inc_pos = inc_neg = 0 for i in range(len(y_test)): if y_test[i] == 1: if y_pred[i] == 1: curr_pos += 1 else: inc_neg += 1 else: if y_pred[i] == 1:
##Link https://medium.com/@aneesha/recursive-feature-elimination-with-scikit-learn-3a2cbdf23fb7
# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class'
]
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]  # the eight predictor columns
Y = array[:, 8]  # the 'class' column

# feature extraction: keep the 3 features RFE ranks best
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# The formatting has to happen inside print(): on Python 3
# ``print("...") % value`` applies % to None and raises TypeError.
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)
model = LogisticRegression(solver='lbfgs', max_iter=500)

# Score growing prefixes of sorted_columns: first column only, then the
# first two, and so on up to every column of df_X.
for i in range(1, len(df_X.columns) + 1):
    fs = sorted_columns[:i]
    df_X_selected = df_X[fs]
    scores = cross_val_score(model, df_X_selected, df_y, cv=5)
    print(fs.tolist())
    print(np.round(scores.mean(), 4))

######################################################################
# Backward elimination (Recursive Feature Elimination)
######################################################################
from sklearn.feature_selection import RFE

model = LogisticRegression(solver='lbfgs', max_iter=500)
rfe = RFE(model, n_features_to_select=4)
fit = rfe.fit(df_X, df_y)
print("Num Features: %d" % fit.n_features_)

# Names of the columns RFE kept.
fs = df_X.columns[fit.support_].tolist()
print("Selected Features: %s" % fs)
#print("Feature Ranking: %s" % fit.ranking_)

# Cross-validated accuracy on the selected columns only.
scores = cross_val_score(model, df_X[fs], df_y, cv=5)
print("Acc: {}".format(scores.mean()))

######################################################################
# Forward selection
######################################################################
# requires the 'mlxtend' module to be installed
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
plt.xlabel('Feature1') plt.ylabel('Frequency of Feature1') plt.show() plt.savefig('Frequency of Feature1') #Feature Selection data_final_vars = data.columns.values.tolist() y = ['Sickness', 'ID'] Y = ['Sickness'] X = [i for i in data_final_vars if i not in y] print(X, y) logreg = LogisticRegression() rfe = RFE(logreg, 20) rfe = rfe.fit(data[X], data[Y]) print(rfe.support_) print(rfe.ranking_) cols = [ "Feature15", "Feature23", "Feature43", "Feature45", "Feature64", "Feature87", "Feature115", "Feature127", "Feature162", "Feature163",
for rfe_step_idx, rfe_step in enumerate(rfe_step_range): print( str(count_iter) + '/' + str( len(seed_range) * len(nCoeffs_range) * len(rfe_step_range))) for train_index, test_index in skf.split(features, labels): # external CV X_train, X_test = features[train_index], features[test_index] y_train, y_test = labels[train_index], labels[test_index] scaler = MinMaxScaler() sv = LinearSVC() rfe = RFE(sv, step=rfe_step, n_features_to_select=nCoeffs) # Defining scaler + rfe pipe = Pipeline([('std_scaler', scaler), ('fs', rfe)]) clf = GridSearchCV(pipe, param_grid=param_grid, cv=inner_folds, scoring=scoring_fct, n_jobs=6) y_score = clf.fit(X_train, y_train) #print(clf.best_params_) best_model = clf.best_estimator_ selector = best_model.named_steps['fs']
pd.DataFrame(np.transpose(classifier.coef_), columns = ["coef"]) ],axis = 1) #### Feature Selection #### ## Feature Selection # Recursive Feature Elimination from sklearn.feature_selection import RFE from sklearn.linear_model import LogisticRegression # Model to Test classifier = LogisticRegression() # Select Best X Features rfe = RFE(classifier, 20) rfe = rfe.fit(X_train, y_train) # summarize the selection of the attributes print(rfe.support_) print(rfe.ranking_) X_train.columns[rfe.support_] # New Correlation Matrix sn.set(style="white") # Compute the correlation matrix corr = X_train[X_train.columns[rfe.support_]].corr() # Generate a mask for the upper triangle mask = np.zeros_like(corr, dtype=np.bool) mask[np.triu_indices_from(mask)] = True
return np.amax([val1, val2]) # K=2 TITANIC pca_titanic = [] ica_titanic = [] rca_titanic = [] rfe_titanic = [] k=2 for dim in range(1, len(tit_cols)+1): pca = PCA(n_components=dim) ica = FastICA(n_components=dim) rca = GaussianRandomProjection(n_components=dim) logreg = LogisticRegression() rfe = RFE(logreg, n_features_to_select=dim) pca_X_train = pca.fit_transform(tit_X_train) ica_X_train = ica.fit_transform(tit_X_train) rca_X_train = rca.fit_transform(tit_X_train) rfe.fit(tit_X_train, tit_y_train) rfe_X_train = rfe.transform(tit_X_train) em = GaussianMixture(n_components=k) em.fit(pca_X_train) pca_em_X_train = em.predict(pca_X_train) em.fit(ica_X_train) ica_em_X_train = em.predict(ica_X_train) em.fit(rca_X_train) rca_em_X_train = em.predict(rca_X_train) em.fit(rfe_X_train) rfe_em_X_train = em.predict(rfe_X_train)
print("finish")
names = [
    'alloy', 'class', 'delta', 'Hmix', 'Smix', 'Fi', 'RMS', 'VEC', 'r', 'Sc',
    'deltaHmixmax', 'deltaHmixmin', 'rootHmix', 'rootHmix0', 'rootHmix0+',
    'rootHmix0-'
]
data = pd.read_csv('合并数据集-去除重复.csv', header=0, names=names)
Y = data[["class"]]
X = pd.read_csv('generate_feature_1008.csv')
print("finish")

rfc = RandomForestClassifier()
# Eliminate one feature per step down to a single one, so ranking_ orders
# every feature.
rfe = RFE(estimator=rfc, n_features_to_select=1, step=1)
# Y is a one-column DataFrame; flatten it so the estimator receives the 1-D
# target it expects instead of a column vector.
rfe.fit(X, Y.values.ravel())
ranking = rfe.ranking_
print("RFE ranking:\n", ranking)

# Positions and ranks of the features ranked 100 or better.
list_ranking_index = [i for i, rank in enumerate(ranking) if rank <= 100]
list_ranking_importance = [ranking[i] for i in list_ranking_index]
print("list_ranking_index:\n", list_ranking_index)
print("list_ranking_importance:\n", list_ranking_importance)
print('finish')
# write to CSV
def RFE(self,estimator,k):
    """Run recursive feature elimination on ``self.X`` / ``self.Y``.

    ``k`` is forwarded as ``n_features_to_select``.  Returns the fitted
    selector together with the output of its ``fit_transform``.
    """
    # NOTE(review): the bare name RFE below is presumably the imported
    # scikit-learn class rather than this method - confirm against the
    # enclosing module.
    selector = RFE(estimator, n_features_to_select=k)
    reduced = selector.fit_transform(self.X, self.Y)
    return selector, reduced
# Predictors X1..X8 and the target Y (both kept as DataFrames).
X_train = training_data[['X{}'.format(n) for n in range(1, 9)]]
y_train = training_data[['Y']]

# step-1: 10-fold shuffled cross-validation with a fixed seed
folds = KFold(n_splits=10, shuffle=True, random_state=100)

# step-2: try every possible number of selected features, 1 through 8
hyper_params = [{'n_features_to_select': [1, 2, 3, 4, 5, 6, 7, 8]}]

# step-3: grid search over RFE wrapped around a linear regression
lm = LinearRegression()
rfe = RFE(lm)
model_cv = GridSearchCV(
    estimator=rfe,
    param_grid=hyper_params,
    scoring='r2',
    cv=folds,
    verbose=1,
    return_train_score=False,
)
lr = model_cv.fit(X_train, y_train)
y_predict = lr.predict(X_train)

print("The coefficient of determination(r squared) obtained from Linear Regression:\n")
# score() here reports r squared on the training data; closer to 1 is better
print(lr.score(X_train, y_train), "\n")
# for p in cv:
#     print p
# print len(cv)
# sys.exit()
''' Logistic regression '''
# w = 'balanced'
# clf = LogisticRegression(class_weight=w, penalty='l1', n_jobs=1)
# parameters = {'C': np.hstack((np.arange(0.0095, 0.02, 0.0001), np.arange(0.02, 0.601, 0.005)))}
# parameters = {'C': [0.005, 0.0075, 0.01]}
# parameters = {'C': [0.005]}

# Two-stage pipeline: RFE driven by a strongly regularised L1 logistic
# regression keeps 2 features, then a second L1 logistic regression is the
# classifier.  The solver is pinned to 'liblinear' because an L1 penalty is
# rejected by 'lbfgs', the default solver in current scikit-learn releases;
# 'liblinear' was the default when penalty='l1' worked without a solver.
clf = Pipeline([
    # ('rfe', RFE(estimator=LogisticRegression(class_weight='balanced', penalty='l1', C=0.01), n_features_to_select=2,
    ('rfe',
     RFE(estimator=LogisticRegression(class_weight='balanced',
                                      penalty='l1',
                                      C=0.001,
                                      solver='liblinear'),
         n_features_to_select=2,
         step=0.1)),
    ('clf',
     LogisticRegression(class_weight='balanced',
                        penalty='l1',
                        solver='liblinear',
                        n_jobs=1))
])
# parameters = {'clf__C': [0.005, 0.0075, 0.01]}
parameters = {'clf__C': [0.001, 0.01]}  # grid over the final classifier's C
K = 5
R = 1  # repeat cross-validation
auc_limit = 0.55
auc_hat = 1
step_remove = 1
# TODO
# TODO
# NOTE(review): ``os`` here is a sampler object, not the standard-library
# module of the same name - presumably an imblearn over-sampler; confirm.
# ``fit_resample`` replaces ``fit_sample``, which current imblearn releases
# no longer provide.
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['IntermittentIceCover'])

# we can Check the numbers of our data
print("length of oversampled data is ", len(os_data_X))
print("Number of annual lakes",
      len(os_data_y[os_data_y['IntermittentIceCover'] == 0]))
print("Number of intermittent lakes",
      len(os_data_y[os_data_y['IntermittentIceCover'] == 1]))
print("Proportion of annual lakes in oversampled data is ",
      len(os_data_y[os_data_y['IntermittentIceCover'] == 0]) / len(os_data_X))
print("Proportion of intermittent lakes in oversampled data is ",
      len(os_data_y[os_data_y['IntermittentIceCover'] == 1]) / len(os_data_X))

dt_vars = dt.columns.values.tolist()
y = ['IntermittentIceCover']
X = [i for i in dt_vars if i not in y]

# Rank the features with RFE; the count is passed by keyword because recent
# scikit-learn releases no longer accept it positionally.
logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=20)
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

# Hand-picked columns used for the statsmodels logit fit below.
cols = ["MeanAnnualAirTemp_c", "MaximumDepth_m", 'Latitude_dd', 'temp_range']
#cols=[ "Elevation_m", "MeanAnnualAirTemp_c", "MaximumDepth_m", 'Latitude_dd']
X = os_data_X[cols]
y = os_data_y['IntermittentIceCover']

logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=4)
def train_svm_k_fold_RFE(matrix,
                         target,
                         gamma,
                         linear=True,
                         nfeatures=15,
                         nsplits=10,
                         penalty="l2",
                         C=1,
                         multi_class="ovr",
                         kernel="rbf",
                         degree=3,
                         probability=False,
                         decision_function_shape="ovr"):
    """Cross-validate an SVM with per-fold RFE feature selection.

    For every fold an RFE driven by a random forest picks ``nfeatures``
    columns on the training part; the SVM is fitted on those columns and
    scored on the same columns of the held-out part.

    :param matrix: 2-D feature array, indexed as ``matrix[rows]`` and
        ``X[:, cols]``.
    :param target: Label array indexable by the fold index arrays.
    :param gamma: Kernel coefficient for ``SVC`` (unused when ``linear``).
    :param linear: Use ``LinearSVC`` when true, otherwise ``SVC``.
    :param nfeatures: Number of features RFE keeps in every fold.
    :param nsplits: Number of (unshuffled) folds.
    :param penalty: Accepted but not used: ``LinearSVC`` is always built
        with ``penalty="l2"``.
    :param C: Regularisation parameter for either SVM.
    :param multi_class: Accepted but not used: ``LinearSVC`` is always built
        with ``multi_class="ovr"``.
    :param kernel: ``SVC`` kernel.
    :param degree: ``SVC`` polynomial degree.
    :param probability: ``SVC`` probability flag.
    :param decision_function_shape: ``SVC`` decision function shape.
    :returns: ``(scores, confusion, parameters)``: the per-fold accuracies,
        the per-fold confusion matrices, and a dict holding the settings,
        the per-fold macro/micro/weighted precision-recall-fscore tuples,
        the selected column indices per fold, the scores (stored as a
        one-element list wrapping ``scores``) and their average.
    """
    scores = []
    confusion = []
    parameters = {
        "Gamma": gamma,
        "Linear": linear,
        "C": C,
        "Kernel": kernel,
        "Degree": degree,
        "Average": [],
        "Scores": [],
        "Features": [],
        "Macro": [],
        "Micro": [],
        "Weighted": []
    }

    if linear:
        best_svc = LinearSVC(penalty="l2", C=C, multi_class="ovr")
    else:
        best_svc = SVC(C=C,
                       kernel=kernel,
                       gamma=gamma,
                       degree=degree,
                       probability=probability,
                       decision_function_shape=decision_function_shape)

    # No random_state: it has no effect without shuffling, and current
    # scikit-learn raises ValueError when both are combined.
    cv = KFold(n_splits=nsplits, shuffle=False)

    for train_index, test_index in cv.split(matrix):
        X_train, X_test = matrix[train_index], matrix[test_index]
        y_train, y_test = target[train_index], target[test_index]

        # ---------------- FEATURE SELECTION ------------------------
        rforest = RandomForestClassifier(random_state=101)
        rfe = RFE(estimator=rforest, n_features_to_select=nfeatures)
        rfe.fit(X_train, y_train)
        # Column positions of the features RFE kept, as plain ints.
        indexes = np.flatnonzero(rfe.support_).tolist()

        # --------------- TRAINING ------------------------------
        best_svc.fit(X_train[:, indexes], y_train)

        # --------------- TESTING -------------------------------
        x_test_fs = X_test[:, indexes]
        svc_predictions = best_svc.predict(x_test_fs)
        scores.append(best_svc.score(x_test_fs, y_test))

        for key, average in (("Macro", 'macro'), ("Micro", 'micro'),
                             ("Weighted", 'weighted')):
            parameters[key].append(
                precision_recall_fscore_support(y_test,
                                                svc_predictions,
                                                average=average))

        parameters["Features"].append(indexes)
        confusion.append(confusion_matrix(y_test, svc_predictions))

    # Kept as in the original: the whole score list is appended as a single
    # element, so "Scores" ends up as [scores].
    parameters["Scores"].append(scores)
    parameters["Average"] = np.average(scores)
    return (scores, confusion, parameters)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import (cross_val_score, KFold, cross_validate,
                                     train_test_split)
from sklearn.ensemble import StackingClassifier

# Wine data as a single DataFrame: the feature columns plus a trailing
# "Class" column holding the target.
data = load_wine()
y = data.target
X = data.data
stc = StandardScaler()
lenc = LabelEncoder()
columns = data.feature_names
frame_values = np.column_stack((X, y))
frame_columns = np.hstack((columns, ["Class"]))
df = pd.DataFrame(data=frame_values, columns=frame_columns)
X_std = stc.fit_transform(df[columns])

# Base learners: scaler -> RFE(LinearSVC) -> linear SVC, and scaler -> LDA.
svm_steps = [
    ("stc", stc),
    ("selection", RFE(LinearSVC())),
    ("svm", SVC(kernel="linear")),
]
lda_steps = [
    ("stc", stc),
    ("svm", LinearDiscriminantAnalysis()),
]
pipesvm = Pipeline(svm_steps)
pipelda = Pipeline(lda_steps)
estimators = [("LDA", pipelda), ("SVM", pipesvm)]

# Using stacked classifiers is beneficial for multiclass problems, since it
# can greatly improve the class prediction by exploiting the predictive
# power of the base predictions for certain classes.
stacking_classifier = StackingClassifier(estimators=estimators,
                                         final_estimator=GaussianNB())

print("Stacking stimators")
stacking_scores = cross_val_score(X=df[columns],
                                  y=y,
                                  estimator=stacking_classifier,
                                  cv=KFold(5))
print(stacking_scores)
print("Only SVM")
# Drop the label and identifier columns so only predictors remain in X.
del X['target']
del X['id']
X.describe()

from sklearn import preprocessing

# Encode the class labels as integers.
le = preprocessing.LabelEncoder()
le.fit(Y.values.tolist())
label = le.transform(Y)
print(list(le.classes_))
print(label)

noOfFeature = 45

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
import timeit

start = timeit.default_timer()
clf = RandomForestClassifier()
# The feature count is passed by keyword: recent scikit-learn releases no
# longer accept it positionally.
rfe = RFE(clf, n_features_to_select=noOfFeature)
fit = rfe.fit(X, label)
print("Time take %.2f " % (timeit.default_timer() - start))
print(("Num Features: %d") % fit.n_features_)
print(("Selected Features: %s") % fit.support_)
print(("Feature Ranking: %s") % fit.ranking_)

# Names of the columns RFE kept.
features = [str(col) for col, keep in zip(X.columns, fit.support_) if keep]
print(features)

from sklearn.model_selection import cross_val_score
import timeit
from xgboost import XGBClassifier
from statistics import mean

train_csv = pd.read_csv('../input/train.csv')
X = fifa.drop('Overall', 1) y = fifa['Overall'] lr_model = LinearRegression() rfe = RFE(lr_model, n_features_to_select=5) rfe.fit(X, y) mask = rfe.support_ top_features = X.columns[mask] return list(top_features) # In[117]: q4() # In[118]: X = fifa.drop('Overall', 1) y = fifa['Overall'] lr_model = LinearRegression() rfe = RFE(lr_model, n_features_to_select=5) rfe.fit(X, y) plt.figure() plt.title("Feature Importance") pd.Series(rfe.estimator_.coef_, index=X.columns[rfe.support_]).sort_values().plot(kind='barh')
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from xgboost import XGBClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop(columns='target').values
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.86
# Pipeline stages, in order: a stacked XGBoost estimator, RFE driven by
# extra trees, and a Gaussian naive Bayes classifier.
stacked_xgb = StackingEstimator(
    estimator=XGBClassifier(learning_rate=0.1,
                            max_depth=3,
                            min_child_weight=16,
                            n_estimators=100,
                            nthread=1,
                            subsample=0.25))
tree_rfe = RFE(estimator=ExtraTreesClassifier(criterion="entropy",
                                              max_features=0.8,
                                              n_estimators=100),
               step=0.7000000000000001)
exported_pipeline = make_pipeline(stacked_xgb, tree_rfe, GaussianNB())

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def logitRegression(data):
    """Run RFE feature selection, then fit, evaluate and plot logistic models.

    The feature-selection and scikit-learn evaluation steps read the
    module-level names ``inputs`` and ``winners``; only the final statsmodels
    fit uses ``data``.

    :param data: Table indexable by the hard-coded ``train_cols`` labels and
        by ``'Winner'``. Both selections are cast to float before fitting.
    :return: None. Results are printed and plotted; the ROC figure is also
        saved as ``'Log_ROC'``.
    """
    # Local import: the `sklearn.cross_validation` module used previously no
    # longer exists; `sklearn.model_selection` is its replacement.
    from sklearn.model_selection import KFold, cross_val_score

    # Feature Selection
    logistic = LogisticRegression()
    # n_features_to_select must be a keyword argument in current scikit-learn.
    rfe = RFE(logistic, n_features_to_select=18)
    rfe = rfe.fit(inputs, winners)
    print(rfe.support_)
    print(rfe.ranking_)
    features = rfe.support_
    print("\nFeature index: " + str(np.where(features)))

    # creating testing and training set
    X_train, X_test, Y_train, Y_test = train_test_split(
        inputs, winners, test_size=0.33)

    # train scikit learn model
    clf = LogisticRegression()
    clf.fit(X_train, Y_train)
    score = round(clf.score(X_test, Y_test), 2)
    print('score Scikit learn: ', score)

    # Predict with the model trained on the training split only. Refitting on
    # all of `inputs` would leak the test rows into training, and the metrics
    # below would no longer match the model whose score is in the plot title.
    predicted = clf.predict(X_test)
    print("Predicted: " + str(predicted))
    plt.figure()
    plt.plot(predicted)

    # Metrics: confusion matrix
    cm = metrics.confusion_matrix(Y_test, predicted)
    print(cm)

    # plot
    plt.figure(figsize=(2, 2))
    sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True,
                cmap='Blues_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    all_sample_title = 'Accuracy Score: {0}'.format(score)
    plt.title(all_sample_title, size=15)
    plt.show()

    # cross validation: score a fresh model on 10 folds of the training split
    # so the printed figure really is a cross-validated average.
    kfold = KFold(n_splits=10)
    modelCV = LogisticRegression()
    scoring = 'accuracy'
    results = cross_val_score(modelCV, X_train, Y_train, cv=kfold,
                              scoring=scoring)
    print("\n\n 10-fold cross validation average accuracy: %.3f"
          % (results.mean()))
    print("\n")

    # precision
    print(classification_report(Y_test, predicted))

    # ROC: use positive-class probabilities for both the curve and its area,
    # so the area in the legend describes the curve that is drawn.
    positive_proba = clf.predict_proba(X_test)[:, 1]
    logit_roc_auc = roc_auc_score(Y_test, positive_proba)
    fpr, tpr, _ = roc_curve(Y_test, positive_proba)
    plt.figure()
    plt.plot(fpr, tpr,
             label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()

    # train with selected features
    train_cols = [
        'Action 2', 'Action 9', 'Action 10', 'Action 11', 'Action 12',
        'Action 13', 'Action 14', 'Action 16', 'Action 18', 'Action 24',
        'Action 32', 'Action 41', 'Action 48', 'Action 53', 'Action 57',
        '2gram 10', '3gram 2', '3gram 9',
    ]
    X = data[train_cols]
    y = data['Winner']
    logit_model = sm.Logit(y.astype(float), X.astype(float))
    result = logit_model.fit(method='bfgs')
    print(result.summary())
# In[ ]: from sklearn.linear_model import LogisticRegression from sklearn.feature_selection import RFE cols = [ "Age", "Fare", "TravelAlone", "Pclass_1", "Pclass_2", "Embarked_C", "Embarked_S", "Sex_male", "IsMinor" ] X = final_train[cols] y = final_train['Survived'] # Build a logreg and compute the feature importances model = LogisticRegression() # create the RFE model and select 8 attributes rfe = RFE(model, 8) rfe = rfe.fit(X, y) # summarize the selection of the attributes print('Selected features: %s' % list(X.columns[rfe.support_])) # <a id="t4.1.2."></a> # ### 4.1.2. Feature ranking with recursive feature elimination and cross-validation # # RFECV performs RFE in a cross-validation loop to find the optimal number or the best number of features. Hereafter a recursive feature elimination applied on logistic regression with automatic tuning of the number of features selected with cross-validation. # In[ ]: from sklearn.feature_selection import RFECV # Create the RFE object and compute a cross-validated score. # The "accuracy" scoring is proportional to the number of correct classifications rfecv = RFECV(estimator=LogisticRegression(),