def main():
    train_df = munge_data('./data/train.csv', False)
    train_df = train_df.drop('PassengerId', axis=1)
    target_df = train_df['Survived']
    train_df = train_df.drop('Survived', axis=1)
    train_df = train_df.sort(axis=1)

    test_df = munge_data('./data/test.csv')
    test_ids = test_df.PassengerId.values
    test_df = test_df.drop('PassengerId', axis=1)
    test_df = test_df.sort(axis=1)

    train_data = train_df.values
    target_data = target_df.values
    test_data = test_df.values

    clf = svm.SVC(kernel='linear')
    selector = RFECV(clf, step=1, cv=5, scoring='accuracy')

    train_data, cx_data, target_data, cx_target_data = cross_validation.train_test_split(
        train_data, target_data, test_size=0.2)

    selector = selector.fit(train_data, target_data)
    print(selector.score(cx_data, cx_target_data))

    cx_predictions = selector.predict(cx_data)
    print(classification_report(cx_target_data, cx_predictions))

    predictions = selector.predict(test_data)

    with open('output.csv', 'w') as o:
        o.write('PassengerId,Survived\n')
        for passenger, prediction in zip(test_ids, predictions):
            o.write('{},{}\n'.format(passenger, prediction))
def featureSelection(matrix, survival, genes):
    # train/test split
    matrix_train, matrix_test, survival_train, survival_test = train_test_split(
        matrix, survival, test_size=0.2, random_state=42)

    clf = BernoulliNB()
    clf.fit(matrix_train, survival_train)
    print("bernoulli classification accuracy")
    classificationAccuracy(matrix_test, survival_test)

    estimator = BernoulliNB()
    selector = RFECV(estimator, step=50, verbose=1)
    selector = selector.fit(matrix_train, survival_train)
    print(selector.ranking_)
    print(selector.predict(matrix_train))
    print(selector.predict(matrix_test))
    print("train data classification accuracy")
    classificationAccuracy(selector.predict(matrix_train), survival_train)
    print("test data classification accuracy")
    classificationAccuracy(selector.predict(matrix_test), survival_test)

    # PRECISION AND RECALL
    selector.transform(matrix_test)
    survival_score = selector.predict(matrix_test)
    average_precision = average_precision_score(survival_test, survival_score)
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    disp = plot_precision_recall_curve(selector, matrix_test, survival_test)
    disp.ax_.set_title('Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))
    plt.show()

    # GENE SELECTION
    # gene_indices = matrix[np.any(cdist(matrix[:,1:], matrix_train)==0, axis=1)]
    i = 0
    x = []
    while i < len(selector.ranking_):
        if selector.ranking_[i] == 1:
            try:
                entrez = cbioportal.Genes.getGeneUsingGET(
                    geneId=genes[i]).result()
                mutation = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
                    entrezGeneId=entrez.entrezGeneId,
                    molecularProfileId='brca_tcga_pan_can_atlas_2018_mutations',
                    sampleListId='brca_tcga_pan_can_atlas_2018_all').result()
                if len(mutation) > 20:
                    x.append(genes[i])
            except:
                pass
        i = i + 1
    print("training genes {} ".format(x))
def call_function():
    try:
        dataset = loadtxt("/".join([DATASET_FOLDER, 'heart.data']), delimiter=",")
        # split data into X and y
        X = dataset[:, 0:np.array(dataset).shape[1] - 1]
        Y = dataset[:, np.array(dataset).shape[1] - 1]
        X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.23, random_state=22)
        # use XGBoost as the model
        xg = XGBClassifier()
        # rank all features, i.e. continue the elimination until the last one
        rfe = RFECV(xg, cv=5, step=1)
        rfe.fit(X_train, y_train)
        y_important_pred = rfe.predict(X_test)
        print("Features sorted by their rank:")
        print(sorted(zip(map(lambda x: x, rfe.ranking_))))
        print(sorted(zip(map(lambda x: x, rfe.support_))))
        print(accuracy_score(y_test, y_important_pred.round()) * 100)
    except:
        e = sys.exc_info()[0]
        print("<p>Error: %s</p>" % e)
def random_forest(self, data, labels):
    if self.verbose:
        logging.info('Implementing Random Forest Classifier...')
    train, test, train_labels, test_labels = train_test_split(
        data.values, labels, test_size=self.test_size, random_state=self.random_state)
    train_scaled, test_scaled = scaleData(train, test)

    # print(f'Ravel shit RF: {train_labels.values.ravel()}\n')
    # print(f'Train type: {type(train)}')
    # print(f'Ravel type: {type(train_labels.values.ravel())}')

    rfc = RandomForestClassifier(random_state=101)
    # rfecv = RFECV(estimator=rfc, step=1, cv=StratifiedKFold(10), scoring='accuracy')
    rfecv = RFECV(estimator=rfc, step=1, cv=7, scoring='accuracy')
    rfecv.fit(train_scaled, train_labels.values.ravel())

    if self.verbose:
        logging.info('Optimal number of features: {}'.format(rfecv.n_features_))
    if self.verbose == 2:
        showGraph(rfecv.grid_scores_)

    prediction = rfecv.predict(test_scaled)
    score = accuracy_score(test_labels, prediction)

    if self.verbose:
        logging.info(f'RF Predictions Complete')
        logging.info('Score: {:0.2f}%\n'.format(score * 100))

    if score * 100 >= self.acceptance and self.save:
        savePickle(rfecv, 'RF', score * 100, self.sampleNumber)

    return score


import numpy as np
def benchmark_features_selection(clf, name):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_train, 2),
                  scoring='accuracy')
    rfecv.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    print(name + " Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    t0 = time()
    pred = rfecv.predict(X_test)
    test_time = time() - t0
    print("test time: %0.3fs" % test_time)

    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

    print("Saving data to database:")
    save_results_data(cursor, name, testing_identifiant_produit_list, pred)
    print()
    clf_descr = str(clf).split('(')[0]
    return clf_descr, train_time, test_time
def test_model(model, xtrain, ytrain, feature_list, prefix):
    """ use train_test_split to create validation train/test samples """
    xTrain, xTest, yTrain, yTest = train_test_split(xtrain, ytrain, test_size=0.4)

    if DO_RFECV:
        model.fit(xtrain, ytrain)
        if hasattr(model, 'coef_'):
            model = RFECV(estimator=model, verbose=0, step=1, scoring=score_fn, cv=3)

    model.fit(xTrain, yTrain)
    print 'score', model.score(xTest, yTest)
    ypred = model.predict(xTest)

    ### don't allow model to predict negative number of orders
    if any(ypred < 0):
        print ypred[ypred < 0]
        ypred[ypred < 0] = 0

    print 'RMSE', np.sqrt(mean_squared_error(ypred, yTest))
    # debug_output(model, feature_list)
    debug_plots(model, yTest, ypred, prefix)
    return
def predict(X_test, clf_object, x_train, y_train):
    rfecv = RFECV(estimator=clf_object, step=1, cv=5, scoring='accuracy')  # 5-fold cross-validation
    rfecv = rfecv.fit(x_train, y_train)
    y_pred = rfecv.predict(X_test)
    return y_pred, rfecv
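# A minimal usage sketch for the helper above. The iris data, the
# DecisionTreeClassifier estimator, and the split below are illustrative
# assumptions, not part of the original snippet.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
x_tr, x_te, y_tr, y_te = train_test_split(iris.data, iris.target, test_size=0.3, random_state=0)
y_pred, fitted_selector = predict(x_te, DecisionTreeClassifier(random_state=0), x_tr, y_tr)
print(accuracy_score(y_te, y_pred), fitted_selector.n_features_)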
def get_betareg_model(df_claims, test_size, seed):
    df_claims = replace_nans(df_claims)
    X_Train, X_Test, Y_Train, Y_Test = get_train_test(df_claims, test_size, seed)
    encoders = get_encoders()

    # Using separate pipelines for transformer and estimator due to RFECV's bug #6321
    transformer_pipe = Pipeline(encoders)
    linear_model = RFECV(estimator=LinearRegression(),
                         scoring='neg_mean_squared_error', step=1, cv=5)

    transformer_pipe.fit(X_Train)
    X_Train_transformed = transformer_pipe.transform(X_Train)
    X_Test_transformed = transformer_pipe.transform(X_Test)

    linear_model.fit(X_Train_transformed, Y_Train)
    linear_preds = linear_model.predict(X_Test_transformed)

    result = {
        'lreg_model': linear_model,
        'lreg_preds': linear_preds,
        'transformer': transformer_pipe,
        'features': get_transformed_column_names(X_Train)
    }
    return result
def train_test(X_train, Y_train, X_test, Y_test, cv_params, custom_grid=False):
    if custom_grid:
        random_grid = load_grid(custom_grid)
    else:
        alpha = np.linspace(30000, 20000, 500)
        # solver = ['svd', 'cholesky', 'lsqr']
        # Create the random grid
        random_grid = {'alpha': alpha}  # 'solver': solver}
    print_grid(random_grid)

    estimator = Ridge(alpha=90000)
    ridge_random = RFECV(estimator, step=500, cv=5, verbose=10)
    # Random search of parameters, using 3 fold cross validation,
    # search across 100 different combinations, and use all available cores
    # ridge_random = RandomizedSearchCV(selector, param_distributions=random_grid, n_iter=cv_params["n_iter"],
    #                                   cv=cv_params["cv"], verbose=10, random_state=42, n_jobs=cv_params["n_jobs"],
    #                                   pre_dispatch='2*n_jobs')
    ridge_random.fit(X_train, Y_train)

    best_grid_params = {'alpha': 30000}
    best_random = ridge_random.get_support()
    best_model_params = ridge_random.get_params()

    train_predictions = ridge_random.predict(X_train)
    test_predictions = ridge_random.predict(X_test)

    # metrics
    r_train = pearsonr(Y_train, train_predictions)
    r_test = pearsonr(Y_test, test_predictions)
    mse_train = mse(Y_train, train_predictions)
    mse_test = mse(Y_test, test_predictions)
    metrics = {
        "r_train": r_train,
        "r_test": r_test,
        "mse_train": mse_train,
        "mse_test": mse_test
    }
    print(f"pearsonr train: {r_train}")
    print(f"pearsonr test: {r_test}")
    print(f"mse train: {mse_train}")
    print(f"mse test: {mse_test}")
    print(best_model_params)

    return best_grid_params, best_model_params, train_predictions, test_predictions, metrics, {}
def data_prediction():
    train, test = data_preprocessing()
    X = train.drop(columns=['gender'])
    y = train['gender']
    print('[INFO]....trainset shape: ', X.shape)
    print('[INFO]....testset shape: ', test.shape)

    encoding_columns = ['first_item_browsed']
    X, test = category_encoding(encoding_columns, 0.2, X, y, test)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=123)

    ########################## FOR BASE LGBM ############################################
    '''model = lgb.LGBMClassifier()
    model.fit(X_train,y_train)
    print('score on validation data: ',model.score(X_test,y_test))
    final_pred = model.predict(test)'''

    ########################## FOR LGBM USING RFECV ######################################
    print('[INFO]....Creating an LGBM model')
    print('[INFO]....Applying RFECV to select 150 features')
    model = lgb.LGBMClassifier()
    model = RFECV(estimator=model, step=10, min_features_to_select=150, scoring='accuracy')
    model.fit(X_train, y_train)
    X_train = model.transform(X_train)
    X_test = model.transform(X_test)
    test = model.transform(test)
    print('[INFO]....After transformation train shape :', X_train.shape)
    model = lgb.LGBMClassifier()
    model.fit(X_train, y_train)
    print('score on validation data: ', model.score(X_test, y_test))
    final_pred = model.predict(test)

    ########################## FOR STACKING PURPOSE ######################################
    '''basemodel_1,basemodel_2,basemodel_3,meta_model = stacking_models(X_train,X_test,y_train,y_test)
    base_pred_test = np.column_stack((basemodel_1.predict_proba(test)[:,1],basemodel_2.predict_proba(test)[:,1],\
        basemodel_3.predict_proba(test)[:,1]))
    final_pred = meta_model.predict(base_pred_test)'''

    ########################## FOR NEURAL NETWORK PURPOSE ################################
    # model = neural_net(X_train,y_train,X_train.shape[1])
    # pd.Series(dict(zip(X.columns.tolist(),model.feature_importances_))).sort_values(ascending=False).head(20).plot(kind='bar')

    return final_pred
def select_rfecv():
    data, x, y = data_drop()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
    clf_rf = RandomForestClassifier()
    rfecv = RFECV(estimator=clf_rf, step=1, cv=5, scoring='accuracy')
    rfecv = rfecv.fit(x_train, y_train)

    print('best number: ', rfecv.n_features_)
    print('best features: ', x_train.columns[rfecv.support_])

    ac = accuracy_score(y_test, rfecv.predict(x_test))
    print('Acc is: ', ac)
    cm = confusion_matrix(y_test, rfecv.predict(x_test))
    sns.heatmap(cm, annot=True, fmt='d')

    plt.figure()
    plt.xlabel('Number of features selected')
    plt.ylabel('cross validation score of number of selected features')
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
def RFE_score(model, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=1024, stratify=y)
    selector = RFECV(model, cv=3, scoring='f1')
    selector.fit(X_train, y_train)
    y_pred = selector.predict(X_test)
    score = f1_score(y_test, y_pred)
    return selector.get_support(indices=True), score
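# Minimal usage sketch for RFE_score above. The breast-cancer data and the
# LogisticRegression estimator are assumptions for illustration only.
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

data = load_breast_cancer()
kept_idx, holdout_f1 = RFE_score(LogisticRegression(max_iter=5000), data.data, data.target)
print('kept feature indices:', kept_idx)
print('held-out F1:', holdout_f1)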
class rfe_LBC(li_LBC):

    def fit(self, X, Y):
        params = self.get_params()
        model = li_LBC(**params)
        self.rfe = RFECV(model)
        self.rfe.fit(X, Y)

    def predict(self, X):
        return self.rfe.predict(X)

    def score(self, X, Y):
        return self.rfe.score(X, Y)
class RFR:

    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = RandomForestRegressor(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(axis=1)]
        # compare against the filtered sample so the warning fires when rows were dropped
        if X_.shape[0] != X.shape[0]:
            print('FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                  .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            self.rfe = RFECV(self.model)
            self.rfe.fit(X_, y_)
        else:
            self.model.fit(X_, y_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        # same row-count check as in fit
        if X_.shape[0] != X.shape[0]:
            print('PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                  .format(X.shape[0] - X_.shape[0]))
        Z = numpy.full(shape=(X.shape[0], 1), fill_value=numpy.nan, dtype=numpy.float64)
        if self.rfe_cv:
            Z[nan_mask, :] = self.rfe.predict(X_).reshape(-1, 1)
        else:
            Z[nan_mask, :] = self.model.predict(X_).reshape(-1, 1)
        return Z

    def set_params(self, **kwargs):
        self.model.set_params(**kwargs)

    @property
    def feature_importances_(self):
        return self.model.feature_importances_
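# Illustrative sketch of how the RFR wrapper above behaves with missing values:
# rows containing NaN are dropped when fitting and come back as NaN predictions.
# The toy arrays and forest settings are assumptions, not original code.
import numpy

X_demo = numpy.array([[1.0, 2.0], [2.0, 1.0], [numpy.nan, 3.0], [4.0, 5.0]])
y_demo = numpy.array([1.0, 2.0, 3.0, 4.0])

reg = RFR(rfe_cv=False, n_estimators=10, random_state=0)
reg.fit(X_demo, y_demo)      # the NaN row is excluded from training
print(reg.predict(X_demo))   # shape (4, 1); the NaN row stays NaN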
class SupM1DScikit(SupM1D):

    def __init__(self, _model, rfe_enabled=False, grid_cv=None, *args, **kwargs):
        self.rfe = None
        self.rfe_enabled = rfe_enabled
        self.grid = None
        self.grid_cv = grid_cv
        self._model = _model
        self.model = self._model(*args, **kwargs)

    def _fit(self, X, y):
        Z = numpy.concatenate([X, y], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(axis=1)]
        # compare against the filtered sample so the warning fires when rows were dropped
        if X_.shape[0] != X.shape[0]:
            print('FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        if self.grid_cv is not None:
            self.grid = GridSearchCV(estimator=self.model, param_grid=self.grid_cv)
            self.grid.fit(X_, y_)
            self.model = self._model(**self.grid.best_params_)
            if self.rfe_enabled:
                self.rfe = RFECV(self.model)
                self.rfe.fit(X_, y_)
        elif self.rfe_enabled:
            self.rfe = RFECV(self.model)
            self.rfe.fit(X_, y_)
        else:
            self.model.fit(X_, y_)

    def _predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        # same row-count check as in _fit
        if X_.shape[0] != X.shape[0]:
            print('PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'.format(X.shape[0] - X_.shape[0]))
        Z = numpy.full(shape=(X.shape[0], 1), fill_value=numpy.nan, dtype=numpy.float64)
        if self.rfe_enabled:
            Z[nan_mask, :] = self.rfe.predict(X_).reshape(-1, 1)
        else:
            Z[nan_mask, :] = self.model.predict(X_).reshape(-1, 1)
        return Z
def ProcessTicker(self, df, y, pred, skipPredict=False):
    X = df.ix[1:, :-1].values
    # we want to predict using the last record
    pred = pred.ix[:, :-1].values
    y = np.delete(y, (0), axis=0)

    # normalize the data for X
    X = preprocessing.StandardScaler().fit_transform(X)

    # split the data for the accuracy calculations
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # get the test classifier
    clf_test = self._clf_test
    # fit the test classifier
    clf_test.fit(X_train, y_train)
    # predict from the X_test
    y_pred = clf_test.predict(X_test)
    # score y_test against the classifier that was trained on the train data
    acc = clf_test.score(X_test, y_test)
    print("accuracy : {}".format(acc))

    # calculate the confusion matrix
    confusionmatrix = confusion_matrix(y_test, y_pred, labels=[-1, 0, 1])

    final = [0]
    if (skipPredict == True):
        # print(confusionmatrix)
        return acc, confusionmatrix, final

    # get the actual classifier
    clf = self._clf_actual
    # use the actual classifier and recursive feature elimination to select the best number of features
    m = RFECV(clf, scoring='accuracy')
    # fit the classifier
    m.fit(X, y)
    # predict the final recommendation/decision
    final = m.predict(pred)
    print("Final:{}".format(final))
    return acc, confusionmatrix, final
def main():
    dataset = load_my_data()
    X = dataset.data
    y = dataset.target - 1

    # feature normalization
    X = preprocessing.scale(X)
    # split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    # Create the RFE object and compute a cross-validated score.
    clf = SVC(kernel='linear')
    # scoring is macro-averaged recall over the three classes
    ticks = time.time()
    rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(5), scoring='recall_macro')
    rfecv.fit(X_train, y_train)
    print('Time Elapse: {}'.format(time.time() - ticks))
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()

    # confusion matrix
    y_pred = rfecv.predict(X_test)
    class_names = ['remission', 'hypomania', 'mania']
    plt.figure()
    plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes=class_names,
                          title='Confusion matrix without normalization')
    plt.figure()
    plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes=class_names,
                          normalize=True, title='Confusion matrix')
def execute(self):
    # create model
    estimator = LGBMRegressor(boosting_type='gbdt', objective='regression', metric='mae',
                              num_iterations=10000, learning_rate=0.001, num_leaves=350,
                              max_depth=9, min_data_in_leaf=100)
    selector = RFECV(estimator, step=1, cv=4, scoring="neg_mean_squared_error", verbose=-1)
    selector.fit(self.partitions.x_train, self.partitions.y_train)

    # # prediction
    # y_pred = selector.predict(self.partitions.x_test)

    # select only the best features
    selector.fit(self.partitions.x_train[:, selector.support_], self.partitions.y_train)
    y_pred = selector.predict(self.partitions.x_test[:, selector.support_])

    # number of best features
    self.n_features = selector.n_features_
    # which categories are best
    self.best_features = selector.support_
    # rank features best (1) to worst
    self.feature_ranking = selector.ranking_

    return y_pred, self.partitions.y_test
def recursive_feature_elimination_cv(config_learning, config_data):
    output = open(os.path.expanduser(config_data.get("Learner", "models")) + "/" + "feature_ranks.txt", "w")

    feature_names = FeatureExtractor.get_features_from_config_file_unsorted(config_data)
    combination_methods = FeatureExtractor.get_combinations_from_config_file_unsorted(config_data)

    x_train = read_features_file(config_learning.get('x_train'), '\t')
    y_train = read_reference_file(config_learning.get('y_train'), '\t')
    x_test = read_features_file(config_learning.get('x_test'), '\t')

    estimator, scorers = learn_model.set_learning_method(config_learning, x_train, y_train)

    scale = config_learning.get("scale", True)
    if scale:
        x_train, x_test = scale_datasets(x_train, x_test)

    rfecv = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(y_train, 2), scoring='accuracy')
    rfecv.fit(x_train, y_train)

    feature_list = []
    for i, feature_name in enumerate(feature_names):
        if combination_methods[i] == 'both':
            feature_list.append(feature_name)
            feature_list.append(feature_name)
        else:
            feature_list.append(feature_name)

    for i, name in enumerate(feature_list):
        output.write(name + "\t" + str(rfecv.ranking_[i]) + "\n")

    output.close()

    predictions = rfecv.predict(x_test)

    return predictions
# ----
# Sane?
assert X.shape[0] == chunks.shape[0], "X and chunks length mismatch"
assert np.sum(chunks == -1) == 0, "Chunks is malformed"

# ----
# Classify
# ----

# ----
# Using RFE and linear SVM
clf = SVC(C=10, kernel="linear")
rfecv = RFECV(estimator=clf, step=1, cv=cv, scoring="accuracy")
rfecv.fit(X, y)
prediction = rfecv.predict(X)

print("Optimal feature number {0}".format(rfecv.n_features_))
print("Feature ranks {0}".format(rfecv.ranking_))

accs = accuracy_score(y, prediction)
#print(classification_report(y, prediction))
#print("Overall accuracy: {0}, Chance: {1}.".format(
#    np.round(accuracy_score(y, prediction), decimals=2),
#    1.0/len(np.unique(y))))
# ----

# ----
# Using GradientBoostingClassifier
#clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
#                                 max_depth=1, random_state=0)
#accs = cross_val_score(clf, X, y=y, scoring="accuracy", cv=cv,
#                       n_jobs=1, verbose=0,
def predictAndPlot(data, header, features, name):
    print "\n%s" % name
    # First reduce the data to relevant features.
    features_plus_date = np.hstack((0, features))
    analyzed_data = data[:, features_plus_date]

    # Remove rows with missing data.
    for i in range(len(analyzed_data[0])):
        analyzed_data = analyzed_data[analyzed_data[:, i] != '']

    # If it is a retention feature, skip the last X entries.
    if "retention" in name:
        if "1d" in name:
            retention_feature_linesSkipped = 3
        elif "3d" in name:
            retention_feature_linesSkipped = 7
        elif "7d" in name:
            retention_feature_linesSkipped = 15
        elif "14d" in name:
            retention_feature_linesSkipped = 29
        elif "28d" in name:
            retention_feature_linesSkipped = 57
        else:
            retention_feature_linesSkipped = 0
        analyzed_data = analyzed_data[:-retention_feature_linesSkipped, :]

    # The second-last line is # votes. If smaller than 50, skip this entry.
    # analyzed_data = analyzed_data[analyzed_data[:, -2].astype(float) >= min_daily_regs]

    # I added the date simply for plotting reasons. Just in case. Could be removed if not needed.
    dates = analyzed_data[:, 0]

    # Set best model and best score default values.
    best_model = ""
    best_score = -100

    # Iterate through all models to obtain the best parameters and features via cross validation
    for model_type in list_of_models:
        # Get training data X and y.
        X = analyzed_data[:, 1:-1].astype(float)  # Ignore dates (first column) and "y" (last column)
        y = analyzed_data[:, -1].astype(float)
        model = define_model(model_type)  # Set model parameters based on model_type

        # Perform differently depending on which model is used.
        # Random Forest has to be treated differently because it doesn't support RFECV.
        if model_type == "RF":
            to_be_used_threshold = "median"  # Default value. Will be overwritten.
            score = -100.
            # Loop through different thresholds. Use the one with the highest score.
            for model_threshold in ("10.*median", "3.*median", "1*median",
                                    "0.3*median", "0.1*median", "0.03*median"):
                try:
                    # Use only the "model_threshold" best features.
                    model.fit(X, y)
                    X_new = model.transform(X, threshold=model_threshold)
                    header_new = model.transform(header[features][:-1], threshold=model_threshold)
                    # Fit the model again with reduced features X_new and return out of bag score.
                    model.fit(X_new, y)
                    rf_score = model.oob_score_
                    # I try to keep the amount of features as small as possible.
                    # The rf_score of a model with more features needs to be 2% better to justify more params.
                    # In some cases the score is negative so it also needs to be better overall.
                    if (rf_score > score * 1.02) and (rf_score > score):
                        score = rf_score
                        to_be_used_threshold = model_threshold
                except:
                    # Just a debug output.
                    print "There was an error at model threshold: %s" % model_threshold
            print "Score is %2.3f with threshold: %s" % (score, to_be_used_threshold)
        elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
            selector = RFECV(model)
            selector = selector.fit(X, y)
            header_new = header[features][:-1]
            score = selector.score(X, y)
            print "Score is %2.3f with model: %s" % (score, model_type)
        else:
            print "Something went wrong!"

        if score > best_score:
            best_score = score
            best_model = model_type
    print "Best score is %2.3f with model: %s" % (best_score, best_model)

    # Predict using the best model, parameters and features, obtained before.
    model_type = best_model
    model = define_model(model_type)
    if model_type == "RF":
        # In some rare cases the model does not work, because all features were discarded.
        # Therefore try to do it again without a threshold, that should always work (model_threshold).
        try:
            model.fit(X, y)
            X_new = model.transform(X, threshold=to_be_used_threshold)
            header_new = model.transform(header[features][:-1], threshold=to_be_used_threshold)
            model.fit(X_new, y)
            prediction = model.predict(X_new)
            score = model.oob_score_
        except:
            print "Fitting the model didn't work! The prediction might be sub-optimal. \nThreshold: %s" % model_threshold
            model.fit(X, y)
            prediction = model.predict(X)
            # score = model.oob_score_
            score = 0
    elif model_type in ("ElasticCV", "Elastic", "linear", "LassoCV"):
        selector = RFECV(model)
        selector = selector.fit(X, y)
        header_new = header[features][:-1]
        prediction = selector.predict(X)
        score = selector.score(X, y)
    else:
        print "lol!"

    # Now derive the importances respectively feature coefficients.
    try:
        # This only works with "RF"
        importances = model.feature_importances_
        importances_list = np.vstack((importances, header_new))
        importances_list = np.transpose(importances_list)
        importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::-1]
    except:
        # This should work with all other models.
        try:
            X_new = selector.transform(X)
            header_new = selector.transform(header_new)
            model.fit(X_new, y)
            med_value = np.median(X_new, axis=0)
            med_value[med_value == 0] = np.mean(X_new, axis=0)[med_value == 0]
            importances = model.coef_ * np.median(X_new, axis=0)
            importances_list = np.vstack((importances, header_new))
            importances_list = np.transpose(importances_list)
            importances_list = importances_list[importances_list[:, 0].astype(float).argsort()][::1]
        except:
            # If the above doesn't work, just give a blank output.
            importances_list = np.zeros((10, 2))

    score = "%s, %s\nOOB Score = %2.2f" % (name, model_type, score)
    plot_predictionVsActual(prediction, y, score)
    return prediction, y, dates, importances_list
def rfecv_classifier(method, train_data, train_class, test_data, CV_=3,
                     fraction_feat_to_keep=0.1, LM_params=get_ML_parameters(),
                     save_model=False):
    n_orig_features = len(list(train_data))
    max_ratio_diff = 1.2

    global have_written_params_to_file
    if have_written_params_to_file is False:
        logging.info("Run settings for models:")
        logging.info(str(LM_params))
        have_written_params_to_file = True

    # set classifier method
    clf = set_up_classifier(method, CV_, LM_params)

    # fit and predict based on whether cross validation is used
    if (CV_ > 1):
        step_elim = (1 - fraction_feat_to_keep) / CV_
        # Recursive feature elimination with Cross Validation
        # CV might have issues if data set classification is poorly balanced and can not split it properly
        try:
            rfecv = RFECV(estimator=clf, step=step_elim,
                          cv=StratifiedKFold(n_splits=CV_, random_state=0),
                          scoring='accuracy')
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)
            current_fraction_features = rfecv.n_features_ / n_orig_features
            if (current_fraction_features * max_ratio_diff < fraction_feat_to_keep):
                raise ValueError("Not enough features kept by RFECV defaulting to RFE")
        except ValueError:
            rfecv = RFE(estimator=clf, step=step_elim,
                        n_features_to_select=int(fraction_feat_to_keep * len(list(train_data))))
            rfecv.fit(train_data, train_class)
            preds = rfecv.predict(test_data)

        mask = list(rfecv.support_)
        features = train_data.columns
        features_selected = [features[i] for i in range(0, len(mask)) if mask[i]]

        # sometimes RFECV does not eliminate enough features, so then lets run RFE to remove more if more than 20% over
        current_fraction_features = len(features_selected) / n_orig_features
        step_elim = (current_fraction_features - fraction_feat_to_keep) / CV_
        if (current_fraction_features > max_ratio_diff * fraction_feat_to_keep) and step_elim > 0:
            rfecv = RFE(estimator=clf, step=step_elim,
                        n_features_to_select=int(fraction_feat_to_keep * n_orig_features))
            rfecv.fit(train_data[features_selected], train_class)
            preds = rfecv.predict(test_data[features_selected])
            mask = list(rfecv.support_)
            features = train_data.columns
            features_selected = [features[i] for i in range(0, len(mask)) if mask[i]]
    else:
        clf.fit(train_data, train_class)
        preds = clf.predict(test_data)

    return preds, features_selected, sum(mask)
feature1 = SVD1.fit_transform(feature1)
feature2 = SVD2.fit_transform(feature2)
feature1 = feature1[labels != 2]
feature2 = feature2[labels != 2]
labels = labels[labels != 2]
feature = np.hstack((feature1, feature2))

from sklearn.feature_selection import RFECV

sfk = cv.StratifiedKFold(labels, 10)
scores = []
for train, test in sfk:
    score = []
    train_set = feature[train]
    test_set = feature[test]
    clf = RFECV(LinearSVC(C=100), cv=cv.StratifiedKFold(labels[train], 10), scoring='f1')
    clf.fit(train_set, labels[train])
    pred = clf.predict(test_set)
    score.append(accuracy_score(labels[test], pred))
    score.append(precision_score(labels[test], pred))
    score.append(recall_score(labels[test], pred))
    score.append(f1_score(labels[test], pred))
    scores.append(score)
avg = np.average(scores, axis=0)
print avg
if PLOT:
    plot_n_features_vs_score(rfecv.grid_scores_)

# Testing the model
print('>> Testing model\n')

if DISJOINT_TESTING:
    TEST_DATA = pd.read_csv(TEST_DATA_PATH, sep='\t', index_col=0).values
    X_test = TEST_DATA[:, 0:-1]
    y_test = TEST_DATA[:, -1]
    # Normalizing the test data
    X_test = normalizer.transform(X_test)

print(f'Cross validation : Stratified {K_FOLD}-Fold\n')
print(f'Performance metric used for model optimization : "{SCORING}"\n')

# Testing the model with the test set
y_pred = rfecv.predict(X_test)

# Printing model scores
print_scores(y_test, y_pred)

# Stopping the timer
duration = time() - start
print('Operation took:',
      f'{duration:.2f} seconds.\n' if duration < 60 else f'{duration / 60:.2f} minutes.\n')
print(f'\nProcess ended at :\n\nDate : {dt.today().strftime("%x")}\nTime : {dt.today().strftime("%X")}\n')

# Converting a selected section of the dataset_operations to a numpy array (based on best features)
data_matrix = DATA[get_selected_features() + ['StepLabel']].values
X = data_matrix[:, 0:-1]
y = data_matrix[:, -1]
                                                   test_size=0.33, random_state=42)

l1 = LogisticRegression()
l1.fit(X_train, y_train)
p1 = l1.predict(X_test)

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV

l2 = LogisticRegression()
rfecv = RFECV(estimator=l2, step=1, cv=StratifiedKFold(2), scoring='accuracy')
rfecv.fit(X_train, y_train)
print(rfecv.transform(X_train)[:1, :])
print(X_train.head(1))
print('By comparing the two we find the feature not selected')
print('Number of best suited features using RFECV')
print(rfecv.n_features_)
p2 = rfecv.predict(X_test)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
scaled_data = scaler.transform(X_train)

from sklearn.decomposition import PCA
pca = PCA(n_components=1)
pca.fit(scaled_data)
xtrain_pca = pca.transform(scaled_data)
xtest_pca = pca.transform(scaler.transform(X_test))

l3 = LogisticRegression()
l3.fit(xtrain_pca, y_train)
p3 = l3.predict(xtest_pca)

df_comp = pd.DataFrame(pca.components_, columns=X.columns)
print('PCA components for the features')
print(df_comp)
Y = dataset.iloc[:, -1].values

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Feature scaling of training and testing data
scaler = StandardScaler()
scaler.fit(X)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## After testing everything we used a support vector classifier with RFECV ##
clfsvm = SVC(kernel='linear')
selected = RFECV(clfsvm, n_jobs=-1)
selected.fit(X_train, y_train)
y_pred = selected.predict(X_test)

# Accuracy check
c = 0
for i in range(X_test.shape[0]):
    if y_pred[i] != y_test[i]:
        c = c + 1
print((X_test.shape[0] - c) / X_test.shape[0] * 100)

# Answer
# Importing datasets
test_set = pd.read_csv("test.csv")
y_test = test_set.values
# Feature scaling
y_test = scaler.transform(y_test)
targets = data_train['TARGET']
train_data = data_train.drop(labels=['EID', 'TARGET'], axis=1)

# Split the sample set
train_x, test_x, train_y, test_y = train_test_split(train_data, targets, test_size=0.5, random_state=66)

# Set the parameters
xgb = XGBClassifier(n_estimators=300, max_depth=5, nthread=20, scale_pos_weight=4, learning_rate=0.07)

# Feature selection
rfecv = RFECV(estimator=xgb, step=10, cv=StratifiedKFold(3), n_jobs=20, scoring='roc_auc')
rfecv.fit(train_x, train_y)
pre_y = rfecv.predict_proba(test_x)[:, 1]
pre_y_categ = rfecv.predict(test_x)

# Compute the AUC
fpr, tpr, thresholds = metrics.roc_curve(test_y, pre_y)
auc = metrics.auc(fpr, tpr)
f1 = metrics.f1_score(test_y, pre_y_categ)
print("AUC score:")
print(auc)
print('f1-score:')
print(f1)
print("Optimal number of features :")
print(rfecv.ranking_)
print('n_features_')
print(rfecv.n_features_)
print('support_')
print(rfecv.support_)

total_time = time() - t0
X1 = parser.iloc[:, 3:len(parser.columns)].values  # [4,5,7,8,9]
Y1 = parser.iloc[:, 0].values

# splitting data set to training set and test set
X1_train, X1_test, Y1_train, Y1_test = non_shuffling_train_test_split(X1, Y1, test_size=0.25)

# feature scaling
sc_X = StandardScaler()
X1_train = sc_X.fit_transform(X1_train)
X1_test = sc_X.transform(X1_test)

# feature selection using recursive feature elimination & training classifier
classifier1 = RFECV(SVC(kernel="linear", random_state=0), scoring="accuracy")
# classifier1 = RFECV(LogisticRegression(random_state=0), scoring='accuracy')
classifier1.fit(X1_train, Y1_train)

# predict the test set result
Y1_pred = classifier1.predict(X1_test)

# to be used in part 2
tested_data, result_part1 = Y1_test, Y1_pred

####### performance of part 1
# confusion matrix
cm = confusion_matrix(tested_data, result_part1)
print("confusion_matrix:\n", cm)
# accuracy
print("accuracy = ", accuracy_score(tested_data, result_part1))
# recall
print("recall = ", recall_score(tested_data, result_part1))
# precision
print("precision = ", precision_score(tested_data, result_part1))
        ratio = live / len(testData[i])
        print("%%live: ", ratio, "| name: ", dataName[i])

    importances = rfe.ranking_
    indices = np.argsort(importances)

    print("Feature ranking:")
    for f in range(n_features):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

##################
# RANDOM FOREST WITH RFE WITH CV
if (fit_data_rfe_cv):
    for i in range(0, len(testData)):
        pred = rfe_cv.predict(testData[i])
        live = sum(pred)
        ratio = live / len(testData[i])
        print("%%live: ", ratio, "| name: ", dataName[i])

    importances = rfe_cv.ranking_
    indices = np.argsort(importances)

    print("Feature ranking:")
    for f in range(n_features):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

if (feat_roc):
    half = int(n_samples / 2)
    x, y = shuffle(x, y, random_state=random_state)
    X_train, X_test = x[0:half], x[half:-1]
data_USA_target = data_USA['target']
data_USA.drop(['num', 'id', 'target'], axis=1, inplace=True)
data_USA = pd.get_dummies(data_USA, columns=['cp', 'restecg', 'slope', 'thal', 'loc'])
data_std = Standardize(data_USA)
data_std['target'] = data_USA_target
print("Data preprocessed...")

data = data_std.as_matrix()
train_x, test_x, train_y, test_y = train_test_split(data[:, 0:-1], data[:, -1], train_size=0.75)
names = list(data_USA.columns.values)

print("Executing Recursive Feature Elimination in SVM...")
svc = SVC(kernel="linear", C=5)
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10), scoring='accuracy')
rfecv.fit(train_x, train_y)
Training_score = rfecv.score(train_x, train_y)
predicted = rfecv.predict(test_x)
accuracy = accuracy_score(test_y, predicted)

print("The support array \n", rfecv.support_)
print("The ranking array \n", rfecv.ranking_)
print(sorted(zip(map(lambda x: round(x, 4), rfecv.ranking_), names)))
print("Training Accuracy is ", Training_score)
print("Test Accuracy is ", accuracy)
print("The Cross-validation score :", max(rfecv.grid_scores_))
print("Optimal number of features : {}".format(rfecv.n_features_))

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
def main(
        iterations=1,
        output_dir='output',
        one_hot_reform=True,
        rfecv_eval=True,
        deterministic=False,
        grid_search_eval=True,
        shuffle_holdout=True,
        plot_rfecv_gridscore=True,
        optimum_gbr_estimate=True,
        max_gbr_iterations=5000,
        plot_all_gridscores=True,
        holdout_size=0.15,
        crossfolds=5,
        one_hot_reform_categories=ONE_HOT_REFORM_CATEGORIES_,
        database_path=DATABASE_PATH,
        target_data_column_name='depAttEff',
        gbr_parameter_grid_=GBR_PARAMETER_GRID_,
        gbr_initial_params=GBR_INITIAL_PARAMS_):

    # input database
    database_basename = os.path.basename(DATABASE_PATH)
    # output directory
    output_dir = os.path.join(output_dir, 'regressor')
    make_dirs(output_dir)

    # initialize predicted and holdout tracking
    rfecv_y_predicted_track = []
    rfecv_y_holdout_track = []
    # initialize score tracking
    score_track_mae = []
    score_track_r2 = []
    rfecv_gridscore_track = []
    # initialize feature tracking and ranking
    feature_track = []
    feature_rank = []

    training_data = pd.read_csv(database_path)
    target_data = training_data[target_data_column_name]
    # make sure to drop the target data from the training data
    training_data = training_data.drop([target_data_column_name], 1)

    # initialize the regressor with initial params
    clf = GradientBoostingRegressorWithCoef(**gbr_initial_params)

    if one_hot_reform:
        training_data, _, _ = one_hot_dataframe(
            training_data, one_hot_reform_categories, replace=True)

    for run in xrange(iterations):
        print "run: ", run + 1
        y_all = np.array(target_data)
        x_all = training_data.as_matrix()

        if shuffle_holdout:
            random_state = _SEED if deterministic else None
            sss = cross_validation.ShuffleSplit(len(y_all), n_iter=1,
                                                test_size=holdout_size,
                                                random_state=random_state)
            for train_index, test_index in sss:
                x_train, x_holdout = x_all[train_index], x_all[test_index]
                y_train, y_holdout = y_all[train_index], y_all[test_index]

        '''The logic is to optimize the parameters for all the features before RFECV'''
        if grid_search_eval:
            if optimum_gbr_estimate:
                # initial params for optimum finding
                # determine minimum number of estimators with least overfitting
                opt_gbr = np.arange(max_gbr_iterations) + 0
                test_score = heldout_score(clf, x_train, y_train, max_gbr_iterations)
                test_score /= test_score[0]
                test_best_iter = opt_gbr[np.argmin(test_score)]
                print test_best_iter
                # triple the optimum number of estimators.
                gbr_parameter_grid_['n_estimators'] = [test_best_iter]
            # then implement grid search alg.
            grid_searcher = grid_search.GridSearchCV(estimator=clf, cv=crossfolds,
                                                     param_grid=gbr_parameter_grid_,
                                                     n_jobs=-1)
            # call the grid search fit using the data
            grid_searcher.fit(x_train, y_train)
            # store and print the best parameters
            best_params = grid_searcher.best_params_
        else:
            ''' The logic is that if we don't do grid search, use initial params as 'best' '''
            best_params = gbr_initial_params

        # re-initialize and fit with the "best params"
        clf = GradientBoostingRegressorWithCoef(**best_params)
        clf.fit(x_train, y_train)

        if rfecv_eval:
            rfecv = RFECV(estimator=clf, step=1, cv=crossfolds,
                          scoring='mean_absolute_error')
            # perform rfecv fitting
            rfecv.fit(x_train, y_train)
            # track predicted y values
            rfecv_y_predicted = rfecv.predict(x_holdout)
            rfecv_y_predicted_track.append(rfecv_y_predicted)
            # track truth y_holdout values
            rfecv_y_holdout_track.append(y_holdout)
            # track grid score rankings
            rfecv_gridscore_track.append(rfecv.grid_scores_)
            # track MAE performance of estimator to predict holdout
            score_track_mae.append(metrics.mean_absolute_error(rfecv_y_predicted, y_holdout))
            # track overall r2 performance to predict holdout
            score_track_r2.append(metrics.r2_score(rfecv_y_predicted, y_holdout))
            # create array of feature ranks (contains all features)
            feature_rank.append(rfecv.ranking_)
            feat_names = np.array(list(training_data), copy=True)
            # create array of only selected features
            rfecv_bool = np.array(rfecv.support_, copy=True)
            sel_feat = list(compress(feat_names, rfecv_bool))
            feature_track.append(sel_feat)

        if plot_rfecv_gridscore and rfecv_eval:
            plt.plot(rfecv_y_predicted, y_holdout, '+')
            plt.plot(y_holdout, y_holdout, 'r-')
            plt.show()
            plt.xlabel("Number of features selected")
            plt.ylabel("Cross validation score (MAE)")
            plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
            plt.show()

    # Output used to plot the rank of each feature relatively.
    feature_rank_df = pd.DataFrame(feature_rank)
    feature_rank_df.columns = feat_names
    feature_rank_df = feature_rank_df.transpose()
    feature_rank_df.to_csv('feature_rank_df.csv')

    # Output used to plot only the best features
    feature_track = pd.DataFrame(feature_track)
    feature_track = feature_track.transpose()
    feature_track.to_csv('feature_track.csv')

    # overall r2 value for all runs
    overall_r2 = metrics.r2_score(
        np.array(rfecv_y_predicted_track).ravel(order='C'),
        np.array(rfecv_y_holdout_track).ravel(order='C'))

    # Output to plot the predicted y values
    rfecv_y_predicted_track = pd.DataFrame(rfecv_y_predicted_track).transpose()
    rfecv_y_predicted_track.to_csv('rfecv_y_predicted_track.csv')

    # Output to plot the holdout y values (truth)
    rfecv_y_holdout_track = pd.DataFrame(rfecv_y_holdout_track).transpose()
    rfecv_y_holdout_track.to_csv('rfecv_y_holdout_track.csv')

    # Output used to plot the optimum model MAE
    score_track_mae = pd.DataFrame(score_track_mae).transpose()
    score_track_mae.to_csv('score_track_mae.csv')
    print score_track_mae

    # Output used to plot the optimum model r2
    score_track_r2 = pd.DataFrame(score_track_r2).transpose()
    score_track_r2.to_csv('score_track_r2.csv')

    # transpose dataframe for ease of viewing and plotting
    rfecv_gridscore_track = pd.DataFrame(rfecv_gridscore_track)
    rfecv_gridscore_track = rfecv_gridscore_track.transpose()
    rfecv_gridscore_track.to_csv('rfecv_gridscore_track.csv')

    if plot_all_gridscores:
        rfecv_gridscore_track.plot(kind='line')
        plt.show()
def main():
    filenameLB = 'mfcc_lb.csv'
    allsongcat = pickle.load(open('allsongcat.p', 'rb'))
    #hcdf = pickle.load(open('hcdf_fv.p', 'rb'))

    with open('mfcc_lb.csv') as f:
        reader = csv.reader(f)
        for row in reader:
            labels = row

    # select training and test sets
    '''
    TEidx = np.array(random.sample(range(0,1000), 100))
    training = []
    test = []
    trainingLB = []
    testLB = []

    # make numpy arrays
    for i in range(1000):
        if i in TEidx:
            test.append(featureDict[i])
            testLB.append(int(labels[i]))
        else:
            training.append(featureDict[i])
            trainingLB.append(int(labels[i]))

    # fit with classifier and predict
    X = np.array(training)
    Y = np.array(trainingLB)
    '''

    l = [allsongcat]
    all_feats = combineFeatures(l)

    feats_shuf = []
    labels_shuf = []
    index_shuf = range(len(labels))
    shuffle(index_shuf)
    for i in index_shuf:
        feats_shuf.append(all_feats[i])
        labels_shuf.append(int(labels[i]))

    X = np.array(feats_shuf)
    Y = np.array(labels_shuf)

    kf = KFold(1000, n_folds=3)
    cla = SVR(kernel="linear")
    selector = RFECV(cla, step=1, cv=3)
    selector = selector.fit(X, Y)

    scores = 0.0
    cm_all = np.zeros((10, 10), dtype=np.int)
    for train, test in kf:
        X_train, X_test, y_train, y_test = X[train], X[test], Y[train], Y[test]
        #cla.fit(X_train, y_train)
        predictions = selector.predict(X_test)
        scores += zero_one_loss(predictions, y_test)
        # Compute confusion matrix
        cm = confusion_matrix(y_test, predictions, labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        np.set_printoptions(precision=2)
        #print(cm_all)
        cm_all = np.add(cm_all, cm)

    print scores / 3

    plt.figure()
    plot_confusion_matrix(cm_all)
    plt.show()
df['SexN'] = df['Sex']
df1['SexN'] = df1['Sex']
enc = LabelEncoder()
df['SexN'] = enc.fit_transform(df['Sex'])
df1['SexN'] = enc.fit_transform(df1['Sex'])

X_train = df[['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeN', 'SexN']]
y_train = df['Survived']
X_test = df1[['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeN', 'SexN']]
X_test1 = df1[['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Fare', 'AgeN', 'SexN']]

svc = SVC(kernel='linear')
#svc = DecisionTreeClassifier(criterion='entropy')
rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(y_train, 5), scoring='accuracy')
rfecv.fit(X_train, y_train)
predictions = rfecv.predict(X_test)
print rfecv.score(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)

finlist = zip(X_test1['PassengerId'], predictions)
with open("/Users/prakashchandraprasad/Desktop/datasets/Titanic/Decision_tree_titanic7.csv", "wb") as f:
    writer = csv.writer(f)
    writer.writerow(["PassengerId", "Survived"])
    writer.writerows(finlist)
# Validate
if (options.validate):
    y_true = train_labels.values[:, 1].ravel().astype(int)
    validate.KFold(train.values[:, 4:], y_true)
    #validate.KFold(train.drop('subject', axis=1).values, y_true)

X_train = train.values[:, 4:]
X_test = test.values[:, 4:]

pca = PCA(n_components=70)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print 'training'
output = "py_{0}.csv".format(submission_id)

clf = LinearRegression()
y_true = train_labels.values[:, 1].ravel().astype(int)
rfecv = RFECV(estimator=clf, step=1, cv=StratifiedKFold(y_true, 4), scoring='roc_auc')
rfecv.fit(X_train, y_true)

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()

preds = rfecv.predict(X_test)
submission['Prediction'] = preds
submission.to_csv(output, index=False)
print 'Done'
X_perc = SelectPercentile(percentile=50).fit(X, Y)
X_selected = X_perc.transform(X)

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# saga works well on large datasets, is sensitive to feature scaling, and handles sparsity
rfeLoR = RFE(LogisticRegression(solver='saga', max_iter=1000), 100)
rfeLoR.fit(X, Y)
rfeLoR.n_features_

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
m_RFERFC = RFECV(RandomForestClassifier(n_estimators=100), scoring='accuracy')
m_RFERFC.fit(X, Y)  # returns model
X_RFERFC = m_RFERFC.predict(X)
m_RFERFC.score(X, Y)

from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
m_lasso = SelectFromModel(LassoCV())
m_lasso.fit(X, Y)
m_lasso.transform(X).shape
X_lasso = m_lasso.transform(X)
m_lasso.get_params()

mask = m_lasso.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
X.columns[mask]

# Using CV helps reduce selection bias due to the observations in the training set
for i in range(0, 6 * piece):
    col.append("max" + str(i + 1))
    col.append("mean" + str(i + 1))
    col.append("std" + str(i + 1))

train_target = train_data.iloc[:, -1]
train_data = train_data[col]

model = LogisticRegression(max_iter=20)
clf = RFECV(model, step=1, cv=5, n_jobs=-1)
clf = clf.fit(train_data, train_target)
pured_data = train_data.iloc[:, clf.support_]

model = LogisticRegression(max_iter=20)
clf = RFECV(model, step=1, cv=5, n_jobs=-1)
clf = clf.fit(pured_data, train_target)
pred = clf.predict(pured_data)

falsePositiveRate, truePositiveRate, thresholds = roc_curve(train_target, pred)
confu_mat = pd.crosstab(train_target, pred, rownames=['True'], colnames=['Predicted'], margins=True)
print("confusion matrix")
print(confu_mat)

print("parameters")
statLogitModel = sm.Logit(train_target, pured_data).fit_regularized()
print(statLogitModel.params)

print("P-values")
scores, pvalues = chi2(pured_data, train_target)
for i in range(len(pvalues)):
f_statistic, p_value, _ = sm.stats.diagnostic.het_goldfeldquandt(
    y_test, X_test, idx=1, alternative='two-sided')
print(p_value)

fig = sm.graphics.qqplot(stud_resid, line='45')

'''Recursive Feature Elimination with Cross-Validation'''

# Scoring functions
msle_func = make_scorer(mean_squared_log_error)
mse_func = make_scorer(mean_squared_error)

# Recursive Feature Elimination
estimator = LinearRegression()
selector = RFECV(estimator, cv=10)
# selector = RFE(estimator, n_features_to_select=20)
selector = selector.fit(X_train, np.log(y_train))
y_pred = selector.predict(X_test)

# Floor predictions at zero
y_pred[y_pred < 0] = y_train.mean()
y_pred = np.exp(y_pred)

# Scoring
rmse_score = np.sqrt(mean_squared_error(y_pred, y_test))
rmsle_score = np.sqrt(mean_squared_log_error(y_pred, y_test))
print('Selected features: {}'.format(X_train.columns[selector.support_]))
print('\nRMSE: {}'.format(rmse_score))
print('RMSLE: {}'.format(rmsle_score))

'''Ridge Regression'''

# Set the range of hyper-parameters to search
params = {'alpha': [1, 0.1, 0.01, 0.001, 0.0001]}
# print test_X.shape, test_Y.shape

logistic_reg = LogisticRegression()
logistic_reg.fit(train_X, train_Y)
print logistic_reg.score(test_X_1, test_Y_1)
# test_Y = logistic_reg.predict(test_X)
# result.to_csv('result.csv', encoding='utf-8', index=False)

Svc = SVC()
Svc.fit(train_X, train_Y)
print Svc.score(test_X_1, test_Y_1)
# test_Y = Svc.predict(test_X)

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
print model.score(test_X_1, test_Y_1)
# test_Y = model.predict(test_X)

rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(train_Y, 2), scoring='accuracy')
rfecv.fit(train_X, train_Y)
print rfecv.score(test_X_1, test_Y_1)
test_Y = rfecv.predict(test_X)

passenger_id = full[891:].PassengerId
test = pd.DataFrame({'PassengerId': passenger_id, 'Survived': test_Y})
print test.shape
test.to_csv('pred.csv', index=False)
# Build a classification task using 3 informative features
#X, y = make_classification(n_samples=1000, n_features=25, n_informative=3,
#                           n_redundant=2, n_repeated=0, n_classes=8,
#                           n_clusters_per_class=1, random_state=0)

# Create the RFE object and compute a cross-validated score.
#svc = SVC(kernel="linear")
# The "accuracy" scoring is proportional to the number of correct
# classifications
RF_rfecv = RFECV(estimator=modeltry, step=1, cv=StratifiedKFold(2), scoring='accuracy')
RF_rfecv.fit(train_me_X, train_me_y)
features = RF_rfecv.get_support(indices=False)
RF_rfecv_predict = RF_rfecv.predict(test_me_X)
RF_rfecv_accuracy = metrics.accuracy_score(RF_rfecv_predict, test_me_y)
print(features)
print("Optimal number of features : %d" % RF_rfecv.n_features_)
print(RF_rfecv_accuracy)

# In[21]:

# Red will be 1, which means M (malignant); blue will be 0, which means B (benign)
color_function = {0: "blue", 1: "red"}
# mapping the color function onto the diagnosis column
colors = data["diagnosis"].map(lambda x: color_function.get(x))
pd.plotting.scatter_matrix(data[features_mean],
    elif rfecv.ranking_[i] == 8:
        print '8: feature ' + str(i) + ': ' + num_to_name[i]
    i += 1

'''
# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
'''

# now we do prediction
# load test data
new_X_test, new_y_test = load_svmlight_file('test_svm_data.txt')
test_results = rfecv.predict(new_X_test)
print test_results

i = 0
with open('test_rest_names.txt', 'r') as f:
    for line in f.readlines():
        if test_results[i] == 2:
            print line + ' will fail in 3 years'
        else:
            print line + ' is staying alive'
        i += 1
# plug it into recursive feature elimination with 10-fold cross-validation and MSE scoring metric
rfecv = RFECV(estimator=clf_7, step=1, cv=KFold(10), scoring='neg_mean_squared_error')

# the pipeline is going to help us retrieve the feature names;
# name your classifier and estimator whatever you want, and put them in tuples
pipeline = Pipeline([
    ('rfe_cv', rfecv),
    ('clf', clf_7)
])

# fit the pipeline
pipeline.fit(X_train, y_train)

# how'd we do on the test set?
mse = mean_squared_error(y_test, rfecv.predict(X_test))
print("MSE: %.4f" % mse)

# how many features did we really need?
print("Optimal number of features : %d" % rfecv.n_features_)

# the .named_steps attribute from the pipeline can be indexed with whatever you named your RFECV estimator;
# from there you use the .support_ attribute to help you get the feature names.
# note: support_feat is just a boolean mask you can apply to the full array of features
# to get just the ones used by the model
support_feat = pipeline.named_steps['rfe_cv'].support_

# aliasing that full array of features
feat_names = np.array(list(gbr_df2.drop(['Overall Achievement Score',
                                          'Overall Achievement Score Scaled',
                                          'SPG Score Scaled'], axis=1).columns))

# and pulling out the feature names with boolean masking
feat_names[support_feat]
scaler = preprocessing.StandardScaler()
scaler.fit(explanatory_df2)
explanatory_df2 = pandas.DataFrame(scaler.transform(explanatory_df2),
                                   columns=explanatory_df2.columns)

from sklearn.preprocessing import Imputer
numeric_features = df2.ix[:, df2.dtypes != "object"]
imputer_object = Imputer(missing_values="NaN", strategy="median", axis=0)
imputer_object.fit(numeric_features)
numeric_features = pandas.DataFrame(imputer_object.transform(numeric_features),
                                    columns=numeric_features.columns)

from sklearn.feature_selection import RFECV
from sklearn import tree

# rfe_cv is expected to be defined earlier in the original script (an RFECV instance); it is used as-is here
rfe_cv.fit(explanatory_df2, response_series)
rfe_cv.predict(explanatory_df2)

print "Optimal number of features :{0} of {1} considered".format(rfe_cv.n_features_, len(explanatory_df2.columns))
print rfe_cv.grid_scores_

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (ROC_AUC)")
plt.plot(range(1, len(rfe_cv.grid_scores_) + 1), rfe_cv.grid_scores_)
plt.show()

features_used = explanatory_df.columns[rfe_cv.get_support()]
print features_used

# IGNORE #