def classifiers_evaluation(df_res, y):
    """Compare a fixed set of scikit-learn classifiers by mean accuracy.

    Column ``1`` of ``df_res`` is vectorised with the module-level
    ``vectorizer(start, end)`` helper (fitted once on the full column). Every
    classifier is then fitted and scored on each split produced by a
    ``StratifiedShuffleSplit`` and the per-classifier mean accuracy is
    printed, first as a dict and then as a DataFrame.

    Args:
        df_res: DataFrame whose column ``1`` is fed to the vectoriser.
        y: Labels aligned with the rows of ``df_res``; indexed with the
            integer index arrays returned by the splitter.

    Returns:
        None. The results are only printed.
    """
    vect = vectorizer(start, end)
    vect.fit(df_res[1])

    classifiers = [
        KNeighborsClassifier(3),
        SVC(probability=True),
        DecisionTreeClassifier(),
        ensemble.RandomForestClassifier(),
        ensemble.AdaBoostClassifier(),
        ensemble.GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
        MLPClassifier(),
        # NOTE(review): recent scikit-learn releases renamed loss='log' to
        # 'log_loss' -- confirm against the installed version.
        SGDClassifier(loss='log', max_iter=100),
        LogisticRegressionCV(),
    ]

    # The second column keeps its historical label although the metric that
    # is actually computed below is accuracy.
    log_cols = ["Classifier", "ROC_AUC score"]

    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)

    acc_dict = {}
    n_splits_run = 0
    for train_index, test_index in sss.split(df_res, y):
        n_splits_run += 1
        X_train_ready = vect.transform(df_res.iloc[train_index][1])
        X_test_ready = vect.transform(df_res.iloc[test_index][1])
        y_train, y_test = y[train_index], y[test_index]

        for clf in classifiers:
            name = clf.__class__.__name__
            clf.fit(X_train_ready, y_train)
            test_predictions = clf.predict(X_test_ready)
            acc = accuracy_score(y_test, test_predictions)
            # acc = roc_auc_score(y_test, test_predictions)
            acc_dict[name] = acc_dict.get(name, 0.0) + acc

        # Release the per-split matrices before building the next ones.
        del X_train_ready, X_test_ready, y_train, y_test

    # Average over the number of splits that actually ran instead of a
    # hard-coded 10.0, so changing n_splits above cannot skew the means.
    for name in acc_dict:
        acc_dict[name] = acc_dict[name] / n_splits_run

    # Build the table in one go; DataFrame.append no longer exists in
    # pandas 2.x.
    log = pd.DataFrame(list(acc_dict.items()), columns=log_cols)

    print(acc_dict)
    print(log)
# Re-encode three columns of X in place with the encoder `enX`
# (defined earlier in the file, outside this fragment).
X[:, 13] = enX.fit_transform(X[:, 13])
X[:, 15] = enX.fit_transform(X[:, 15])
X[:, 18] = enX.fit_transform(X[:, 18])
#0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 13, 15, 16
# One-hot encode columns 6, 13, 15 and 18 and densify the result.
# NOTE(review): the `categorical_features` argument is not accepted by recent
# scikit-learn releases -- confirm against the installed version.
oneencX = OneHotEncoder(categorical_features=[6, 13, 15, 18])
X = oneencX.fit_transform(X).toarray()

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40)
print(y_train)

# Fit the gradient-boosting classifier and predict the held-out rows.
clf = ensemble.GradientBoostingClassifier(learning_rate=0.25, n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(y_pred)

import pylab as pl

# Show the confusion matrix as an image, then print the accuracy.
cm = metrics.confusion_matrix(y_test, y_pred)
pl.matshow(cm)
pl.title('Confusion Matrix')
pl.colorbar()
pl.show()
score = metrics.accuracy_score(y_test, y_pred)
print(score)
from sklearn.linear_model import Ridge from sklearn.preprocessing import PolynomialFeatures from sklearn import svm, model_selection, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process import seaborn as sns import warnings warnings.filterwarnings('ignore') # In[19]: #list of machine learning algorithms MLA = [ #ensemble method ensemble.AdaBoostClassifier(), ensemble.BaggingClassifier(), ensemble.ExtraTreesClassifier(), ensemble.GradientBoostingClassifier(), ensemble.RandomForestClassifier(), #gaussian processes gaussian_process.GaussianProcessClassifier(), #GLM linear_model.LogisticRegressionCV(), linear_model.PassiveAggressiveClassifier(), linear_model.RidgeClassifierCV(), linear_model.SGDClassifier(), linear_model.Perceptron(), #naive_bayes naive_bayes.BernoulliNB(), naive_bayes.GaussianNB(),
y_train = titanic_train['Survived'] #applying feature selection algorithm to get impactful features rf = ensemble.RandomForestClassifier(n_estimators=100) rf.fit(X_train, y_train) features = pd.DataFrame({ 'feature': X_train.columns, 'importance': rf.feature_importances_ }) features.sort_values(by=['importance'], ascending=True, inplace=True) features.set_index('feature', inplace=True) features.plot(kind='barh', figsize=(20, 20)) fs_model = feature_selection.SelectFromModel(rf, prefit=True) X_train1 = fs_model.transform(X_train) X_train1.shape selected_features = X_train.columns[fs_model.get_support()] #build model using selected features gb_estimator = ensemble.GradientBoostingClassifier(random_state=100) gb_grid = { 'n_estimators': list(range(50, 301, 50)), 'learning_rate': [0.01, 0.05, 0.1] } grid_gb_estimator = model_selection.GridSearchCV(gb_estimator, gb_grid, cv=10) grid_gb_estimator.fit(X_train1, y_train) print(grid_gb_estimator.best_score_) print(grid_gb_estimator.best_params_)
# NOTE: this fragment is Python 2 (print statement, xrange).
# Stack the "safe" and "risky" sample rows into one validation sample.
sample_validation_data = sample_validation_data_safe.append(sample_validation_data_risky)
# Bare expression: only displays something in an interactive session.
sample_validation_data

# Import functions for decision tree
from functions_decision_tree import *

# Build a depth-6 tree with the project's own implementation and print its
# prediction for every sample row.
model_5 = decision_tree_create(train_data, list(train_data.columns[1:]), 'safe_loans', 0, max_depth = 6, min_node_size = 0, min_error_reduction=0)
for i in xrange(len(sample_validation_data)):
    print "Case: " + str(i) + " Prediction: " + str(classify(model_5, sample_validation_data.iloc[i], annotate = False))
sample_validation_data[target]

# With sklearn: 5 boosted trees of depth 6, trained on every column except
# the target.
gbes = ensemble.GradientBoostingClassifier(n_estimators=5,max_depth=6)
model5 = gbes.fit(train_data.drop([target],axis=1), train_data[target])
predictions = model5.predict(sample_validation_data.drop([target],axis=1))
# Element-wise comparison; the result is not stored.
predictions == sample_validation_data[target]
# Second column of predict_proba, shown side by side with the hard predictions.
predictions_p = model5.predict_proba(sample_validation_data.drop([target],axis=1))[:,1]
np.column_stack((predictions,predictions_p))

from sklearn.metrics import accuracy_score

# Accuracy on the full validation set (result not stored).
accuracy_score(validation_data[target],model5.predict(validation_data.drop([target],axis=1)))

# false positives: prediction 1, actual -1 -> 0
# Put true labels and predictions side by side; `predictions` is rebound here.
predictions = pd.concat([pd.Series(validation_data[target],name='true').reset_index(),pd.Series(model5.predict(validation_data.drop([target],axis=1)),name='predict')],axis=1)
from sklearn import ensemble
from math import exp
import numpy
import matplotlib.pyplot as plt

# `pandas`, `train_test_split`, `log_loss` and `RandomForestClassifier` are
# not imported in this fragment -- presumably imported earlier in the file.
data = pandas.read_csv('gbm-data.csv')
y = data['Activity'].values
X = data.iloc[:, 1:].values  # every column except the first
# 80% of the rows go to the test set (test_size=0.8).
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.8, random_state=241)

x = [0.2]  #[1, 0.5, 0.3, 0.2, 0.1]
for el in x:
    f = []  # test log-loss after each boosting stage
    GBS = ensemble.GradientBoostingClassifier(n_estimators=250, verbose=True, random_state=241, learning_rate=el)
    GBS.fit(X_train, y_train)
    o = GBS.staged_decision_function(X_test)
    for i, y_pred in enumerate(o):
        # Squash the raw decision values through a sigmoid before scoring.
        y_pred = 1 / (1 + numpy.exp(-y_pred))
        ll = log_loss(y_test, y_pred)
        f.append(ll)
    plt.figure()
    plt.plot(f, 'r', linewidth=2)
    plt.show()

# Random forest scored with the same metric on the same split.
clf = RandomForestClassifier(random_state=241, n_estimators=36)
clf.fit(X_train, y_train)
ll2 = log_loss(y_test, clf.predict_proba(X_test))
# NOTE: this fragment is Python 2 (xrange).
# Scale both sets, then project onto 100 principal components fitted on the
# training set only.
trainset_minmax,testset_minmax=scs.scaling('minmax')
pca=PCA(n_components=100)
pca.fit(trainset_minmax)
trainset_minmax_new=pca.transform(trainset_minmax)
testset_minmax_new=pca.transform(testset_minmax)
dataset=trainset_minmax_new
label=scs.trainlabel
count0=label.tolist().count(0)
count1=label.tolist().count(1)
m=np.shape(dataset)[0]
# Split the rows by label.
trainset1=[dataset[i] for i in xrange(m) if label[i]==1]
trainset0=[dataset[i] for i in xrange(m) if label[i]==0]
# Keep only the first count1 label-0 rows so both labels have count1 rows.
# NOTE(review): the label vector below assumes count0 >= count1 -- confirm.
trainset=np.concatenate((trainset0[:count1],trainset1))
trainlabel=np.concatenate((np.zeros((count1,1)),np.ones((count1,1))))
samples=trainset
target=trainlabel
classifier_GB=ensemble.GradientBoostingClassifier(n_estimators=1000, max_leaf_nodes=4, max_depth= None, random_state= 2,min_samples_split= 5)
classifier_GB.fit(samples,target)
# `inX` is not defined in this fragment -- presumably set earlier in the file.
lables_test_GB=classifier_GB.predict_proba(inX)
# # Cтолбцовая диаграмма, представляющая значимость первых 20 признаков # d_first = 20 # plt.figure(figsize=(8, 8)) # plt.title("Feature importances") # plt.bar(range(d_first), importances[indices[:d_first]], align='center') # plt.xticks(range(d_first), np.array(feature_names)[indices[:d_first]], rotation=90) # plt.xlim([-1, d_first]) # plt.show() # # best_features = indices[:8] # best_features_names = feature_names[best_features] # print(best_features_names) ######################## # Метод GBT: gbt = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=11) gbt.fit(X_train, y_train) err_train = np.mean(y_train != gbt.predict(X_train)) err_test = np.mean(y_test != gbt.predict(X_test)) print(err_train, err_test) # Используем только значимые признаки # gbt = ensemble.GradientBoostingClassifier(n_estimators=100, random_state=11) # gbt.fit(X_train[best_features_names], y_train) # # err_train = np.mean(y_train != gbt.predict(X_train[best_features_names])) # err_test = np.mean(y_test != gbt.predict(X_test[best_features_names])) # print(err_train, err_test)
from sklearn import preprocessing
from sklearn.metrics import f1_score
from sklearn import ensemble
from sklearn.preprocessing import StandardScaler

# np.load opens and closes the file itself when given a path, so no file
# handle is left dangling.
X_train = np.load('X_train.npy')
Y_train = np.load('Y_train.npy')
X_test = np.load('X_test.npy')
Y_test = np.load('Y_test.npy')

scaler = StandardScaler()
X_train_scaler = scaler.fit_transform(X_train)
# Reuse the statistics learned on the training set; re-fitting on the test
# set would scale the two sets differently and leak test information.
X_test_scaler = scaler.transform(X_test)

# We create an instance of the model.
Estimator = ensemble.GradientBoostingClassifier()

# Now we use a grid-search cross-validation to explore combinations of
# parameters. 'sqrt' is what 'auto' meant for this classifier and is accepted
# by every scikit-learn release.
param_grid = {
    'n_estimators': [10, 20, 30],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [5, 10, 15],
    'max_depth': range(2, 15),
}

Grid_GBoost = GridSearchCV(Estimator, param_grid, cv=10, scoring='f1', verbose=2)
# NOTE(review): the search is fitted on the unscaled X_train; the scaled
# arrays above are not used in this fragment -- confirm this is intended.
Grid_GBoost.fit(X_train, Y_train)

# Once it has been fitted, we get several parameters.
print("ParameterGrid: ", '\n', list(ParameterGrid(param_grid)), '\n')
import sklearn.neighbors as skneib import sklearn.ensemble as skes import sklearn.tree as sktr import pickle from Prediction.ModelEstimation.getEstimationForMultipleFeatureSets import getEstimationForMultipleFeatureSets randomSeed = 15 random.seed(randomSeed) classifiers = [ skes.RandomForestClassifier(), sktr.DecisionTreeClassifier(), skes.GradientBoostingClassifier(), sklin.RidgeClassifier(), skneib.KNeighborsClassifier() ] #skes.RandomForestClassifier(n_estimators=int(random()*10+1)) tries = 0 while True: classifier = random.choice(classifiers) classifierParameter = 0 if isinstance(classifier, skes.RandomForestClassifier) or isinstance( classifier, sktr.DecisionTreeClassifier) or isinstance( classifier, skes.GradientBoostingClassifier): classifier.min_samples_leaf = random.randint(10, 100) classifierParameter = classifier.min_samples_leaf if isinstance(classifier, sklin.RidgeClassifier):
def fit(self, x, y):
    """Select the top-N feature columns using a grid-searched model.

    A ``GridSearchCV`` (5 folds, ROC-AUC scoring, refit on ROC-AUC) is run
    over ``self.parameters`` for the estimator named by ``self.method``.
    The features are then ranked by the best estimator's scores -- the
    coefficients for ``'logistic'``, ``feature_importances_`` for ``'rf'``,
    ``'adaBoost'``, ``'gbm'`` and ``'xgb'`` -- and the names of the N
    highest-scoring columns are stored in ``self.select_col`` in ascending
    score order.

    Args:
        x: DataFrame of features; its column names label the ranking.
        y: Target with a ``.values`` attribute; flattened to one dimension.

    Returns:
        None. For an unrecognised ``self.method`` nothing is fitted and
        ``self.select_col`` is left untouched.
    """
    scoring = {
        "roc": make_scorer(roc_auc_score),
    }

    # Capture the column names before switching to NumPy arrays: the array
    # returned by .values has no .columns attribute.
    columns = x.columns
    x = x.values
    y = y.values.reshape(len(y), )

    if self.method == 'logistic':
        estimator = linear_model.LogisticRegression()
    elif self.method == 'rf':
        estimator = esb.RandomForestClassifier()
    elif self.method == 'adaBoost':
        estimator = esb.AdaBoostClassifier()
    elif self.method == 'gbm':
        estimator = esb.GradientBoostingClassifier()
    elif self.method == 'xgb':
        # NOTE(review): a regressor scored with ROC-AUC -- confirm intended.
        estimator = XGBRegressor()
    else:
        return

    clf = GridSearchCV(estimator,
                       param_grid=self.parameters,
                       cv=5,
                       scoring=scoring,
                       refit='roc')
    clf.fit(x, y)
    best = clf.best_estimator_

    if self.method == 'logistic':
        # First row of coef_: one coefficient per feature.
        # NOTE(review): assumes a binary target -- confirm.
        scores = best.coef_[0]
    else:
        scores = best.feature_importances_

    # The branches disagreed on the attribute name (TopN vs Top_N); accept
    # whichever the instance defines, preferring Top_N.
    top_n = getattr(self, 'Top_N', None)
    if top_n is None:
        top_n = self.TopN

    rank = pd.DataFrame(scores, index=columns,
                        columns=['importances']).sort_values('importances')
    self.select_col = list(rank.index[-top_n:])
def train():
    """Train a gradient-boosting model and pickle it to ``model.dat``.

    Loads the four tables from ``TRAIN_DIR`` through ``util``, preprocesses
    the measurement table, trains the model with ``util.train_model`` and
    writes the result to ``VOL_DIR/model.dat``.
    """
    test_model = ensemble.GradientBoostingClassifier()
    (person_table, condition_occurrence_table, outcome_cohort_table,
     measurement_table) = util.load_data_set(TRAIN_DIR)
    measurement_table = util.preprocess_measurement(measurement_table)
    test_model = util.train_model(test_model, person_table,
                                  condition_occurrence_table,
                                  measurement_table, outcome_cohort_table)
    # Context manager so the file is flushed and closed even if dump raises.
    with open(os.path.join(VOL_DIR, 'model.dat'), 'wb') as model_file:
        pickle.dump(test_model, model_file)


# Data input
# global_auc = auc(Y, predictions) # print " auc over folds: %0.4f (+/- %0.4f)" % (np.mean(fold_aucs), np.std(fold_aucs)) # print " global auc: %0.4f" % global_auc # tock() # random forest training takes too long lol # classifier training print("Classifier training and evaluation through cross-validation") n_folds = settings['n_folds'] # clf = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=4, verbose=1) clf = ensemble.GradientBoostingClassifier(n_estimators=50, min_samples_split=2, max_features=int(np.sqrt(1000))) # Run classifier with crossvalidation and plot ROC curves cv = cross_validation.StratifiedKFold(Y, n_folds=n_folds) def auc(t, y): fpr, tpr, thresholds = metrics.roc_curve(t, y) return metrics.auc(fpr, tpr) predictions = np.zeros(Y.shape) fold_aucs = [] for i, (train, test) in enumerate(cv): # scores = clf.fit(features[train], Y[train]).decision_function(features[test]) scores = clf.fit(features[train], Y[train]).predict_proba(features[test])[:, 1]
def adaboost_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options, classification=False):
    '''
    Fit a gradient-boosting model on one fold and predict the test rows.

    Despite the name, the estimators built here are scikit-learn
    GradientBoostingRegressor / GradientBoostingClassifier. This is Python 2
    code (print statements).

    feature_sets, dim, dimsum: not used in this function.
    train, test: indexers used to select rows of X and y.
    y: targets; y[train] is flattened before fitting.
    y_all: only y_all['Target gene'] is read, to size the folds in the "bo" search.
    X: feature matrix.
    learn_options: dict of settings; keys read here include 'adaboost_version',
        'adaboost_CV', 'algorithm_hyperparam_search', 'seed' and the
        'adaboost_*' hyper-parameters.
    classification: build a classifier instead of a regressor (only honoured
        when 'adaboost_CV' is false; the grid branch asserts it is false).

    Returns (y_pred, clf): predictions for X[test] as a column vector, and the
    fitted estimator (a GridSearchCV object in the "grid" branch).

    Raises NotImplementedError if 'adaboost_version' is not 'python', and
    Exception if 'adaboost_CV' is set with an unknown search type.
    '''
    if learn_options['adaboost_version'] == 'python':
        if not learn_options['adaboost_CV']:
            # No hyper-parameter search: take every setting from learn_options.
            if not classification:
                clf = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    alpha=learn_options['adaboost_alpha'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False,
                    random_state=learn_options['seed'])
            else:
                clf = en.GradientBoostingClassifier(
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False,
                    random_state=learn_options['seed'])

            clf.fit(X[train], y[train].flatten())
            y_pred = clf.predict(X[test])[:, None]
        else:
            # optimize the parameters of the boosted algorithm
            if learn_options["algorithm_hyperparam_search"] == "bo":
                print

                from hyperopt import hp, fmin, tpe, rand

                def adaboost_scoring_bo(params):
                    # Objective handed to hyperopt's fmin: median score over a
                    # shuffled 20-fold CV of a 1000-tree regressor.
                    # label_encoder = sklearn.preprocessing.LabelEncoder()
                    # label_encoder.fit(y_all['Target gene'].values[train])
                    # gene_classes = label_encoder.transform(y_all['Target gene'].values[train])
                    # n_folds = len(np.unique(gene_classes))
                    cv = sklearn.cross_validation.KFold(
                        y_all['Target gene'].values[train].shape[0],
                        n_folds=20,
                        shuffle=True)
                    est = en.GradientBoostingRegressor(
                        n_estimators=1000,
                        learning_rate=params['learning_rate'],
                        max_depth=params['max_depth'],
                        min_samples_leaf=params['min_samples_leaf'],
                        max_features=params['max_features'],
                        random_state=learn_options['seed'])
                    scorer = cross_val_score(est,
                                             X[train],
                                             y[train].flatten(),
                                             cv=cv,
                                             n_jobs=20)
                    return np.median(scorer)

                # NOTE(review): hp.quniform presumably yields floats; confirm
                # the estimator accepts them for max_depth / min_samples_leaf.
                space = {
                    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
                    'max_depth': hp.quniform('max_depth', 1, 8, 1),
                    'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 20, 1),
                    'max_features': hp.uniform('max_features', 0.05, 1.0)
                }
                # NOTE(review): fmin minimises its objective, while the
                # objective returns a CV score -- confirm the sign is intended.
                best = fmin(adaboost_scoring_bo,
                            space,
                            algo=tpe.suggest,
                            max_evals=50,
                            verbose=1)
                print best
                # Refit on the whole training fold with the best parameters.
                clf = en.GradientBoostingRegressor(
                    n_estimators=learn_options['adaboost_n_estimators'],
                    learning_rate=best['learning_rate'],
                    max_depth=best['max_depth'],
                    min_samples_leaf=best['min_samples_leaf'],
                    max_features=best['max_features'],
                    random_state=learn_options['seed'])

                clf.fit(X[train], y[train].flatten())
            elif learn_options["algorithm_hyperparam_search"] == "grid":
                assert not classification, "need to tweak code below to do classificaton, as above"
                n_jobs = 20

                print "Adaboost with GridSearch"
                from sklearn.grid_search import GridSearchCV
                param_grid = {
                    'learning_rate': [0.1, 0.05, 0.01],
                    'max_depth': [4, 5, 6, 7],
                    'min_samples_leaf': [5, 7, 10, 12, 15],
                    'n_estimators': [100, 500, 1000, 2000]
                }
                # 'max_features': [1.0, 0.5, 0.3, 0.1]}
                # param_grid = {'n_estimators': [100, ]
                # 'learning_rate': [0.1, 0.05, 0.001],
                # 'max_depth': [4, 7],
                # 'min_samples_leaf': [5, 15],
                # 'max_features': [1.0, 0.1]}

                # label_encoder = sklearn.preprocessing.LabelEncoder()
                # label_encoder.fit(y_all['Target gene'].values[train])
                # gene_classes = label_encoder.transform(y_all['Target gene'].values[train])
                n_folds = 10  # len(np.unique(gene_classes))
                # cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True)
                cv = sklearn.cross_validation.KFold(X[train].shape[0],
                                                    n_folds=n_folds,
                                                    shuffle=True)

                est = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    random_state=learn_options['seed']
                )  #, n_estimators=learn_options['adaboost_n_estimators'])
                # Exhaustive search scored with the module-level
                # spearman_scoring callable.
                clf = GridSearchCV(est,
                                   param_grid,
                                   n_jobs=n_jobs,
                                   verbose=1,
                                   cv=cv,
                                   scoring=spearman_scoring,
                                   iid=False)
                clf.fit(X[train], y[train].flatten())
                print clf.best_params_
            else:
                raise Exception(
                    "if using adaboost_CV then need to specify grid (grid search) or bo (bayesian optimization)"
                )

            y_pred = clf.predict(X[test])[:, None]
    else:
        raise NotImplementedError

    return y_pred, clf
def DeepBoosting4(X_train,
                  y_train,
                  TrainMethod="GrowDeep",
                  n_estimators=5000000,
                  GrowDeep_max_iterPerDepthNUM=500,
                  GrowDeep_max_depthNUM=50,
                  GrowDeep_max_no_improvement=3,
                  GrowDeep_tol_no_improvement=0.00001,
                  GrowDeep_init_depth=1,
                  AllowGrowDeepRetrain=1,
                  validation_fraction=0.2,
                  n_iter_no_change=5,
                  tol=0.01,
                  tolAdjust=1,
                  LossEarlyStop="logloss",
                  random_state=0,
                  FixedDepth_max_depth=50,
                  learning_rate=0.01,
                  verbose=0,
                  CrossVali_random_state=1,
                  CrossVali_n_splits=2,
                  CrossVali_max_depth_list=[1],
                  CrossVali_n_estimators_list=[100, 250, 500, 750, 1000],
                  CrossVali_verbose=2):
    """Train a GradientBoostingClassifier with one of four strategies.

    ``TrainMethod`` selects the strategy:

    * ``"CrossValidateDepth"`` -- grid-search ``max_depth`` over
      ``CrossVali_max_depth_list``; returns the fitted ``GridSearchCV``.
    * ``"FixedDepth"`` -- one model with ``max_depth=FixedDepth_max_depth``;
      returns the fitted classifier.
    * ``"CrossValidateDepthAndNumIterations"`` -- grid-search ``max_depth``
      and ``n_estimators``; returns the fitted ``GridSearchCV``.
    * ``"GrowDeep"`` -- warm-start one classifier, adding one tree at a time
      and raising ``max_depth`` stage by stage, stopping on training-score
      criteria; returns the classifier.

    All scores used for stopping are computed on ``X_train`` / ``y_train``.

    NOTE(review): ``n_jobs`` is neither a parameter nor a local, so the two
    grid-search branches need a module-level ``n_jobs`` -- confirm one exists.
    NOTE(review): ``AllowGrowDeepRetrain`` is never read, and ``RetrainFLAG``
    / ``Retrain_iter`` are assigned but never read.
    NOTE(review): ``NEWSCORE`` is only assigned when ``LossEarlyStop`` is
    ``"accuracy"`` or ``"logloss"``.
    NOTE(review): the two list defaults are shared between calls; they are
    not mutated in this function.
    """
    if TrainMethod == "CrossValidateDepth":
        gbes_shallow = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
            random_state=random_state,
            learning_rate=learning_rate)
        param_grid = {
            'max_depth': CrossVali_max_depth_list
        }  #[1,2,3,4,5,6,7,8,9,10]
        scorers = {'accuracy_score': make_scorer(accuracy_score)}
        refit_score = 'accuracy_score'
        # NOTE(review): recent scikit-learn rejects random_state when shuffle
        # is left at False -- confirm against the installed version.
        skf = StratifiedKFold(n_splits=CrossVali_n_splits,
                              random_state=CrossVali_random_state)
        grid_searchshallow = GridSearchCV(gbes_shallow,
                                          param_grid,
                                          scoring=scorers,
                                          refit=refit_score,
                                          cv=skf,
                                          return_train_score=True,
                                          n_jobs=n_jobs,
                                          verbose=CrossVali_verbose)
        grid_searchshallow.fit(X_train, y_train)
        if verbose == 1:
            print("print(grid_searchshallow.best_params_)=",
                  grid_searchshallow.best_params_)
            print("grid_searchv.score(X_train, y_train)=",
                  grid_searchshallow.best_estimator_.score(X_train, y_train))
            print("n_estimators =",
                  grid_searchshallow.best_estimator_.n_estimators_)
        return grid_searchshallow

    if TrainMethod == "FixedDepth":
        gbes_deeptree = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators,
            max_depth=FixedDepth_max_depth,
            validation_fraction=validation_fraction,
            n_iter_no_change=n_iter_no_change,
            tol=tol,
            random_state=random_state,
            learning_rate=learning_rate)
        gbes_deeptree.fit(X_train, y_train)
        if verbose == 1:
            print("gbes_deeptree.score(X_train, y_train)=",
                  gbes_deeptree.score(X_train, y_train))
            print("n_estimators =", gbes_deeptree.n_estimators_)
        return gbes_deeptree

    if TrainMethod == "CrossValidateDepthAndNumIterations":
        # The n_estimators passed here is overridden by the grid below.
        gbes_shallowNumIters = ensemble.GradientBoostingClassifier(
            n_estimators=n_estimators,
            random_state=random_state,
            learning_rate=learning_rate)
        param_grid = {
            'max_depth': CrossVali_max_depth_list,
            'n_estimators': CrossVali_n_estimators_list
        }  #[1,2,3,4,5,6,7,8,9,10]
        scorers = {'accuracy_score': make_scorer(accuracy_score)}
        refit_score = 'accuracy_score'
        skf = StratifiedKFold(n_splits=CrossVali_n_splits,
                              random_state=CrossVali_random_state)
        gbes_searchshallowNumIters = GridSearchCV(gbes_shallowNumIters,
                                                  param_grid,
                                                  scoring=scorers,
                                                  refit=refit_score,
                                                  cv=skf,
                                                  return_train_score=True,
                                                  n_jobs=n_jobs,
                                                  verbose=CrossVali_verbose)
        gbes_searchshallowNumIters.fit(X_train, y_train)
        if verbose == 1:
            print("print(gbes_searchshallowNumIters.best_params_)=",
                  gbes_searchshallowNumIters.best_params_)
            print(
                "grid_searchvNumIters.score(X_train, y_train)=",
                gbes_searchshallowNumIters.best_estimator_.score(
                    X_train, y_train))
            print("n_estimators =",
                  gbes_searchshallowNumIters.best_estimator_.n_estimators_)
        return gbes_searchshallowNumIters

    if TrainMethod == "GrowDeep":
        TotalNumWeakLearners = 0
        # Start from a single tree; warm_start lets later fit() calls add
        # trees instead of retraining from scratch.
        gbes_grow = ensemble.GradientBoostingClassifier(
            n_estimators=1,
            max_depth=GrowDeep_init_depth,
            random_state=random_state,
            warm_start=True,
            learning_rate=learning_rate)
        gbes_grow.fit(X_train, y_train)
        if LossEarlyStop == "accuracy":
            NEWSCORE = gbes_grow.score(X_train, y_train)
        if LossEarlyStop == "logloss":
            y_pred = gbes_grow.predict_proba(X_train)
            # Stored as 1 - log_loss so that a larger value is better.
            NEWSCORE = 1 - log_loss(y_train, y_pred)
        TotalNumWeakLearners += 1
        if TotalNumWeakLearners >= n_estimators or NEWSCORE == 1.0:
            print("NEWSCORE=100%")
            return gbes_grow

        ##fit with early stop##
        # First stage (initial depth): add one tree per iteration until the
        # accumulated score gain stays below tol for n_iter_no_change checks.
        no_improvement_counter_EarlyStop = 0
        DIFFSCORERUNSUM = 0
        for iterNUM in range(GrowDeep_max_iterPerDepthNUM):
            #_ = gbes_grow.set_params(n_estimators=1, warm_start=True) # set warm_start and new params of trees
            gbes_grow.n_estimators += 1
            _ = gbes_grow.fit(X_train, y_train)  # fit additional trees to est
            TotalNumWeakLearners += 1
            # NEWSCORE here is still the score from before this fit.
            if TotalNumWeakLearners >= n_estimators or NEWSCORE == 1.0:
                print("NEWSCORE=100%")
                return gbes_grow
            OLDSCORE = NEWSCORE
            if LossEarlyStop == "accuracy":
                NEWSCORE = gbes_grow.score(X_train, y_train)
            if LossEarlyStop == "logloss":
                y_pred = gbes_grow.predict_proba(X_train)
                NEWSCORE = 1 - log_loss(y_train, y_pred)
            DIFFSCORE = NEWSCORE - OLDSCORE
            DIFFSCORERUNSUM += DIFFSCORE
            if verbose >= 2:
                print("NEWSCORE at each early stop=", NEWSCORE)
                print("DIFFSCORE at each early stop=", DIFFSCORE)
                print("DIFFSCORERUNSUM at each early stop=", DIFFSCORERUNSUM)
            if (DIFFSCORERUNSUM) > tol:
                no_improvement_counter_EarlyStop = 0  # reset this counter if there is improvement.
                DIFFSCORERUNSUM = 0
            # Checked right after the reset above, so a reset running sum (0)
            # is counted as "below tol" whenever tol > 0.
            if (DIFFSCORERUNSUM) < tol:
                no_improvement_counter_EarlyStop += 1
            if no_improvement_counter_EarlyStop == n_iter_no_change:
                break
        if verbose >= 1:
            print("n_estimators for depth" + str(1) + "=",
                  gbes_grow.n_estimators_)
            print("TotalNumWeakLearners", TotalNumWeakLearners)
            print("NEWSCORE", NEWSCORE)
        ##fit with early stop##

        #if LossDepth=="accuracy":
        #    NEWSCOREdepth=gbes_grow.score(X_train, y_train)
        #if LossDepth=="logloss":
        #    y_pred=gbes_grow.predict_proba(X_train)
        #    NEWSCOREdepth=1-log_loss(y_train, y_pred)
        # The per-depth score is always training accuracy, whatever
        # LossEarlyStop is.
        NEWSCOREdepth = gbes_grow.score(X_train, y_train)  #NEWSCORE
        # Halve the tolerance once the remaining training error drops below it.
        if tolAdjust == 1 and (1 - NEWSCOREdepth) < tol:
            tol = tol / 2
        if verbose >= 1:
            print("NEWSCOREdepth", NEWSCOREdepth)
        if NEWSCOREdepth == 1.0:
            return gbes_grow

        no_improvement_counter = 0
        RetrainFLAG = 0
        # Later stages: depth goes 2, 3, ... and each stage repeats the same
        # one-tree-at-a-time early-stop loop as above.
        for depthNUM in range(GrowDeep_max_depthNUM):
            _ = gbes_grow.set_params(
                max_depth=depthNUM + 2,
                warm_start=True)  # set warm_start and new params of trees

            ##fit with early stop##
            no_improvement_counter_EarlyStop = 0
            DIFFSCORERUNSUM = 0
            for iterNUM in range(GrowDeep_max_iterPerDepthNUM):
                #_ = gbes_grow.set_params(n_estimators=1, warm_start=True) # set warm_start and new params of trees
                gbes_grow.n_estimators += 1
                _ = gbes_grow.fit(X_train,
                                  y_train)  # fit additional trees to est
                TotalNumWeakLearners += 1
                if TotalNumWeakLearners >= n_estimators or NEWSCORE == 1.0:
                    print("NEWSCORE=100%")
                    return gbes_grow
                OLDSCORE = NEWSCORE
                if LossEarlyStop == "accuracy":
                    NEWSCORE = gbes_grow.score(X_train, y_train)
                if LossEarlyStop == "logloss":
                    y_pred = gbes_grow.predict_proba(X_train)
                    NEWSCORE = 1 - log_loss(y_train, y_pred)
                DIFFSCORE = NEWSCORE - OLDSCORE
                DIFFSCORERUNSUM += DIFFSCORE
                if (DIFFSCORERUNSUM) > tol:
                    no_improvement_counter_EarlyStop = 0  # reset this counter if there is improvement.
                    DIFFSCORERUNSUM = 0
                if (DIFFSCORERUNSUM) < tol:
                    no_improvement_counter_EarlyStop += 1
                if no_improvement_counter_EarlyStop == n_iter_no_change:
                    break
            if verbose >= 1:
                print("n_estimators for depth" + str(depthNUM + 2) + "=",
                      gbes_grow.n_estimators_)
                print("TotalNumWeakLearners", TotalNumWeakLearners)
                print("NEWSCORE", NEWSCORE)
            ##fit with early stop##

            OLDSCOREdepth = NEWSCOREdepth
            #if LossDepth=="accuracy":
            #    NEWSCOREdepth=gbes_grow.score(X_train, y_train)
            #if LossDepth=="logloss":
            #    y_pred=gbes_grow.predict_proba(X_train)
            #    NEWSCOREdepth=1-log_loss(y_train, y_pred)
            NEWSCOREdepth = gbes_grow.score(X_train, y_train)  #NEWSCORE
            if tolAdjust == 1 and (1 - NEWSCOREdepth) < tol:
                tol = tol / 2
            if verbose >= 1:
                print("NEWSCOREdepth", NEWSCOREdepth)
            if NEWSCOREdepth == 1.0:
                break
            DIFFSCOREdepth = NEWSCOREdepth - OLDSCOREdepth
            if DIFFSCOREdepth >= 0 and (
                    DIFFSCOREdepth) > GrowDeep_tol_no_improvement:
                no_improvement_counter = 0  # reset this counter if there is improvement.
            if DIFFSCOREdepth >= 0 and (
                    DIFFSCOREdepth) < GrowDeep_tol_no_improvement:
                no_improvement_counter += 1
            if no_improvement_counter == GrowDeep_max_no_improvement:
                break
            # Training accuracy dropped after deepening: stop growing.
            if DIFFSCOREdepth < 0:
                Retrain_iter = depthNUM + 1
                RetrainFLAG = 1
                break
        return gbes_grow
print('准确率:', test_ada_score) # 交叉验证 scores = model_selection.cross_val_score(ada, X_train, y_train, cv=10) # 平均准确率: 0.7965240641711229 print('平均准确率:', scores.mean()) # 随机森林 rfc = ensemble.RandomForestClassifier(n_estimators=100, random_state=66) rfc = rfc.fit(X_train, y_train) rfc_score = rfc.score(X_test, y_test) # 随机森林的准确率: 0.8026905829596412 print('随机森林的准确率:', rfc_score) # 梯度提升 gbc = ensemble.GradientBoostingClassifier(random_state=30).fit( X_train, y_train) score = gbc.score(X_test, y_test) # 梯度提升的准确率: 0.8475336322869955 print('梯度提升的准确率:', score) # adaBoost与随机森林结合 rfc = ensemble.RandomForestClassifier(n_estimators=100, random_state=88, n_jobs=-1) ada_rfc = ensemble.AdaBoostClassifier(rfc, n_estimators=100).fit( X_train, y_train) score = ada_rfc.score(X_test, y_test) # adaBoost与随机森林结合准确率: 0.7757847533632287 print('adaBoost与随机森林结合准确率:', score)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only and reuse it for the test set.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# In[877]:

# run a simple model
params = {
    'n_estimators': 20,
    'max_leaf_nodes': 6,
    'learning_rate': 0.1,
    'random_state': 1,
    'max_features': 21
}
classifier = ensemble.GradientBoostingClassifier(**params)
classifier.fit(X_train, y_train)

# In[881]:

# calculate AUC
# NOTE(review): `ensemble` is imported here but already used above --
# presumably the notebook cells were run in a different order.
from sklearn.metrics import roc_auc_score
from sklearn import ensemble
# NOTE(review): sklearn.externals.joblib is not available in recent
# scikit-learn releases -- confirm against the installed version.
from sklearn.externals import joblib
# NOTE(review): X and Y are not defined in this fragment. If X holds unscaled
# features, the classifier (fitted on scaled data) is scored on different
# inputs -- confirm. The score uses hard predictions, not probabilities, and
# the result is not stored.
roc_auc_score(Y, classifier.predict(X))

# In[ ]:

# StandardScaler(): scaling puts every variable on a comparable scale so that
# each carries a similar weight in the optimisation, instead of the result
# being driven by whichever variables have the largest magnitudes.
# Split the test frame into the target column and the eight feature columns.
yTest1 = test_region1[['Maintenance_flag']]
test_region1 = test_region1.drop(test_region1[['Maintenance_flag']], axis=1)
test_region1 = test_region1[[
    'Vibration', 'Engine_RPM', 'Speed_OBD', 'Ambient_air_temp', 'Speed_GPS',
    'Vehicle_speed_sensor', 'Throttle_Pos_Manifold', 'Mass_Air_Flow_Rate'
]]

# Share of training rows whose Maintenance_flag equals 1.
train1 = yTrain1.loc[yTrain1['Maintenance_flag'] == 1]
claimTrain1 = len(train1.Maintenance_flag) / len(yTrain1.Maintenance_flag)

Y_target_train1 = yTrain1.Maintenance_flag
Y_target_test1 = yTest1.Maintenance_flag

# NOTE(review): loss='deviance' and criterion='mse' are not accepted by
# recent scikit-learn releases -- confirm against the installed version.
gbm1 = ensemble.GradientBoostingClassifier(loss='deviance',
                                           criterion='mse',
                                           n_estimators=1500,
                                           max_leaf_nodes=5,
                                           verbose=1)
# NOTE(review): the fit target is yTrain1 itself rather than the
# Y_target_train1 column extracted above -- confirm this is intended.
fit_gbm1 = gbm1.fit(train_region1, yTrain1)
Y_predict1 = fit_gbm1.predict(test_region1)
Y_predProb1 = fit_gbm1.predict_proba(test_region1)
# Probabilities of both predict_proba columns, indexed like the test target.
Y_probab1 = pd.DataFrame({
    0: Y_predProb1[:, 0],
    1: Y_predProb1[:, 1]
}, index=Y_target_test1.index.copy())

# roc values
fpr2, tpr2, threshold2 = metrics.roc_curve(Y_target_test1, Y_probab1[1])
# Divide every feature column by its (max - min) range; fmaxs and fmins are
# computed earlier in the file, outside this fragment.
features /= (fmaxs - fmins).reshape(1, -1)
tock()

print("Do some memory cleanup")
del X
del X_downsampled
del X_specgram
del patches

# classifier training
print("Classifier training")
# clf = svm.LinearSVC(C=10e-3)
# clf = ensemble.RandomForestClassifier(n_estimators=200, n_jobs=4, verbose=1)
clf = ensemble.GradientBoostingClassifier(n_estimators=100, verbose=1)
clf.fit(features, Y)
tock()

print("Further cleanup: no longer need training data")
del features
del Y

# load test data
print("Load test data")
X_test = np.load(TEST_DATA_PATH).astype('float32')
tock()

# downsample
train_data_norm, test_data_norm = normalize(train_data, test_data) #take out the data size #add one extra column 1s at the beginning of the data train_data = train_data_norm test_data = test_data_norm iteration_list = [10, 30, 100, 300] train_err_list = [] test_err_list = [] for i in iteration_list: #training the logistic boosting logboost = ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=i) logboost.fit(train_data, train_labels.values.ravel()) #predicting train_pred = logboost.predict(train_data) test_pred = logboost.predict(test_data) #evaluate the error train_error = 1 - accuracy_score(train_labels, train_pred) test_error = 1 - accuracy_score(test_labels, test_pred) if (i == 300): plt.plot(np.arange(i) + 1, logboost.train_score_) plt.xlabel('Iterations') plt.ylabel('Train loss') plt.show()
from sklearn.datasets import fetch_openml # Load data from https://www.openml.org/d/554 X, y = fetch_openml('mnist_784', version=1, return_X_y=True) X = X / 255. from sklearn import ensemble from sklearn.kernel_approximation import Nystroem Estimators = [50, 70] Learning_rates = [0.1, 0.15, 0.20] Max_depths = [1, 3] #Leafs = [1] #feature_map_nystroem = Nystroem() #data_transformed = feature_map_nystroem.fit_transform(X) X_train, X_test = X[:60000], X[60000:] y_train, y_test = y[:60000], y[60000:] for estimate in Estimators: for lrates in Learning_rates: for max_depth in Max_depths: clf = ensemble.GradientBoostingClassifier(n_estimators=estimate, learning_rate=lrates, max_depth=max_depth) clf.fit(X_train, y_train) print("ERROR RATE FOR No_Estimators: " + str(estimate) + ", Learning rate: " + str(lrates) + ", Maximum depth: " + str(max_depth) + " is") y_pred = clf.predict(X_test) from sklearn import metrics print((1 - metrics.accuracy_score(y_test, y_pred)) * 100)
if score == float(1): sigtest.append(entry) sigtestscore.append(1.) elif score == float(0): bkgtest.append(entry) bkgtestscore.append(0.) print time.asctime(time.localtime()), "Datasets produced!" print time.asctime(time.localtime()), "Training BDT" #Train the BDT (Gradient Boosting Classifier) and save clf = ensemble.GradientBoostingClassifier(max_depth=8, n_estimators=100, learning_rate=0.008) clf.fit(full, fullscore) joblib.dump(clf, '/nfs/astrop/d6/rstein/BDTpickle/DCpixelclassifier.pkl') print time.asctime(time.localtime()), "BDT Trained" print "Score on whole training sample is", clf.score(full, fullscore) print "Score on whole test sample is", clf.score(fulltest, fulltestscore) print "Score on training signal is ", clf.score(sig, sigscore) print "Score on test signal is ", clf.score(sigtest, sigtestscore) print "Score on training background is ", clf.score(bkg, bkgscore) print "Score on test background is ", clf.score(bkgtest, bkgtestscore) importances = clf.feature_importances_
data_Y, test_size=0.33, random_state=42) ######################################################################## ######################################################################## ######################################################################## params = { 'n_estimators': 10, 'max_depth': 3, 'subsample': 0.5, 'learning_rate': 0.89, 'min_samples_leaf': 1, 'random_state': 5 } clf1 = ensemble.GradientBoostingClassifier(**params) clf2 = BernoulliNB() clf3 = DecisionTreeClassifier(random_state=0) clf4 = svm.SVC(kernel='rbf', probability=True) clf5 = SGDClassifier(loss="modified_huber", penalty='l1') clf6 = RandomForestClassifier(n_estimators=9) clf7 = ensemble.AdaBoostClassifier() clf8 = svm.SVC(kernel='linear', probability=True) clf9 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(150, 50, 15, 5, 3), random_state=1) clf10 = neighbors.KNeighborsClassifier(n_neighbors=5) clf11 = GaussianNB() clf12 = LinearDiscriminantAnalysis() clf13 = QuadraticDiscriminantAnalysis()
def gbdt(x, y, n_estimators=100, learning_rate=0.1, max_depth=5, random_state=0):
    """Fit a gradient boosting classifier on ``x`` / ``y`` and return it.

    The hyper-parameters used to be hard-coded; they are now keyword
    parameters whose defaults are the original values, so existing
    ``gbdt(x, y)`` calls behave exactly as before.

    Args:
        x: Training features, passed straight to ``fit``.
        y: Training targets, passed straight to ``fit``.
        n_estimators: Number of boosting stages.
        learning_rate: Shrinkage applied to each stage's contribution.
        max_depth: Maximum depth of the individual trees.
        random_state: Seed forwarded to the classifier.

    Returns:
        The fitted ``GradientBoostingClassifier`` instance.
    """
    clf = ensemble.GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        random_state=random_state,
    )
    clf.fit(x, y)
    return clf
# Restrict the train and test sets to the feature subset returned by
# get_top_n_features (that helper is defined outside this view).
feature_to_pick = 250
feature_top_n = get_top_n_features(titanic_train_data_X, titanic_train_data_Y, feature_to_pick)
print('Total Feature:' + str(combined_train_test.shape))
print('Picked Feature' + str(feature_top_n.shape))

titanic_train_data_X = titanic_train_data_X[feature_top_n]
# NOTE(review): this assumes 'Ticket_Number' is among the selected features;
# the del raises KeyError otherwise - confirm.
del titanic_train_data_X['Ticket_Number']
titanic_test_data_X = titanic_test_data_X[feature_top_n]
del titanic_test_data_X['Ticket_Number']

# 14. Build the models
rf_est = ensemble.RandomForestClassifier(n_estimators=750, criterion='gini', max_features='sqrt', max_depth=3, min_samples_split=4, min_samples_leaf=2, n_jobs=50, random_state=42, verbose=1)
gbm_est = ensemble.GradientBoostingClassifier(n_estimators=900, learning_rate=0.0008, loss='exponential', min_samples_split=3, min_samples_leaf=2, max_features='sqrt', max_depth=3, random_state=42, verbose=1)
et_est = ensemble.ExtraTreesClassifier(n_estimators=750, max_features='sqrt', max_depth=35, n_jobs=50, criterion='entropy', random_state=42, verbose=1)

# Soft-voting ensemble over the three estimators, weighted 3:5:2
# (random forest : gradient boosting : extra trees).
voting_est = ensemble.VotingClassifier(estimators=[('rf', rf_est), ('gbm', gbm_est), ('et', et_est)], voting='soft', weights=[3, 5, 2], n_jobs=50)
voting_est.fit(titanic_train_data_X, titanic_train_data_Y)
# The score printed here is computed on the training data itself.
print('VotingClassifier Score:' + str(voting_est.score(titanic_train_data_X, titanic_train_data_Y)))
print('VotingClassifier Estimators:' + str(voting_est.estimators_))

# Predict
titanic_test_data_X['Survived'] = voting_est.predict(titanic_test_data_X)
# The statement below continues past the end of this view.
submission = pd.DataFrame({'PassengerId': test_data_org.loc[:, 'PassengerId'],
# Build the per-URL feature table in a single pass.
# Each getFeatures(url, label) call yields one row. The rows are collected
# first and the frame is constructed once, instead of enlarging it with
# ``featureSet.loc[i] = ...`` on every iteration, which reallocates per row.
# Iterating the two columns directly also removes the reliance on ``df``
# having a default 0..n-1 index. Column dtypes are inferred from the rows.
# "Lable" is the column name as spelled in the input data.
featureSet = pd.DataFrame(
    [getFeatures(url, label) for url, label in zip(df["URL"], df["Lable"])],
    columns=['url', 'no of dots', 'presence of hyphen', 'len of url', 'presence of at',
             'presence of double slash', 'no of subdir', 'no of subdomain', 'len of domain',
             'no of queries', 'is IP', 'presence of Suspicious_TLD',
             'presence of suspicious domain', 'label'],
)

# Class counts per label; the result is discarded (kept from the original).
featureSet.groupby(featureSet['label']).size()

# Feature matrix: every column except the raw URL and the target.
X = featureSet.drop(['url', 'label'], axis=1).values
y = featureSet['label'].values

# Candidate classifiers, keyed by a display name.
model = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ek.RandomForestClassifier(n_estimators=50),
    "Adaboost": ek.AdaBoostClassifier(n_estimators=50),
    "GradientBoosting": ek.GradientBoostingClassifier(n_estimators=50),
    "GNB": GaussianNB(),
    "LogisticRegression": LogisticRegression()
}

# Hold out 20% of the rows for scoring (split is unseeded, as before).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit every candidate and record its score on the held-out split.
results = {}
for algo, clf in model.items():
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    results[algo] = score

# Keep the best-scoring fitted estimator as ``clf``.
winner = max(results, key=results.get)
clf = model[winner]
# NOTE(review): this append presumably belongs to a loop over input rows whose
# header is outside this view - confirm the indentation.
pred_instance_name_arr.append([row[0].strip()])

# Stack the collected prediction-set names and features into arrays.
pred_data_instance_names = vstack(pred_instance_name_arr)
pred_data_features = vstack(pred_instance_feature_arr)
# print(pred_data_instance_names)
# print(pred_data_features)

# ---------------
# Stack the training rows; the class array is flattened to 1-D with ravel().
train_data = vstack(data_arr)
instance_name = vstack(instance_name_arr)
instance_class = vstack(instance_class_arr).ravel()
# print(instance_name)
# print(instance_class)

# Create a gradient boost classifier object (default hyper-parameters)
gboostT = ensemble.GradientBoostingClassifier()

# Evaluate model with cross-validation: 10 stratified folds, repeated 3 times.
# error_score='raise' makes a failing fit raise instead of recording a score.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3)
n_scores = cross_val_score(gboostT, train_data, instance_class, scoring='accuracy', cv=cv, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# Create a second gradient boost classifier object with explicit settings;
# this rebinds gboostT, replacing the default-configured instance above.
gboostT = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=12)

# Train and measure model performance
# NOTE(review): the score is computed on the same data the model was fitted on.
score = gboostT.fit(train_data, instance_class).score(train_data, instance_class)
print("Score: ", score)

# Start of a triple-quoted block that continues past the end of this view.
""" score = gboostT.fit(train_data, instance_class) # predict hte response for data2
def ecologyGBM(ip, port):
    """Compare H2O's GBM with scikit-learn's on the ecology dataset.

    Both models are trained on ``ecology_model.csv`` with the same number of
    trees, tree depth, learning rate and minimum leaf size, then scored by
    AUC on ``ecology_eval.csv``. The function asserts that the H2O AUC is at
    least as large as the scikit-learn AUC.

    ``ip`` and ``port`` are accepted but not used in this body.
    """
    response_name = "Angaus"
    predictor_names = [
        "SegSumT", "SegTSeas", "SegLowFlow", "DSDist", "DSMaxSlope", "USAvgT",
        "USRainDays", "USSlope", "USNative", "DSDam", "Method", "LocSed",
    ]

    # Hyper-parameters shared by both implementations.
    ntrees, max_depth, min_rows, learn_rate = 100, 5, 10, 0.1

    # Training data as an H2O frame.
    ecology_train = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/ecology_model.csv"))

    # The same training file loaded with NumPy for scikit-learn; the training
    # file has a leading "Site" column that the evaluation file lacks.
    train_table = np.genfromtxt(
        h2o.locate("smalldata/gbm_test/ecology_model.csv"),
        delimiter=',',
        dtype=None,
        names=tuple(["Site", response_name] + predictor_names),
        skip_header=1,
        missing_values='NA',
        filling_values=np.nan)
    train_response = train_table[response_name]
    train_features = train_table[predictor_names]

    # Convert the H2O response column to a factor before training the
    # bernoulli GBM.
    ecology_train[response_name] = ecology_train[response_name].asfactor()

    # H2O model.
    gbm_h2o = h2o.gbm(x=ecology_train[2:],
                      y=ecology_train[response_name],
                      ntrees=ntrees,
                      learn_rate=learn_rate,
                      max_depth=max_depth,
                      min_rows=min_rows,
                      distribution="bernoulli")

    # scikit-learn model with the matching settings.
    gbm_sci = ensemble.GradientBoostingClassifier(
        learning_rate=learn_rate,
        n_estimators=ntrees,
        max_depth=max_depth,
        min_samples_leaf=min_rows,
        max_features=None)
    gbm_sci.fit(train_features[:, np.newaxis], train_response)

    # Evaluation data, again once per library.
    ecology_test = h2o.import_file(
        path=h2o.locate("smalldata/gbm_test/ecology_eval.csv"))
    test_table = np.genfromtxt(
        h2o.locate("smalldata/gbm_test/ecology_eval.csv"),
        delimiter=',',
        dtype=None,
        names=tuple([response_name] + predictor_names),
        skip_header=1,
        missing_values='NA',
        filling_values=np.nan)
    test_response = test_table[response_name]
    test_features = test_table[predictor_names]

    # AUC of the scikit-learn model from its predicted probability for the
    # second class.
    sci_scores = gbm_sci.predict_proba(test_features[:, np.newaxis])[:, 1]
    auc_sci = roc_auc_score(test_response, sci_scores)

    # AUC of the H2O model on the evaluation frame.
    auc_h2o = gbm_h2o.model_performance(ecology_test).auc()

    assert auc_h2o >= auc_sci, "h2o (auc) performance degradation, with respect to scikit"
# Registry of candidate models, keyed by a "<family>:<configuration>" label.
# Every estimator that accepts a seed is given random_state=0.
all_models = {
    "1Dummy:Majority": dummy.DummyClassifier(strategy='most_frequent'),
    "1Dummy:Stratified": dummy.DummyClassifier(random_state=0),
    "DT:default": tree.DecisionTreeClassifier(
        random_state=0
    ),  # TODO Let's start like that. We will configure later.
    ##"DT-": tree.DecisionTreeClassifier(),  # TODO Let's start like that. We will configure later.
    "RF:default": ensemble.RandomForestClassifier(n_estimators=10, random_state=0),
    "GBT:default": ensemble.GradientBoostingClassifier(random_state=0),
    "LR:default": linear_model.LogisticRegression(solver='liblinear', multi_class='ovr', random_state=0),
}

# ==================== GRID SEARCH ====================================
# ======== Logistic regression
# Values passed as the C argument of LogisticRegression, from 1e-6 to 600.
regularization_pars = (1e-6, 3e-6, 6e-6, 1e-5, 3e-5, 6e-5, 1e-4, 3e-4, 6e-4, 1e-3, 3e-3, 6e-3, 1e-2, 3e-2, 6e-2, 1e-1, 3e-1, 6e-1, 1, 3, 6, 10, 30, 60, 100, 300, 600)

# Register one logistic-regression model per (penalty, C) pair under the key
# "LR:<penalty>-<C>". The constructor call continues past the end of this view.
for penalty in ('l2', 'l1'):
    for reg in regularization_pars:
        all_models["LR:"+penalty+"-"+str(reg)] = \
            linear_model.LogisticRegression(solver='liblinear', multi_class='ovr', random_state=0, C=reg,
# NOTE(review): the statements down to f.close() appear to be the tail of a
# function (presumably predictificate, which is called below) whose header is
# outside this view - confirm the indentation before relying on this layout.
# Reduce each zipped group of values from res to a single value with
# most_common, then write one "ID,Category" row per prediction (IDs start at 1).
pred = [most_common(x) for x in zip(*res)]
f = open('predictions.csv', 'w')
f.write("ID,Category\n")
# Note: the loop variable rebinds res, which held the zipped input above.
for i, res in enumerate(pred):
    f.write("%d,%d\n" % (i + 1, res))
f.close()

train = np.load('train.npy')
# Remove the labels
test = np.load('test_distribute.npy')[:, 1:]

# Column 0 of the training array is used as the target; the rest are features.
data = train[:, 1:]
target = train[:, 0]

clfs = []
# Through cv testing, I found the optimal number of estimators to be 15
# NOTE(review): none of the ensembles below uses 15 estimators (100, 125 and
# 100 are passed) - the comment above may be stale.
clfs.append(ensemble.ExtraTreesClassifier(n_estimators=100))
clfs.append(ensemble.GradientBoostingClassifier(n_estimators=125))
clfs.append(ensemble.AdaBoostClassifier(n_estimators=100))

predictificate(data, target, test, clfs)

# I use the following code to find good hyperparameter values
#scores = cross_validation.cross_val_score(
#clf, data, target, cv=5)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))