def CheckAccuracy(x, g_z, data_cols, label_cols=[], seed=0, with_class=False, data_dim=2):
    import numpy as np
    from sklearn.ensemble import GradientBoostingClassifier as gbm

    # Use half of each real and generated set for training
    dtrain = np.vstack([x[:int(len(x) / 2)], g_z[:int(len(g_z) / 2)]])
    # Synthetic labels: 0 = real, 1 = generated
    dlabels = np.hstack([np.zeros(int(len(x) / 2)), np.ones(int(len(g_z) / 2))])
    # Use the other half of each set for testing
    dtest = np.vstack([x[int(len(x) / 2):], g_z[int(len(g_z) / 2):]])
    # Test labels match the training labels, assuming even batch sizes
    y_true = dlabels

    clf = gbm()
    clf = clf.fit(dtrain, dlabels)
    y_pred = clf.predict(dtest)
    return SimpleAccuracy(y_pred, y_true)
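# SimpleAccuracy is not defined in the snippet above; a minimal sketch of the
# helper it presumably calls (the name and signature are assumptions):
import numpy as np

def SimpleAccuracy(y_pred, y_true):
    # Fraction of predictions that match the true labels.
    return np.mean(np.asarray(y_pred) == np.asarray(y_true))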
def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
             subsample=1.0, criterion='friedman_mse', min_samples_split=2,
             min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
             min_impurity_decrease=0.0, min_impurity_split=None, init=None,
             random_state=None, max_features=None, verbose=0,
             max_leaf_nodes=None, warm_start=False, validation_fraction=0.1,
             n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0,
             metric='accuracy'):
    Classifier.__init__(self, warm_start=warm_start, metric=metric,
                        random_state=random_state, verbose=verbose)
    # Wrap a scikit-learn GradientBoostingClassifier, forwarding every
    # hyperparameter except the base-class-only `metric`.
    self.gb = gbm(loss=loss, learning_rate=learning_rate,
                  n_estimators=n_estimators, subsample=subsample,
                  criterion=criterion, min_samples_split=min_samples_split,
                  min_samples_leaf=min_samples_leaf,
                  min_weight_fraction_leaf=min_weight_fraction_leaf,
                  max_depth=max_depth,
                  min_impurity_decrease=min_impurity_decrease,
                  min_impurity_split=min_impurity_split, init=init,
                  random_state=random_state, max_features=max_features,
                  verbose=verbose, max_leaf_nodes=max_leaf_nodes,
                  warm_start=warm_start,
                  validation_fraction=validation_fraction,
                  n_iter_no_change=n_iter_no_change, tol=tol,
                  ccp_alpha=ccp_alpha)
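# The snippet stops after __init__; fit/predict presumably delegate to the
# wrapped estimator. A hedged sketch of what that delegation could look like
# (method names and bodies are assumptions, not the source's code):
def fit(self, X, y):
    self.gb.fit(X, y)
    return self

def predict(self, X):
    return self.gb.predict(X)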
import pandas as pd

def implist(model, features, ascending=False):
    # Table of features ranked by Gini importance.
    # (The function header was cut off in the excerpt; reconstructed from the
    # calls below.)
    l = pd.DataFrame({"features": features})
    l["importance"] = model.feature_importances_
    l.sort_values(by="importance", axis=0, inplace=True, ascending=ascending)
    return l

def impplot(model, features, ascending=False, n=10):
    l = implist(model, features, ascending)
    lsub = l.iloc[0:n, ].copy()  # keep the top n rows (0:n-1 dropped the nth)
    lsub.sort_values(by="importance", axis=0, inplace=True, ascending=True)
    ax = lsub.plot(kind="barh", x="features", y="importance")
    ax.set_ylabel("Features")
    ax.set_xlabel("Gini Importance")
    ax.set_title("Top " + str(n) + " Features")

b1 = gbm(learning_rate=0.1, n_estimators=50, max_depth=1, criterion="mse",
         min_samples_leaf=10, random_state=5)
b1.fit(letters_train_X, letters_train_Y)
implist(b1, letters_train_X.columns)
impplot(b1, letters_train_X.columns)

"""
Variable 12 is by far the most important. The order of importances is similar
to the one produced by R, but not identical; it is unclear why. The ordering
does not change in Python when random_state changes, whereas it does change in
R with a different seed. R may be measuring variable importance differently
than Python (e.g. Gini vs. deviance).
"""

"""
a. Build a cross-tabulation of the predicted and actual letters
   (a 26 x 26 confusion matrix).
"""
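# A hedged sketch for part (a): pd.crosstab builds the 26 x 26 confusion
# matrix directly. `letters_test_X` / `letters_test_Y` are assumed to exist
# alongside the training frames used above.
pred = b1.predict(letters_test_X)
conf = pd.crosstab(letters_test_Y, pred,
                   rownames=["actual"], colnames=["predicted"])
print(conf)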
# tail_strength=0.5, noise=1, shuffle=True,
# coef=False, random_state=42)
X = pd.DataFrame(X)
y = pd.DataFrame(y)
X.to_csv('X.csv')
y.to_csv('y.csv')

params = {
    'max_depth': ('range', 'integer', [1, 100]),
    'subsample': ('range', 'float', [0.25, 1]),
    'min_samples_split': ('range', 'integer', [2, 100]),
    'min_samples_leaf': ('range', 'integer', [1, 100])
}

model = gbm(n_estimators=100, random_state=42)
#mod = lm().fit(X, y)
#mod.summary()
cur_rsm = RSMSearchCV(model, params,
                      cv_iterations=5,
                      surface_metric='AUC',
                      cv_args={'print_': False, 'metrics': ['AUC']},
                      max_iterations=1000,
                      interpolate_range=3,
                      evaluate_every=5,
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  # noqa: F401, must precede IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier as gbm

from deployment_pima.processing import preprocessors as preproc
from deployment_pima.config import config

import logging

_logger = logging.getLogger(__name__)

pima_pipeline = Pipeline([
    ('nan_imputer', preproc.MissingNaNImputer(columns=config.IMPUTE_FEATURES)),
    ('iterative_imputer', IterativeImputer(max_iter=config.IMPUTE_ITER,
                                           random_state=config.SEED)),
    ('standard_scaler', StandardScaler()),
    ('gbm', gbm(**config.MODEL_HYP))
])
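# A hedged usage sketch: fit the pipeline on a training frame and persist it.
# The CSV path and 'Outcome' target column are assumptions; only
# `pima_pipeline` itself comes from the snippet above.
import joblib
import pandas as pd

df = pd.read_csv('pima.csv')                      # assumed training data
X, y = df.drop('Outcome', axis=1), df['Outcome']  # assumed target column
pima_pipeline.fit(X, y)
joblib.dump(pima_pipeline, 'pima_pipeline.pkl')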
def train(self, a, b):
    # Fit a gradient boosting classifier on features `a` and labels `b`.
    self.model = gbm().fit(a, b)
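# The class this method belongs to is not shown; a hedged companion method for
# inference, assuming the same `self.model` attribute:
def predict(self, a):
    return self.model.predict(a)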
# Estimate the parameters with 3-fold grid search
grid = GSCV(rfc(), p_rf)
grid.fit(data_tr, classes_tr)

# Accuracy
rf = rfc(n_estimators=grid.best_params_['n_estimators'],
         max_features=grid.best_params_['max_features'])
rf.fit(data_tr_pca, classes_tr)
acc = rf.score(data_te_pca, classes_te)
acc_mean[3] += acc / 5

# Gradient Boosting Machine
# Estimate the parameters with 3-fold grid search
grid = GSCV(gbm(max_depth=5), p_gbm)
grid.fit(data_tr, classes_tr)

# Accuracy
gb = gbm(learning_rate=grid.best_params_['learning_rate'],
         n_estimators=grid.best_params_['n_estimators'])
gb.fit(data_tr_pca, classes_tr)
acc = gb.score(data_te_pca, classes_te)
acc_mean[4] += acc / 5

for i in range(len(acc_mean)):  # iterate indices, not values
    acc_mean[i] = np.round(acc_mean[i] * 100, 2)

# Show the accuracy of each method
print('Accuracy of each method:')
print('K-Nearest Neighbors - ' + str(acc_mean[0]) + ' %')
print("Accuracy on the training subset: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on the test subset: {:.3f}".format(tree.score(X_test, y_test)))

from sklearn.ensemble import GradientBoostingClassifier as gbm

original_params = {'n_estimators': 50, 'random_state': 2}

plt.figure()
for label, color, setting in [('Depth 2, lr = 1.0', 'turquoise', {'learning_rate': 1.0, 'max_depth': 2}),
                              ('Depth 4, lr = 1.0', 'cadetblue', {'learning_rate': 1.0, 'max_depth': 4}),
                              ('Depth 6, lr = 1.0', 'blue', {'learning_rate': 1.0, 'max_depth': 6}),
                              ('Depth 2, lr = 0.1', 'orange', {'learning_rate': 0.1, 'max_depth': 2}),
                              ('Depth 4, lr = 0.1', 'red', {'learning_rate': 0.1, 'max_depth': 4}),  # was max_depth 6, mismatching the label
                              ('Depth 6, lr = 0.1', 'purple', {'learning_rate': 0.1, 'max_depth': 6})]:
    params = dict(original_params)
    params.update(setting)
    clf = gbm(**params)
    clf.fit(X_train, y_train)

    # Compute test-set AUC after each boosting iteration
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(clf.staged_predict_proba(X_test)):
        test_deviance[i] = metrics.roc_auc_score(y_test, y_pred[:, 1])

    # Plot test AUC vs. number of boosting iterations
    plt.plot((np.arange(test_deviance.shape[0]) + 1), test_deviance, '-',
             color=color, label=label)

plt.legend(loc='lower right')
plt.ylim(0.90, 1.0)
plt.xlabel('Boosting Iterations')
plt.ylabel('validation auc')
plt.figure(figsize=(12, 12))
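# A related sketch (not in the original): instead of tracing staged AUC by
# hand, GradientBoostingClassifier supports built-in early stopping via
# validation_fraction / n_iter_no_change; the values below are illustrative.
clf_es = gbm(n_estimators=500, learning_rate=0.1,
             validation_fraction=0.1, n_iter_no_change=10, tol=1e-4,
             random_state=2)
clf_es.fit(X_train, y_train)
print('Trees actually fit:', clf_es.n_estimators_)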
train_sub1['dep'] = [1 if x == 'Borehole or tubewell'
                     else 2 if x == 'Unprotected dug well'
                     else 3 if x == 'Protected dug well'
                     else 4 if x == 'Protected spring'
                     else 5 if x == 'Public tap or standpipe'
                     else 6 if x == 'Rainwater'
                     else 7 if x == 'Piped into public tap or basin'
                     else 8 if x == 'Surface water'
                     else 9 if x == 'Null'
                     else 10 if x == 'Unprotected spring'
                     else 11
                     for x in train_sub1.iloc[:, 0]]

train_sub2 = train_sub1.drop(['Water Source Type'], axis=1)
train_sub2

train_sub3 = train_sub2.loc[:, ('well', 'spring', 'borehole', 'tube', 'pump',
                                'rainwater', 'public', 'communal', 'pumps',
                                'protected', 'unprotected', 'gravity', 'dams',
                                'dam', 'stand', 'phe', 'none', 'spot', 'piped',
                                'boreholes', 'rain', 'ii', 'eg', 'surface',
                                'of', 'not', 'reservoir', 'elevated', 'roof',
                                'no', 'pw', 'other', 'handpump', 'kiosk',
                                'dep')]
train_sub3
train_sub3.head(1000)

# GBM
from sklearn.ensemble import GradientBoostingClassifier as gbm
from sklearn.model_selection import KFold  # sklearn.cross_validation is long deprecated

clf = gbm(loss='deviance', learning_rate=0.01, n_estimators=1000,
          subsample=0.75, verbose=1)
clf.fit(train_sub3.drop('dep', axis=1), train_sub3['dep'])
print("Score: ", clf.score(train_sub3.drop('dep', axis=1), train_sub3['dep']))
print("Feature importances: ", clf.feature_importances_)

# Classification
import numpy as np
import pandas as pd

# Train data
train = pd.read_csv("/Users/ankur/Documents/Competitions/Top coder2/train.csv")
train.head()
train.shape
train.iloc[:, 3].unique()
train.iloc[:, 3].value_counts()
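# A hedged alternative to the nested conditional expression above: a dict plus
# Series.map is easier to audit and extend. The category strings come from the
# snippet; the helper name and fallback code 11 mirror its `else 11` branch.
source_codes = {
    'Borehole or tubewell': 1, 'Unprotected dug well': 2,
    'Protected dug well': 3, 'Protected spring': 4,
    'Public tap or standpipe': 5, 'Rainwater': 6,
    'Piped into public tap or basin': 7, 'Surface water': 8,
    'Null': 9, 'Unprotected spring': 10,
}
train_sub1['dep'] = (train_sub1.iloc[:, 0].map(source_codes)
                     .fillna(11).astype(int))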