def CheckAccuracy(x,
                  g_z,
                  data_cols,
                  label_cols=None,
                  seed=0,
                  with_class=False,
                  data_dim=2):
    """Score how well a classifier can tell real rows from generated rows.

    The first half of ``x`` (labelled 0) and the first half of ``g_z``
    (labelled 1) are stacked to train a default gradient-boosting classifier.
    The remaining rows of both inputs form the test set, and the predictions
    on it are scored with ``SimpleAccuracy``.

    Args:
        x: real samples; sliced by row and stacked with ``np.vstack``.
        g_z: generated samples; must be stackable with ``x``.
        data_cols, label_cols, seed, with_class, data_dim: accepted for
            interface compatibility but not used by this implementation.
            NOTE(review): ``seed`` is not forwarded to the classifier, so
            results are not reproducible between calls - confirm intent.

    Returns:
        Whatever ``SimpleAccuracy(y_pred, y_true)`` returns.
    """
    from sklearn.ensemble import GradientBoostingClassifier as gbm

    half_real = len(x) // 2
    half_fake = len(g_z) // 2

    # Use the first half of each real and generated set for training.
    dtrain = np.vstack([x[:half_real], g_z[:half_fake]])
    dlabels = np.hstack([np.zeros(half_real), np.ones(half_fake)])

    # Use the other half of each set for testing.
    dtest = np.vstack([x[half_real:], g_z[half_fake:]])
    # Build the test labels from the actual test-half sizes: with an odd
    # number of rows the second half is one row longer than the first, so
    # reusing the training labels would misalign labels and predictions.
    y_true = np.hstack([np.zeros(len(x) - half_real),
                        np.ones(len(g_z) - half_fake)])

    clf = gbm()
    clf.fit(dtrain, dlabels)
    y_pred = clf.predict(dtest)
    return SimpleAccuracy(y_pred, y_true)
Exemple #2
0
	def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
				subsample=1.0, criterion='friedman_mse', min_samples_split=2,
				min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3,
				min_impurity_decrease=0.0, min_impurity_split=None, init=None,
				random_state=None, max_features=None, verbose=0, max_leaf_nodes=None,
				warm_start=False, validation_fraction=0.1, n_iter_no_change=None,
				tol=0.0001, ccp_alpha=0.0, metric='accuracy'):
		"""Initialise the base classifier and build the wrapped ``gbm`` estimator.

		``warm_start``, ``metric``, ``random_state`` and ``verbose`` are handed to
		``Classifier.__init__``. Every argument except ``metric`` is forwarded
		unchanged, by keyword, to ``gbm``; the resulting object is stored on
		``self.gb``.
		"""
		Classifier.__init__(
			self,
			warm_start=warm_start,
			metric=metric,
			random_state=random_state,
			verbose=verbose,
		)
		# Collect the estimator options once so the forwarding is easy to audit.
		booster_options = dict(
			loss=loss,
			learning_rate=learning_rate,
			n_estimators=n_estimators,
			subsample=subsample,
			criterion=criterion,
			min_samples_split=min_samples_split,
			min_samples_leaf=min_samples_leaf,
			min_weight_fraction_leaf=min_weight_fraction_leaf,
			max_depth=max_depth,
			min_impurity_decrease=min_impurity_decrease,
			min_impurity_split=min_impurity_split,
			init=init,
			random_state=random_state,
			max_features=max_features,
			verbose=verbose,
			max_leaf_nodes=max_leaf_nodes,
			warm_start=warm_start,
			validation_fraction=validation_fraction,
			n_iter_no_change=n_iter_no_change,
			tol=tol,
			ccp_alpha=ccp_alpha,
		)
		self.gb = gbm(**booster_options)
    l["importance"] = model.feature_importances_
    l.sort_values(by="importance",axis=0,inplace=True,ascending=ascending)
    
    
    return l

def impplot(model,features,ascending=False,n=10):
    """Plot the first ``n`` rows of the importance table as horizontal bars.

    Args:
        model: fitted estimator, passed through to ``implist``.
        features: feature names, passed through to ``implist``.
        ascending: sort direction handed to ``implist``; with the default
            ``False`` the first rows are the most important features.
        n: number of rows to plot (also used in the chart title).

    Returns:
        None. The chart is drawn through the DataFrame ``plot`` call.
    """
    ranked = implist(model, features, ascending)
    # The end bound of iloc is exclusive, so ``[:n]`` yields exactly n rows.
    # The previous ``0:n-1`` slice dropped one row and contradicted the
    # "Top n Features" title.
    top_rows = ranked.iloc[:n].copy()
    # Re-sort the selected rows ascending before plotting so the bars are
    # drawn in increasing order of importance.
    top_rows.sort_values(by="importance", axis=0, inplace=True, ascending=True)
    ax = top_rows.plot(kind="barh", x="features", y="importance")
    ax.set_ylabel("Features")
    ax.set_xlabel("Gini Importance")
    ax.set_title("Top " + str(n) + " Features")
    
# Boosted model built from depth-1 trees (50 rounds, learning rate 0.1),
# fitted on the letters training set with a fixed random_state.
b1 = gbm(
    learning_rate=0.1,
    n_estimators=50,
    max_depth=1,
    criterion="mse",
    min_samples_leaf=10,
    random_state=5,
)
b1.fit(letters_train_X, letters_train_Y)

# Build the importance table, then draw the importance chart for the same model.
implist(b1, letters_train_X.columns)
impplot(b1, letters_train_X.columns)

"""
Variable 12 is by far the most important. The ordering of the importances is similar to the one produced by R, but not identical.
It is unclear why the two differ. In Python, the ordering of the variable importances does not change when the random_state
parameter changes, whereas in R it does change when the seed is different.

R may be measuring variable importance differently than Python (could be Gini vs Deviance)
"""

"""
a. Build a cross-tabulation of the predicted and actual letters (a 26 X 26 confusion matrix).
#                        tail_strength=0.5, noise=1, shuffle=True,
#                        coef=False, random_state=42)
# Wrap the features and the target in DataFrames, then dump both to CSV files
# in the current working directory.
X = pd.DataFrame(X)
y = pd.DataFrame(y)

X.to_csv('X.csv')
y.to_csv('y.csv')

# Search space, one entry per hyper-parameter. Each value is a 3-tuple whose
# last element is a two-number list.
# NOTE(review): presumably (kind, dtype, [low, high]) - confirm against the
# search class that consumes `params`.
params = dict(
    max_depth=('range', 'integer', [1, 100]),
    subsample=('range', 'float', [0.25, 1]),
    min_samples_split=('range', 'integer', [2, 100]),
    min_samples_leaf=('range', 'integer', [1, 100]),
)

# Base estimator to tune: 100 boosting rounds with a fixed random_state.
model = gbm(random_state=42, n_estimators=100)

#mod = lm().fit(X, y)
#mod.summary()

cur_rsm = RSMSearchCV(model,
                      params,
                      cv_iterations=5,
                      surface_metric='AUC',
                      cv_args={
                          'print_': False,
                          'metrics': ['AUC']
                      },
                      max_iterations=1000,
                      interpolate_range=3,
                      evaluate_every=5,
from sklearn.pipeline import Pipeline
from deployment_pima.processing import preprocessors as preproc
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier as gbm
from deployment_pima.config import config

import logging
_logger = logging.getLogger(__name__)

# Four-step pipeline, run in the order listed:
#   1. project-specific NaN imputer over config.IMPUTE_FEATURES
#   2. IterativeImputer configured from config.IMPUTE_ITER and config.SEED
#   3. StandardScaler
#   4. gradient-boosting classifier built from config.MODEL_HYP
pima_pipeline = Pipeline(steps=[
    (
        'nan_imputer',
        preproc.MissingNaNImputer(columns=config.IMPUTE_FEATURES),
    ),
    (
        'iterative_imputer',
        IterativeImputer(max_iter=config.IMPUTE_ITER,
                         random_state=config.SEED),
    ),
    ('standard_scaler', StandardScaler()),
    ('gbm', gbm(**config.MODEL_HYP)),
])
Exemple #6
0
 def train(self, a, b):
     """Fit a default-configured ``gbm`` on ``(a, b)`` and store the result on ``self.model``."""
     booster = gbm()
     # Keep whatever ``fit`` returns, exactly as the one-line form did.
     self.model = booster.fit(a, b)
Exemple #7
0
    # Random forest: estimate the hyper-parameters by grid search
    # (the original comment says 3-fold; the fold count is not set in this call).
    # NOTE(review): the search is fitted on data_tr while the final model below
    # is fitted on data_tr_pca - confirm that the mismatch is intended.
    grid = GSCV(rfc(), p_rf)
    grid.fit(data_tr, classes_tr)

    # Accuracy: refit with the best parameters and score on the held-out data.
    rf = rfc(n_estimators=grid.best_params_['n_estimators'],
             max_features=grid.best_params_['max_features'])
    rf.fit(data_tr_pca, classes_tr)
    acc = rf.score(data_te_pca, classes_te)
    # Accumulate one fifth of the score into slot 3.
    # NOTE(review): presumably averaging over 5 repetitions - confirm.
    acc_mean[3] += acc / 5

    # Gradient Boosting Machine

    # Estimate the hyper-parameters by grid search (original comment: 3-fold).
    # NOTE(review): the search uses max_depth=5, but the final model below is
    # built without it - confirm that is intended.
    grid = GSCV(gbm(max_depth=5), p_gbm)
    grid.fit(data_tr, classes_tr)

    # Accuracy: refit with the best parameters and score on the held-out data.
    gb = gbm(learning_rate=grid.best_params_['learning_rate'],
             n_estimators=grid.best_params_['n_estimators'])
    gb.fit(data_tr_pca, classes_tr)
    acc = gb.score(data_te_pca, classes_te)
    # Accumulate one fifth of the score into slot 4.
    acc_mean[4] += acc / 5

# Convert every accumulated mean accuracy to a percentage rounded to 2 decimals.
# Walk the positions (or the keys, if acc_mean is a dict) rather than the
# container itself: iterating a list/array yields the stored accuracies, which
# are not valid indices.
for i in (acc_mean if isinstance(acc_mean, dict) else range(len(acc_mean))):
    acc_mean[i] = np.round(acc_mean[i] * 100, 2)

# Show the accuracy of each method.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print('Acuracias de cada metodo:')
print('K-Nearest Neighbors - ' + str(acc_mean[0]) + ' %')
# Report the score of `tree` on the training split and on the test split,
# each formatted to three decimals.
print("Accuracy on the training subset: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on the test subset: {:.3f}".format(tree.score(X_test, y_test)))

from sklearn.ensemble import GradientBoostingClassifier as gbm
# Settings shared by every model in the comparison.
original_params = {'n_estimators': 50, 'random_state': 2}

plt.figure()
# One curve per (label, colour, overrides) triple: fit a model with the
# overrides applied and plot its test-set ROC AUC after each boosting stage.
for label, color, setting in [('Depth 2, lr = 1.0', 'turquoise', {'learning_rate': 1.0, 'max_depth': 2}),
                              ('Depth 4, lr = 1.0', 'cadetblue', {'learning_rate': 1.0, 'max_depth': 4}),
                              ('Depth 6, lr = 1.0', 'blue',      {'learning_rate': 1.0, 'max_depth': 6}),
                              ('Depth 2, lr = 0.1', 'orange',    {'learning_rate': 0.1, 'max_depth': 2}),
                              # Fixed: this entry was labelled depth 4 but trained with max_depth=6,
                              # duplicating the curve below.
                              ('Depth 4, lr = 0.1', 'red',       {'learning_rate': 0.1, 'max_depth': 4}),
                              ('Depth 6, lr = 0.1', 'purple',    {'learning_rate': 0.1, 'max_depth': 6})]:
    params = dict(original_params)
    params.update(setting)
    clf = gbm(**params)
    clf.fit(X_train, y_train)

    # Compute the test-set AUC after every boosting stage. The array keeps its
    # historical name `test_deviance`, but it stores ROC AUC values.
    test_deviance = np.zeros((params['n_estimators'],), dtype=np.float64)
    for i, y_pred in enumerate(clf.staged_predict_proba(X_test)):
        # Column 1 of the staged probabilities is used as the score.
        test_deviance[i] = metrics.roc_auc_score(y_test, y_pred[:, 1])
    # Plot AUC against the 1-based boosting iteration.
    plt.plot((np.arange(test_deviance.shape[0]) + 1), test_deviance,
             '-', color=color, label=label)

# Use the `plt` alias consistently (the block previously mixed `plt.` and `pyplot.`).
plt.legend(loc='lower right')
plt.ylim(0.90, 1.0)
plt.xlabel('Boosting Iterations')
plt.ylabel("validation auc")
plt.figure(figsize=(12,12))
# Integer code for each known water-source type; any other value maps to 11.
water_source_codes = {
    'Borehole or tubewell': 1,
    'Unprotected dug well': 2,
    'Protected dug well': 3,
    'Protected spring': 4,
    'Public tap or standpipe': 5,
    'Rainwater': 6,
    'Piped into public tap or basin': 7,
    'Surface water': 8,
    'Null': 9,
    'Unprotected spring': 10,
}

# Encode the first column of train_sub1 into the numeric target column 'dep'.
train_sub1['dep'] = [water_source_codes.get(x, 11) for x in train_sub1.iloc[:, 0]]

# Drop the raw text column now that it has been encoded.
train_sub2 = train_sub1.drop(['Water Source Type'], axis=1)
train_sub2

# Keep the selected word-indicator columns plus the target. A list (not a
# tuple) is used as the column indexer so it is always read as a set of labels.
train_sub3 = train_sub2.loc[:, ['well', 'spring', 'borehole', 'tube', 'pump', 'rainwater', 'public',
                                'communal', 'pumps', 'protected', 'unprotected', 'gravity', 'dams',
                                'dam', 'stand', 'phe', 'none', 'spot', 'piped', 'boreholes', 'rain',
                                'ii', 'eg', 'surface', 'of', 'not', 'reservoir', 'elevated', 'roof',
                                'no', 'pw', 'other', 'handpump', 'kiosk', 'dep']]

# Bare expressions kept from the original notebook cells; they only display
# output in an interactive session.
train_sub3

train_sub3.head(1000)

# GBM

from sklearn.ensemble import GradientBoostingClassifier as gbm
# NOTE(review): sklearn.cross_validation no longer exists in current
# scikit-learn releases (KFold now lives in sklearn.model_selection, with a
# different constructor). Left unchanged because KFold's usage is not visible here.
from sklearn.cross_validation import KFold

clf = gbm(loss='deviance', learning_rate=0.01, n_estimators=1000, subsample=0.75, verbose=1)

# Split the frame once into predictors and target instead of dropping 'dep'
# separately for fit and for score.
gbm_features = train_sub3.drop('dep', axis=1)
gbm_target = train_sub3['dep']

clf.fit(gbm_features, gbm_target)

# The score is computed on the same rows the model was fitted on.
print("Score: ", clf.score(gbm_features, gbm_target))
print("Feature importances: ", clf.feature_importances_)

# Classification

import numpy as np
import pandas as pd

# Train data

train = pd.read_csv("/Users/ankur/Documents/Competitions/Top coder2/train.csv")

# Quick look at the frame and at the distinct values of its fourth column.
# These bare expressions only display output in an interactive session.
train.head()
train.shape
train.iloc[:, 3].unique()
train.iloc[:, 3].value_counts()