Example #1
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score

def gradBoosting_parameterTuning(train_dog,label_dog,train_cat,label_cat):
    #grid-search n_estimators, learning_rate, min_samples_split, subsample and max_depth
    seed=123
    estimators_chosen=[800,1000]
    learning_chosen=[0.01,0.05]
    min_samples_split=[50,80,100]
    subsample=[0.4,0.6,0.8]
    max_depth=[8,10,12]
    min_score=np.inf
    best_params_dog={'n_estimators':0,'learning_rate':0,'min_samples_split':0,'subsample':0,'max_depth':0}

    gbc=GradientBoostingClassifier(max_features='sqrt',max_depth=6,min_samples_split=50,subsample=0.8)
    #exhaustive search over the grid for the dog dataset
    for i in estimators_chosen:
        for j in learning_chosen:
            for z in min_samples_split:
                for m in subsample:
                    for n in max_depth:
                        gbc.set_params(n_estimators=i,learning_rate=j,min_samples_split=z,subsample=m,max_depth=n)
                        kfold=KFold(n_splits=10,shuffle=True,random_state=seed)
                        score=cross_val_score(gbc,X=train_dog,y=label_dog,scoring='neg_log_loss',cv=kfold)
                        score=(-score.mean())
                        print('For dog dataset:')
                        print('The n_estimators=%d, the learning_rate=%.3f, the min_samples_split=%d, '
                              'the subsample=%.2f, the max_depth=%d give the score=%f'%(i,j,z,m,n,score))
                        if score<min_score:
                            min_score=score
                            best_params_dog['n_estimators']=i
                            best_params_dog['learning_rate']=j
                            best_params_dog['min_samples_split']=z
                            best_params_dog['subsample']=m
                            best_params_dog['max_depth']=n
    print('Best params for dog: {} {} {} {} {}, score: {}'.format(best_params_dog['n_estimators'], best_params_dog['learning_rate'],
                                                                  best_params_dog['min_samples_split'], best_params_dog['subsample'],
                                                                  best_params_dog['max_depth'], min_score))
    #exhaustive search over the same grid for the cat dataset
    min_score = np.inf
    best_params_cat = {'n_estimators': 0, 'learning_rate': 0, 'min_samples_split': 0, 'subsample': 0, 'max_depth': 0}
    for i in estimators_chosen:
        for j in learning_chosen:
            for z in min_samples_split:
                for m in subsample:
                    for n in max_depth:
                        gbc.set_params(n_estimators=i,learning_rate=j,min_samples_split=z,subsample=m,max_depth=n)
                        kfold=KFold(n_splits=10,shuffle=True,random_state=seed)
                        score=cross_val_score(gbc,X=train_cat,y=label_cat,scoring='neg_log_loss',cv=kfold)
                        score=(-score.mean())
                        print('For cat dataset:')
                        print('The n_estimators=%d, the learning_rate=%.3f, the min_samples_split=%d, '
                              'the subsample=%.2f, the max_depth=%d give the score=%f'%(i,j,z,m,n,score))
                        if score<min_score:
                            min_score=score
                            best_params_cat['n_estimators']=i
                            best_params_cat['learning_rate']=j
                            best_params_cat['min_samples_split']=z
                            best_params_cat['subsample']=m
                            best_params_cat['max_depth']=n
    print('Best params for cat: {} {} {} {} {}, score: {}'.format(best_params_cat['n_estimators'], best_params_cat['learning_rate'],
                                                                  best_params_cat['min_samples_split'], best_params_cat['subsample'],
                                                                  best_params_cat['max_depth'], min_score))
Example #2

    def __init__(self, estimator,
                 phase, 
                 n_jobs, cv_k_fold, parameters,
                 X_train, y_train,
                 X_test, y_test):
        # estimator : ensemble learner

        # if phase == "train", run a grid search to find the best parameters
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(clf, parameters, 
                                verbose = 10, 
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_
            
            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

        # NOTE: gscv only exists when phase == "train"; unlike Example #8,
        # this variant supplies no default parameters for other phases
        estimator.set_params(**gscv.best_params_)
        self.estimator = estimator
        self.one_hot_encoding = None
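
# clf.loss_ is a private attribute that recent scikit-learn releases removed.
# A minimal sketch (not from the original) of the same per-iteration test curve
# using only public APIs, with clf, X_test, y_test as in the snippet above:
import numpy as np
from sklearn.metrics import log_loss

test_loss = np.array([log_loss(y_test, proba)
                      for proba in clf.staged_predict_proba(X_test)])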
Example #3

def train_gb(x_train, y_train, x_test, y_test, x_val, y_val, gb_gridsearch):
    print('Training model gradient boosting with sklearn...')
    cls = GradientBoostingClassifier()
    if gb_gridsearch:
        print('Tuning parameters...')
        grid_params_gb = [{
            'learning_rate': [0.05],
            'n_estimators': [1000],
            'max_depth': [6],
            'subsample': [1],
            'min_samples_split': [2],
            'min_samples_leaf': [1],
            'max_features': ['sqrt'],
            'verbose': [1]
        }]
        gs_gb = GridSearchCV(estimator=cls,
                             param_grid=grid_params_gb,
                             scoring='f1_weighted',
                             cv=10,
                             verbose=10,
                             n_jobs=-1)
        gs_gb.fit(x_train, y_train)
        # Best params
        print('Best params: %s' % gs_gb.best_params_)
        # Best cross-validated score (f1_weighted, per the scoring argument)
        print('Best CV f1_weighted: %.3f' % gs_gb.best_score_)
        model = gs_gb.best_estimator_
        #cls.set_params(**gs_gb.best_params_)
        #model = cls.fit(x_train, y_train)
    else:
        params_gb = {
            'learning_rate': 0.05,
            'n_estimators': 500,
            'max_depth': 3,
            'subsample': 1,
            'min_samples_split': 2,
            'min_samples_leaf': 1,
            'max_features': 'sqrt',
            'verbose': 2
        }
        cls.set_params(**params_gb)
        model = cls.fit(x_train, y_train)
    print(cls.get_params())
    print('Test predictions with trained model...')
    y_pred = model.predict(x_test)
    print('Train predictions with trained model...')
    y_pred_t = model.predict(x_train)
    print('Validation predictions with trained model...')
    y_pred_val = model.predict(x_val)
    print('Confusion matrix test:')
    print(confusion_matrix(y_test, y_pred))
    print('Confusion matrix validation:')
    print(confusion_matrix(y_val, y_pred_val))
    print('Prediction accuracy for test: %.3f ' %
          accuracy_score(y_test, y_pred))
    print('Prediction accuracy for train: %.3f ' %
          accuracy_score(y_train, y_pred_t))
    print('Prediction accuracy for validation: %.3f ' %
          accuracy_score(y_val, y_pred_val))
    return model
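
# Hypothetical usage of train_gb on a synthetic three-way split; the imports
# below are the ones the snippet itself assumes to be in scope:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=600, random_state=0)
x_tmp, x_test, y_tmp, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(x_tmp, y_tmp, test_size=0.25, random_state=0)
model = train_gb(x_train, y_train, x_test, y_test, x_val, y_val, gb_gridsearch=False)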
Example #4

def trainModel(strModelName, dcModelParams, arrX_train, arrY_train):
    """
        Use the given model setting and data to train a model
    """
    # setup model
    model = None
    if (strModelName == 'GBRT'):
        model = GradientBoostingClassifier()
    elif (strModelName == 'decision_tree'):
        model = DecisionTreeClassifier()
    elif (strModelName == 'extra_trees'):
        model = ExtraTreesClassifier()
    elif (strModelName == 'random_forest'):
        model = RandomForestClassifier()
    elif (strModelName == 'SVM'):
        model = SVC()
    else:
        raise KeyError("Unsupported model: %s" % strModelName)
        
    if(dcModelParams is not None):
        model.set_params(**dcModelParams)

    # train
    model.fit(arrX_train, arrY_train)
    
    return model
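
# Hypothetical usage; the model name must match one of the branches above and
# GradientBoostingClassifier must be imported, as trainModel assumes:
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=200, random_state=0)
model = trainModel('GBRT', {'n_estimators': 50, 'max_depth': 2}, X, y)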
Example #5
def gradient_boosting_classifier(X_train_res, X_test, y_train_res, y_test):
    clf = GradientBoostingClassifier()  # instance of gradient boosting classifier
    clf.set_params(learning_rate=1,
                   max_depth=3,
                   n_estimators=30,
                   min_samples_split=3)  # tuned gradient boosting
    #clf.set_params(n_estimators = 30,learning_rate = 1)
    gb_clf = clf.fit(X_train_res,
                     y_train_res)  # fitting model on sampled train data
    gb_predict = gb_clf.predict(X_test)  # predict on test data
    gb_acc = accuracy_score(y_test, gb_predict)  # accuracy score
    gb_kappa = cohen_kappa_score(y_test, gb_predict)  # Cohen's kappa score
    accuracy = cross_val_score(clf,
                               X_train_res,
                               y_train_res,
                               cv=10,
                               scoring='accuracy')  # 10-fold accuracy score
    f_score = cross_val_score(clf,
                              X_train_res,
                              y_train_res,
                              cv=10,
                              scoring='f1_micro')  # 10-fold f1-score
    gb_accuracy, gb_f_score = accuracy.mean(), f_score.mean()  # mean 10-fold accuracy and f1
    #print "accuracy and f_score are:  "
    return gb_accuracy, gb_f_score, gb_clf, gb_predict, gb_kappa
Example #6
def getGradientBDTClassifier(options={}):
    """the standard BDT classifier based on Gradient Boosting"""

    bdt = GradientBoostingClassifier(n_estimators=120,
                                     learning_rate=0.13,
                                     max_depth=5,
                                     min_weight_fraction_leaf=0.01,
                                     random_state=0)
    bdt.set_params(**options)
    return bdt
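
# The options dict overrides any of the defaults above; a hypothetical call:
bdt = getGradientBDTClassifier({'n_estimators': 200, 'learning_rate': 0.05})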
Example #7
def getGradBDT(options={}):
    """ Standard BDT classifier based on GradienBoosting"""
    bdt = GradientBoostingClassifier(n_estimators=20,
                                     learning_rate=0.08,
                                     max_depth=6,
                                     min_weight_fraction_leaf=0.08,
                                     random_state=0,
                                     verbose=4)
    bdt.set_params(**options)
    return bdt
Example #8
    def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
                 X_train, y_train, X_test, y_test):
        # estimator : ensemble learner

        # if phase == "train", run a grid search to find the best parameters
        if phase == "train":
            gscv = GridSearchCV(
                GradientBoostingClassifier(),
                parameters,
                verbose=10,
                scoring="f1",  #scoring = "precision" or "recall"
                n_jobs=n_jobs,
                cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            best_params = gscv.best_params_
            print "[GBDT's Best Parameter]", gscv.best_params_

            clf = GradientBoostingClassifier()
            clf.set_params(**gscv.best_params_)
            del gscv
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     test_loss,
                     label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     train_loss,
                     label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()
        else:
            best_params = {
                'loss': 'deviance',
                'learning_rate': 0.1,
                'max_depth': 2,
                'min_samples_leaf': 8,
                'max_features': 5,  # max_features must be in (0, n_features]
                'max_leaf_nodes': 20,
                'subsample': 0.1,
                'n_estimators': 100,
                'random_state': 0
            }

        estimator.set_params(**best_params)
        self.estimator = estimator
        self.one_hot_encoding = None
Example #9

    def GBC(self):
        print("*********** Gradient Boosting Classifier ***********")
        Model = GradientBoostingClassifier()
        param_grid = [{
            'loss': ['deviance', 'exponential'],
            'n_estimators':
            np.arange(10, 200, 5),  #[10, 40, 70, 80, 90, 100, 120, 140, 150],
            'learning_rate':
            np.arange(0.01, 1, 0.01),  # learning_rate must be > 0
            'subsample':
            np.arange(0.1, 1, 0.05),  #[0.1,0.3,0.5,0.7,0.9,1],
            'min_samples_split': [2, 4, 5, 7, 9, 10],
            'min_samples_leaf': [1, 2, 3, 4, 5],
            'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
            'max_features': ['sqrt', 'log2']  # 'auto' is no longer accepted by recent scikit-learn
        }]

        clf = RandomizedSearchCV(Model,
                                 param_distributions=param_grid,
                                 n_iter=5,
                                 scoring='roc_auc',
                                 n_jobs=-1,
                                 cv=10,
                                 verbose=3)
        best_clf = clf.fit(self.X, self.y)

        print(
            f'ROC AUC on training data - : {best_clf.score(self.X,self.y):.3f}')
        params = best_clf.best_estimator_.get_params()
        estimator = Model.set_params(**params)  # returns the (unfitted) Model carrying the best params
        return estimator
Example #10

    def __init__(self, estimator,
                 phase, 
                 n_jobs, cv_k_fold, parameters,
                 X_train, y_train,
                 X_test, y_test):
        # estimator : ensemble learner

        # if phase == "train", run a grid search to find the best parameters
        if phase == "train":
            gscv = GridSearchCV(GradientBoostingClassifier(), 
                                parameters, 
                                verbose = 10, 
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            best_params = gscv.best_params_
            print "[GBDT's Best Parameter]", gscv.best_params_
            
            clf = GradientBoostingClassifier()
            clf.set_params(**gscv.best_params_)
            del gscv
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()
        else:
            best_params = {'loss' : 'deviance',
                           'learning_rate' : 0.1,
                           'max_depth': 2,
                           'min_samples_leaf': 8,
                           'max_features': 5,  # max_features must be in (0, n_features]
                           'max_leaf_nodes' : 20,
                           'subsample' : 0.1,
                           'n_estimators' : 100,
                           'random_state' : 0}
            
        estimator.set_params(**best_params)
        self.estimator = estimator
        self.one_hot_encoding = None
Example #11
def grid_search(gbdt_model, param_search, dtrain):
    gbdt_gs = GradientBoostingClassifier(learning_rate=0.1,
                                         n_estimators=100,
                                         subsample=1.0,
                                         min_samples_split=2,
                                         min_samples_leaf=1,
                                         max_depth=3,
                                         max_features='sqrt')
    # carry over any parameters already tuned on gbdt_model
    param_set = gbdt_model.get_params()
    gbdt_gs.set_params(**param_set)

    gsearch = GridSearchCV(estimator=gbdt_gs, param_grid=param_search, cv=5)
    gsearch.fit(dtrain.values[:, 1:], dtrain.values[:, 0])
    print(gsearch.cv_results_)
    print(gsearch.best_score_)
    print(gsearch.best_params_)
    return gsearch.best_params_
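
# Hypothetical usage: grid_search expects the label in column 0 of dtrain,
# so the synthetic frame below (an assumption, not from the source) stacks
# y in front of the features:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
dtrain = pd.DataFrame(np.column_stack([y, X]))
best_params = grid_search(GradientBoostingClassifier(),
                          {'max_depth': [3, 5], 'min_samples_split': [2, 50]},
                          dtrain)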
Example #12
    def _build_ml_model(self, param_model=None):
        if self.ml_model == "gbm":
            model = GradientBoostingClassifier()
        elif self.ml_model == "adaboost":
            model = AdaBoostClassifier()
        elif self.ml_model == "rf":
            model = RandomForestClassifier()
        elif self.ml_model == "svc":
            model = SVC()
        else:
            raise ValueError(
                "Please use either one of the following values 'gbm', 'adaboost', 'rf', 'svc'"
            )

        if param_model is not None:
            model.set_params(**param_model)
        return model
Example #13
def model_fit(dtrain, param_set=None):
    gbdt_model = GradientBoostingClassifier(learning_rate=0.1,
                                            n_estimators=100,
                                            subsample=1.0,
                                            min_samples_split=2,
                                            min_samples_leaf=1,
                                            max_depth=3,
                                            max_features='sqrt')
    if param_set:
        gbdt_model.set_params(**param_set)

    gbdt_model.fit(dtrain.values[:, 1:], dtrain.values[:, 0])
    dtrain_pred = gbdt_model.predict(dtrain.values[:, 1:])
    print(gbdt_model.feature_importances_)
    print('Accuracy: %.4g' %
          metrics.accuracy_score(dtrain.values[:, 0], dtrain_pred))
    return gbdt_model
Example #14
def gb_paramsearch(DEPTH, COLUMNS, COLNAME, START=0, ENDIX=11):
    gbt = pd.DataFrame({
        "trees": range(START, 1600),
        "columns": COLNAME,
        "depth": DEPTH,
        "unoC_train": 0,
        "acc_train": 0,
        "aucPR_train": 0,
        "aucROC_train": 0,
        "unoC_val": 0,
        "acc_val": 0,
        "aucPR_val": 0,
        "aucROC_val": 0
    })

    gb = GradientBoostingClassifier(random_state=0,
                                    verbose=True,
                                    min_samples_leaf=5,
                                    max_depth=DEPTH,
                                    n_estimators=START,
                                    subsample=1,
                                    learning_rate=0.1)
    if START > 0:
        print("pretraining!")
        gb.fit(train.loc[tra_ix, COLUMNS],
               train.loc[tra_ix, "AnyOutcome"],
               sample_weight=train.loc[tra_ix, "IPCW"])

    t0 = time.time()
    for i in range(1, ENDIX):
        if i % 10 == 0:
            print(i)
            print(time.time() - t0)
            t0 = time.time()

        _ = gb.set_params(n_estimators=START + 20 * i, warm_start=True)

        gb.fit(train.loc[tra_ix, COLUMNS],
               train.loc[tra_ix, "AnyOutcome"],
               sample_weight=train.loc[tra_ix, "IPCW"])
        print(gb.n_estimators_)

        for d in ["train", "val"]:
            gc.collect()
            ms = get_metrics(d, gb, COLUMNS)
            print(ms)
            for k, v in ms.items():
                gbt.loc[gbt.trees == gb.n_estimators_, k] = v

    gbt = gbt[~(gbt.unoC_val == 0)].reset_index(drop=True)
    previous = pd.read_csv("./Performance_Metrics/metric_df.csv")

    gbt = pd.concat([previous, gbt])
    print("Finished! We have this many rows in our data frame:", len(gbt))
    gbt.to_csv("./Performance_Metrics/metric_df.csv", index=False)
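
# The warm_start pattern above grows one ensemble instead of refitting from
# scratch; a minimal self-contained sketch (synthetic data is an assumption):
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=500, random_state=0)
gb = GradientBoostingClassifier(n_estimators=20, warm_start=True, random_state=0)
for n in (20, 40, 60):
    gb.set_params(n_estimators=n)
    gb.fit(X, y)  # with warm_start=True, only the new trees are fitted
    print(n, gb.score(X, y))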
Example #15
def runGradientBoostingClassifier(x_train, y_train, x_test, y_test, p):

    # Here we instantiate the gradient boosting classifier
    clf = GradientBoostingClassifier()
    clf.set_params(**p)

    clf.fit(x_train, y_train)

    # now we have to compute the classification accuracy
    # by comparing the test predictions against y_test
    gbc_predictions = clf.predict(x_test)
    dt_score = accuracy_score(y_test, gbc_predictions)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)

    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)

    return (train_score, dt_score)
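
# Hypothetical usage on a toy split; the imports are the ones the snippet
# itself assumes to be in scope:
import sys
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, random_state=0)
train_acc, test_acc = runGradientBoostingClassifier(
    x_tr, y_tr, x_te, y_te, {'n_estimators': 50, 'max_depth': 2})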
Example #16
class Hyperopt_gbc:
    def __init__(self, X, y, seed):
        self.name = 'Gradient Boosting'
        self.name_short = 'GBC'
        self.X = X
        self.y = y
        self.seed = seed        
        self.clf = None
        self.best_acc = 0
        self.space = {
              'max_depth': hp.choice('max_depth', range(1, 30)),
              'max_features': hp.choice('max_features', range(1, 5)),
              'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 50)),
              'min_samples_split': hp.choice('min_samples_split', range(10, 50, 10)),
              'max_leaf_nodes': hp.choice('max_leaf_nodes', range(2, 50)),
              'loss': hp.choice('loss', ['deviance', 'exponential']),
              'n_estimators': hp.choice('n_estimators', range(1, 500, 5)),
              'subsample' : hp.quniform('subsample', 0.1, 1, 0.01),
              'learning_rate' : hp.quniform('learning_rate', 0.01, 0.5, 0.01)
               }
        self.max_evals = 50

    def train_test(self, params):
        warnings.filterwarnings(action='ignore', category=DeprecationWarning)
        self.clf = GradientBoostingClassifier(**params)
        self.clf.fit(self.X, self.y)
        return cross_val_score(self.clf, self.X, self.y, scoring='roc_auc', cv=10).mean()
    
    def f(self, params):
        acc = self.train_test(params)  # mean CV ROC AUC, despite the name
        if acc > self.best_acc:
            self.best_acc = acc
        return {'loss': -acc, 'status': STATUS_OK}
    
    def best(self):
        trials = Trials()
        best = fmin(self.f, self.space, algo=tpe.suggest, max_evals = self.max_evals, rstate= np.random.RandomState(self.seed), trials=trials)
        best_params = space_eval(self.space, best)  # fmin returns hp.choice indices, not values
        self.clf.set_params(**best_params)
        return self.clf, self.name, self.name_short, best_params, self.best_acc
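
# Hypothetical usage on synthetic data; the imports are the ones the class
# itself assumes to be in scope:
import warnings
import numpy as np
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK, space_eval
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=300, random_state=0)
opt = Hyperopt_gbc(X, y, seed=0)
clf, name, name_short, best_params, best_auc = opt.best()
print(name, best_params, best_auc)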
Example #17
    def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
                 X_train, y_train, X_test, y_test):
        # estimator : ensemble learner

        # if phase == "train", run a grid search to find the best parameters
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(
                clf,
                parameters,
                verbose=10,
                scoring="f1",  #scoring = "precision" or "recall"
                n_jobs=n_jobs,
                cv=cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_

            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     test_loss,
                     label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1,
                     train_loss,
                     label='Train')
            plt.xlabel('the number of weak learner:Boosting Iterations')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

        # NOTE: gscv only exists when phase == "train" (compare Example #8,
        # which supplies default parameters otherwise)
        estimator.set_params(**gscv.best_params_)
        self.estimator = estimator
        self.one_hot_encoding = None
Example #18
ax.annotate('', xy=(800, test_deviance[799]), xycoords='data',
            xytext=(800, est.train_score_[799]), textcoords='data', arrowprops=dict(arrowstyle="-"))
ax.text(810, 0.25, 'train-test gap')

def fmt_params(params):
    return ','.join("{0}={1}".format(key, val) for key, val in params.items())

fig = plt.figure(figsize=(10, 10))
ax = plt.gca()
for params, (test_color, train_color) in [({}, ('#d7191c', '#2c7bb6')),
                                          # ({'min_samples_leaf': 3}, ('#fdae61', '#abd9e9')),
                                          ({'learning_rate': 0.1}, ('#bcbcbc', '#ccebc4')),
                                          ({'learning_rate': 0.1, 'subsample': 0.5}, ('#7A68A6', '#FFB5B8'))]:
    est = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=1, learning_rate=1.0)
    est.set_params(**params)
    est.fit(x_train, y_train)

    ax, test_dev = deviance_plot(est, x_test, y_test, ax=ax, label=fmt_params(params), train_color=train_color, test_color=test_color)

ax.annotate('Higher bias', xy=(900, est.train_score_[899]), xycoords='data',
            xytext=(600, 0.3), textcoords='data', arrowprops=dict(arrowstyle="-", connectionstyle="arc"))
ax.annotate('Lower variance', xy=(900, test_deviance[899]), xycoords='data',
            xytext=(600, 1.0), textcoords='data', arrowprops=dict(arrowstyle="-"))

plt.legend(loc='upper right')

from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.1, 0.05, 0.02, 0.01],
Example #19

def main():


    ########################## 
    # Dataset initialization # 
    ########################## 

    print('Dataset initialization')
    
    try :

        vectors = pickle.load(open(VECTORS_PATH, 'rb'))
        xs, ys = vectors['xs'], vectors['ys']

    except FileNotFoundError:

        xs, ys = vectorize(DATA_PATH, LABEL_PATH)
        pickle.dump({ 'xs': xs, 'ys': ys }, open(VECTORS_PATH, 'wb'))
        
    print('Class Distribution Bar Graph')
    class_dist_bar(LABEL_PATH)

    ##########################
    # Parameter Optimization #
    ##########################

    print('Parameter Optimization')
    max_depth = int(len(xs[1]) * .40) - 1
    single = int(max_depth/5)

    # Random Forest Parameter Grid
    rfc_param_grid = [{
        'n_estimators': [i for i in range(100, 1100, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        # 'n_jobs': [NUM_CORES],
        'random_state': [RANDOM_STATE] 
    }]
    rfc_px_len = len(rfc_param_grid[0]['n_estimators'])
    rfc_py_len = len(rfc_param_grid[0]['max_depth'])

    # Gradient Boost Parameter Grid
    gbc_param_grid = [{
        'n_estimators': [i for i in range(100, 1100, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        'random_state': [RANDOM_STATE] 
    }]
    gbc_px_len = len(gbc_param_grid[0]['n_estimators'])
    gbc_py_len = len(gbc_param_grid[0]['max_depth'])
      
    # XGBoost Parameter Grid
    xgb_param_grid = [{
        'nthread': [NUM_CORES], 
        'objective': ['binary:logistic'],
        'learning_rate': [0.05], 
        'n_estimators': [i for i in range(100, 1200, 100)],
        'max_depth': [i for i in range(2, 22, 2)],
        'seed': [RANDOM_STATE_XGB]
    }]

    xgb_px_len = len(xgb_param_grid[0]['n_estimators'])
    xgb_py_len = len(xgb_param_grid[0]['max_depth'])
    
    # Random Forest
    print('\tRandom Forest')

    try :

        rfc_results = pickle.load(open(RFC_GRID_SEARCH_PATH, 'rb'))
        param_selection_heat_map(rfc_results, rfc_px_len, rfc_py_len, GRID_SEARCH_CV_FOLDS, RFC_GRID_SEARCH_GRAPH_PATH, 'Random Forest Classifier Parameter Selection')

    except FileNotFoundError:

        rfc_results = optimize_hyper_params('rfc', rfc_param_grid, xs, ys)
        pickle.dump(rfc_results, open(RFC_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(rfc_results, rfc_px_len, rfc_py_len, GRID_SEARCH_CV_FOLDS, RFC_GRID_SEARCH_GRAPH_PATH, 'Random Forest Classifier Parameter Selection')

    # Gradient Boosted Trees 
      
    print('\tGradient Boosted Trees')

    try:

        gbc_results = pickle.load(open(GBC_GRID_SEARCH_PATH, 'rb')) 
        param_selection_heat_map(gbc_results, gbc_px_len, gbc_py_len, GRID_SEARCH_CV_FOLDS, GBC_GRID_SEARCH_GRAPH_PATH, 'Gradient Boosted Trees Parameter Selection')

    except FileNotFoundError:

        gbc_results = optimize_hyper_params('gbc', gbc_param_grid, xs, ys)
        pickle.dump(gbc_results, open(GBC_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(gbc_results, gbc_px_len, gbc_py_len, GRID_SEARCH_CV_FOLDS, GBC_GRID_SEARCH_GRAPH_PATH, 'Gradient Boosted Trees Parameter Selection')

    # XGBoost

    print('\tXGBoost')
    
    try:

        xgb_results = pickle.load(open(XGB_GRID_SEARCH_PATH, 'rb')) 
        param_selection_heat_map(xgb_results, xgb_px_len, xgb_py_len, GRID_SEARCH_CV_FOLDS, XGB_GRID_SEARCH_GRAPH_PATH, 'XGBoost Parameter Selection')

    except FileNotFoundError:

        xgb_results = optimize_hyper_params('xgb', xgb_param_grid, xs, ys)
        pickle.dump(xgb_results, open(XGB_GRID_SEARCH_PATH, 'wb'))
        param_selection_heat_map(xgb_results, xgb_px_len, xgb_py_len, GRID_SEARCH_CV_FOLDS, XGB_GRID_SEARCH_GRAPH_PATH, 'XGBoost Parameter Selection')

    #################### 
    # Final Train/Test # 
    #################### 

    print('Final Train/Test')

    try:

        final_scores = pickle.load(open(FINAL_RESULTS_PATH, 'rb'))

    except FileNotFoundError:

        opt_params = { 
            'rfc': {
                1:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                2:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                3:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                4:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                5:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                6:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                7:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                8:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                9:  { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                10: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                11: { 'n_estimators': 100, 'max_depth': 20, 'random_state': RANDOM_STATE }, 
                12: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                13: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                14: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
                15: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                16: { 'n_estimators': 100, 'max_depth': 16, 'random_state': RANDOM_STATE },
                17: { 'n_estimators': 500, 'max_depth': 10, 'random_state': RANDOM_STATE },
            },  
            'gbc': {
                1: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE }, 
                2: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE },
                3: { 'n_estimators': 600, 'max_depth': 4, 'random_state': RANDOM_STATE }, 
                4: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE },
                5: { 'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE }, 
                6: { 'n_estimators': 100, 'max_depth': 8, 'random_state': RANDOM_STATE },
                7: { 'n_estimators': 100, 'max_depth': 4, 'random_state': RANDOM_STATE }, 
                8: { 'n_estimators': 200, 'max_depth': 12, 'random_state': RANDOM_STATE },
                9: { 'n_estimators': 100, 'max_depth': 8, 'random_state': RANDOM_STATE }, 
                10: { 'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE },
                11: { 'n_estimators': 900, 'max_depth': 4, 'random_state': RANDOM_STATE }, 
                12: { 'n_estimators': 200, 'max_depth': 2, 'random_state': RANDOM_STATE },
                13: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE }, 
                14: { 'n_estimators': 300, 'max_depth': 4, 'random_state': RANDOM_STATE },
                15: { 'n_estimators': 100, 'max_depth': 2, 'random_state': RANDOM_STATE }, 
                16: { 'n_estimators': 300, 'max_depth': 8, 'random_state': RANDOM_STATE },
                17: { 'n_estimators': 100, 'max_depth': 10, 'random_state': RANDOM_STATE },
            }   
        }   

        final_scores = {}
        for i in range(1, 18):
            rfc = RandomForestClassifier()
            rfc.set_params(**opt_params['rfc'][i])
            gbc = GradientBoostingClassifier()
            gbc.set_params(**opt_params['gbc'][i])
            # xgb = XGBClassifier()
            final_scores[i] = {}
            final_scores[i]['rfc'] = cross_val_score(rfc, xs[i], ys[i], cv=CV_FOLDS, n_jobs=NUM_CORES, scoring='f1')
            final_scores[i]['gbc'] = cross_val_score(gbc, xs[i], ys[i], cv=CV_FOLDS, n_jobs=NUM_CORES, scoring='f1')
            # final_scores[i]['xgb'] = cross_val_score(xgb, np.array(xs[i]), np.array(ys[i]), cv=CV_FOLDS, scoring='f1')

        pickle.dump(final_scores, open(FINAL_RESULTS_PATH, 'wb'))

    fig, axarr = plt.subplots(5, 4, figsize=(25, 25))
    for i in range(1, 18):

        a, b = final_scores[i]['rfc'], final_scores[i]['gbc']
        # a, b, c = final_scores[i]['rfc'], final_scores[i]['gbc'], final_scores[i]['xgb']
        row, col = int((i-1)/4), (i-1)%4
        axarr[row][col].boxplot([a, b])
        # axarr[row][col].boxplot([a, b, c])
        axarr[row][col].set_title('Body Zone %s' % (i), fontsize=28)
        axarr[row][col].set_xticklabels(['RFC', 'GBC'], fontsize=24)
        # axarr[row][col].set_xticklabels(['RFC', 'GBC', 'XGB'])
        axarr[row][col].set_ylabel('F1 Score', fontsize=24)

    for i in range(1, 4): axarr[4][i].axis('off')
    plt.suptitle("Model Comparison", fontsize=30, fontweight='bold')
    plt.tight_layout()
    plt.subplots_adjust(top=.95)
    plt.savefig(FINAL_RESULTS_GRAPH_PATH)
Example #20

voting_clf_soft = VotingClassifier(estimators=model_list, voting='soft')
voting_clf_soft.fit(X_train, y_train)
esm_score_val = voting_clf_soft.score(X_val, y_val)
esm_score_test = voting_clf_soft.score(X_test, y_test)
#print(esm_score_val)
#print(esm_score_test)
#tune_parameters = {'n_estimators' : [50, 100]}
#gbm_clf = GridSearchCV(estimator = GradientBoostingClassifier(max_depth=6, random_state=0), param_grid=tune_parameters)
#gbm_clf.fit(X_train, y_train)
trees = (10, 50, 100)
gbm_clf_final = GradientBoostingClassifier(max_depth=6)
training_scores = list()
validation_scores = list()
test_scores = list()
for tree in trees:
    gbm_clf_final.set_params(n_estimators=tree)
    gbm_clf_final.fit(X_train, y_train)
    training_scores.append(gbm_clf_final.score(X_train, y_train))
    validation_scores.append(gbm_clf_final.score(X_val, y_val))
    test_scores.append(gbm_clf_final.score(X_test, y_test))
plt.plot(trees, training_scores, label='Train')
plt.plot(trees, test_scores, label='Test')
plt.plot(trees, validation_scores, label='Validation')
plt.xlabel('No. of Trees')
plt.ylabel('Accuracy')
plt.legend(loc='upper left')
#gbm_clf1 = GradientBoostingClassifier(n_estimators=50, max_depth=6)
#gbm_clf1.fit(X_train,y_train)
#gbm_clf2 = GradientBoostingClassifier(n_estimators=100, max_depth=6)
#gbm_clf2.fit(X_train,y_train)
#gbm_clf3 = GradientBoostingClassifier(n_estimators=10, max_depth=6)
Example #21

class AnalyzeBoost:
    """Analyzing the performance of three different boosting methods:
    - AdaBoost,
    - Gradient boost,
    - XGBoost.

    Parameters
    ----------
    X_train : array
      Features of the training set.
    X_test : array
      Features of the test set.
    y_train : array
      Targets of the training set.
    y_test : array
      Targets of the test set.
    method : str
        Boosting method to analyze.
    seed : float
        Random seed.
    n_estimators : int
    learning_rate : float
    max_depth : int
    verbose : boolean
        If True, printouts from the process are provided.

    """
    def __init__(
            self,
            X_train,
            X_test,
            y_train,
            y_test,
            method="xgboost",
            seed=0,
            n_estimators=100,
            learning_rate=0.5,
            max_depth=3,
            verbose=True,
            time_id=time.strftime("%Y%m%d-%H%M%S"),  # note: default evaluated once, at definition time
    ):

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

        self.method = method
        self.seed = seed
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.verbose = verbose
        self.time_id = time_id

        if self.verbose:
            fprint("-----------------------")
            fprint(f"Time: {self.time_id}")
            fprint(f"Number of training samples: {np.shape(self.X_train)[0]}")
            fprint(f"Number of test samples: {np.shape(self.X_test)[0]}")
            fprint(f"Method: {method}")

        if self.method == "adaboost":
            self.base_estimator = DecisionTreeClassifier()
            self.clf = AdaBoostClassifier(base_estimator=self.base_estimator)
            self.max_depth_str = "base_estimator__max_depth"
        elif self.method == "gradientboost":
            self.clf = GradientBoostingClassifier()
            self.max_depth_str = "max_depth"
        elif self.method == "xgboost":
            self.clf = xgb.XGBClassifier()
            self.max_depth_str = "max_depth"
        else:
            print("Provide boost method.")
            sys.exit(1)

    def fit(self):
        parameters = {
            "n_estimators": self.n_estimators,
            "learning_rate": self.learning_rate,
            self.max_depth_str: self.max_depth,
        }

        self.clf.set_params(**parameters)

        if self.verbose:
            fprint(f"Estimators: {self.n_estimators}")
            fprint(f"Learning rate: {self.learning_rate}")
            fprint(f"Max depth: {self.max_depth}")
            print("Making fit...")

        self.clf.fit(self.X_train, self.y_train)

        self.imp = self.clf.feature_importances_
        self.idcs = np.argsort(self.imp)
        np.save("featimp-" + self.method + ".npy", self.imp)

        if self.verbose:
            fprint("Feature importances:")
            for f in range(self.X_train.shape[1]):
                fprint(f"{f}. feat. {self.idcs[f]} ({self.imp[self.idcs[f]]})")

        # Save model
        pickle.dump(self.clf,
                    open(self.time_id + "-" + self.method + "-fit.pkl", "wb"))

    def predict(self):

        if self.verbose:
            print("Making predictions...")

        self.y_pred = self.clf.predict(self.X_test)
        accuracy = accuracy_score(self.y_pred, self.y_test)
        fprint(f"Test accuracy score: {np.around(accuracy, decimals=3)}")

        plot_confusion_matrix(self.y_test,
                              self.y_pred,
                              analysis_id=self.time_id)

    def gridsearch(self, parameters=None, cv=5, load_search=None):
        """Performing a grid search for optimal parameters.

        Parameters
        ----------
        parameters : dict
            Dictionary with the parameters to be tested in the grid search.
        cv : int
            Number of folds in the cross-validation.
        load_search : pickle dump
            The search model from a potential previous grid search, to avoid
            doing a new grid search. If None, a new grid search is performed.


        """

        if load_search is None:
            if parameters is None:
                parameters = [{
                    "learning_rate": [1, 0.5, 0.1],
                    "n_estimators": [100, 150, 200],
                    self.max_depth_str: [5, 7, 9, 11],
                }]

            self.search = GridSearchCV(
                self.clf,
                param_grid=parameters,
                cv=cv,
                n_jobs=-1,
                verbose=6,
                return_train_score=True,
            )
            self.search.fit(self.X_train, self.y_train)

            # Save model
            pickle.dump(
                self.search,
                open(self.time_id + "-" + self.method + "-search.pkl", "wb"),
            )

        else:
            self.search = pickle.load(open(load_search, "rb"))

        # Save results from grid search and print to terminal
        cv_results = pd.DataFrame(self.search.cv_results_)
        cv_results.to_csv(f"{self.time_id}-gridsearch.csv")
        report(self.search.cv_results_)

        # Overwriting parameters to the best parameters found by search
        self.learning_rate = self.search.best_params_["learning_rate"]
        self.n_estimators = self.search.best_params_["n_estimators"]
        self.max_depth = self.search.best_params_[self.max_depth_str]
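
# Hypothetical usage on synthetic data; AnalyzeBoost additionally relies on
# helpers (fprint, plot_confusion_matrix, report) defined elsewhere in its repo:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
analyzer = AnalyzeBoost(X_tr, X_te, y_tr, y_te, method="gradientboost")
analyzer.fit()
analyzer.predict()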
Example #22
def main(
    trainXFile="/home/kiran/kdd/trainXall.csv",
    trainYFile="/home/kiran/kdd/trainY.csv",
    validXFile="/home/kiran/kdd/validXall.csv",
    validYFile="/home/kiran/kdd/validY.csv",
    testXFile="/home/kiran/kdd/testXall.csv",
    n_estimators=150,
    n_estimators_step=150,
    learning_rate=0.005,
    max_features=30,
    max_depth=11,
    verbose=0,
    dump_file="/home/kiran/kdd/pymodels/gbm_all_0.005_30_11.pkl",
    outputFile="prediction.txt",
    max_trees=300,
    random_state=11,
):
    actual = np.loadtxt(validYFile, delimiter=",")
    trainY = np.loadtxt(trainYFile, delimiter=",")
    train = pd.read_csv(trainXFile)
    valid = pd.read_csv(validXFile)
    # trainY = pd.read_csv (trainYFile)
    # trainY = trainY.ix [:,'x']
    # validY = pd.read_csv (validYFile)
    # actual = validY.ix [:,'x']
    # validWeights = pd.read_csv (validFileWeights)
    # validWeights = validWeights.ix [:,'x']
    # actual = actual.get_values ()

    gbm = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_features=max_features,
        max_depth=max_depth,
        random_state=random_state,
        verbose=verbose,
    )
    gbm.fit(train, trainY)
    prediction_valid = gbm.predict_proba(valid)[:, 1]

    # gbm = joblib.load ( '/home/kiran/kdd/pymodels/gbmmore.pkl')
    bestAUC = 0
    # myAUC = kdd_metrics (actual, prediction_valid, validWeights)
    myAUC = metrics.roc_auc_score(actual, prediction_valid)

    while myAUC >= bestAUC:
        n_estimators = n_estimators + n_estimators_step
        gbm.set_params(n_estimators=n_estimators, warm_start=True)
        gbm.fit(train, trainY)
        prediction_valid = gbm.predict_proba(valid)[:, 1]
        myAUC = metrics.roc_auc_score(actual, prediction_valid)
        print "bestAUC: %f myAUC: %f" % (bestAUC, myAUC)
        improvement = myAUC - bestAUC
        if improvement < 0.0000000001:
            break
        if n_estimators > max_trees:
            break
        bestAUC = myAUC
        bestPrediction = prediction_valid
        joblib.dump(gbm, dump_file)
        print "bestAUC: %f improvement: %f" % (bestAUC, improvement)

    myAUC = metrics.roc_auc_score(actual, bestPrediction)
    print "AUC: %f bestPrediction: %f" % (myAUC, improvement)
    test = pd.read_csv(testXFile)
    prediction_test1 = gbm.predict_proba(test)[:, 1]
    prediction_total = np.concatenate((bestPrediction, prediction_test1), axis=0)
    np.savetxt(outputFile, prediction_total, delimiter=",")
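
# The manual grow-and-check-AUC loop above can also be expressed with the
# built-in early stopping added in scikit-learn 0.20; a minimal sketch
# (synthetic data is an assumption, not from the source):
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=500, random_state=0)
gbm = GradientBoostingClassifier(n_estimators=1000,        # upper bound on rounds
                                 validation_fraction=0.1,  # internal hold-out set
                                 n_iter_no_change=10,      # stop once validation score stalls
                                 tol=1e-4)
gbm.fit(X, y)
print(gbm.n_estimators_)  # number of boosting rounds actually fitted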
Example #23
class BADS(object):
    def __init__(self):
        # Data
        self.X_train = None
        self.y_train = None
        self.X_train_cv = None
        self.X_valid_cv = None
        self.y_train_cv = None
        self.y_valid_cv = None
        self.column_names = None

        self.X_test = None
        self.yhat = None

        self.thresholds = None

        # Classifiers
        self.clf = None
        self.clf_cv = None

        # Cost matrix
        self.cm = np.array([[3., 0.], [-10., 0.]])

        # variables to be set
        self.rs = 90049
        self.save_model = False

        ######### Feature Selection #########
        self.manual_features_to_remove = [
            "x_order_date_num", "x_account_creation_date_num",
            "x_deliverydate_estimated_num", "x_deliverydate_actual_num"
        ]
        self.feature_correlation_removal = False
        self.feature_correlation_threshold = 0.7
        self.automatic_feature_selection = False
        self.automatic_feature_threshold = 0.005

        ######### Oversampling #########
        # non-standard package: http://contrib.scikit-learn.org/imbalanced-learn/index.html
        self.oversample_method = "none"

        ######### Cross-Validation #########
        self.do_cv = False  # this takes a long time
        self.cv_num_folds = 4
        self.cv_validation_frac = 0.15
        self.cv_rs_iters = 20
        self.cost_func = self.bads_costs  # bads_costs, roc_auc_score
        self.score_func = self.bads_scorer  # bads_scorer, roc_auc_score
        self.set_model("rf")  # "rf" or "gbc" or "linear"

    def set_model(self, model_to_use=None):
        """Set the model to use from a pre-set list.

        One could set these variables manually but, for ease of use, we have
        created a list of predefined models.

        Parameters
        ----------
        model_to_use: a string of the model to be used.
            If None, use the objects models.

        """
        if model_to_use is None:
            model_to_use = self.model_to_use
        ######### Model Selection #########
        if model_to_use == "rf":
            # Random Forest Classifier
            from sklearn.ensemble import RandomForestClassifier
            self.clf = RandomForestClassifier(random_state=self.rs)
            self.automatic_feature_selection_params = {
                'n_estimators': 250,
                'verbose': 0,
                'n_jobs': 3
            }
            self.clf_default_params = {
                'min_samples_split': 2,
                'n_estimators': 250,
                'min_samples_leaf': 9,
                'criterion': 'gini',
                'verbose': 0,
                'oob_score': True,
                'n_jobs': 3
            }
            self.cv_param_grid = {
                'n_estimators': [100, 250, 500],
                'min_samples_split': [2, 4, 8],
                'min_samples_leaf': [1, 3, 9],
                'n_jobs': [3]
            }
        elif model_to_use == "gbc":
            # Gradient Boosting Classifier
            from sklearn.ensemble import GradientBoostingClassifier
            self.clf = GradientBoostingClassifier(random_state=self.rs)
            self.automatic_feature_selection_params = {
                'n_estimators': 50,
                'verbose': 1
            }
            self.clf_default_params = {
                'learning_rate': 0.1,
                'max_depth': 3,
                'n_estimators': 100,
                'verbose': 1
            }
            self.cv_param_grid = {
                'n_estimators': [50, 100, 250, 500],
                'learning_rate': [0.05, 0.1, .25],
                'max_depth': [3, 5, 9]
            }
        elif model_to_use == "linear":
            # Logistic Regression Classifier
            from sklearn import linear_model
            self.clf = linear_model.LogisticRegression()
            self.clf_default_params = {'penalty': 'l1'}
            self.cv_param_grid = {
                'penalty': ['l1', 'l2'],
                'C': 2**np.linspace(-3, 5, 17),
                'n_jobs': [3]
            }
        else:
            print("Please Set The Model")

    def simple_oversample_idx(self, y):
        """Simple oversample to equalize the two groups.

        Parameters
        ----------
        y: an array of the true target variable values.

        """

        y_idx_0 = np.where(y == 0)[0]
        y_idx_1 = np.random.choice(np.where(y == 1)[0],
                                   size=y_idx_0.shape[0],
                                   replace=True)
        ret_cust_idx = []
        ret_cust_idx.extend(y_idx_0)
        ret_cust_idx.extend(y_idx_1)
        return (ret_cust_idx)

    def bads_costs(self, y_t, yhat):
        """Return the profit per customer.

        This function calculates the profit per customer based on the matrix given 
        to us in the assignment.

        Parameters
        ----------
        y_t: an array of true target variable values.

        yhat: an array of binary predictions from our model.

        """
        N = yhat.shape[0]
        C = confusion_matrix(y_t, yhat)
        return (np.multiply(C, self.cm).sum() / N)

    def bads_scorer(self, y_t, yhat_prob):
        """Return the maximum profit per customer

        This function does a simple line search using the assignment cost/profit 
        function.  For each threshold, we create a vector of binary predictions 
        and then we calculate the profit per customer using these binary predictions.  
        The maximum value is returned. 
        
        Note: All threshold levels at which this maximum occurred are saved to 
              the object.

        Parameters
        ----------
        y_t: an array of true target variable values.

        yhat_prob: an array of probability predictions from our model.

        """
        thresholds = np.linspace(0.01, 0.99)
        costs = [
            self.bads_costs(y_t, yhat_prob[:, 1] > threshold)
            for threshold in thresholds
        ]
        self.thresholds.append(thresholds[np.argmax(costs)])
        return (np.max(costs))

    def find_corr_features(self, df, threshold=0.7):
        """Return list of column names.

        This function calculates the simple correlation matrix between features 
        and based on a given threshold (default: abs(0.7)) removes the feature 
        that comes later on in the feature list.  This will prioritize original 
        features over features we have created. 

        Parameters
        ----------
        df: a pandas dataframe (either the train or test set)

        threshold: a scalar value above whose absolute value features will be 
                   considered "highly correllated".

        """
        cols = df.columns.values.tolist()
        corr_mat = df.corr()
        corr_items = np.where(np.abs(np.triu(corr_mat, k=1)) > threshold)
        cols_removed = []
        for corr_item in list(
                set([cols[max(item)] for item in zip(*corr_items)])):
            cols_removed.append(corr_item)
            cols.remove(corr_item)
        print("Removing Columns:", ", ".join(cols_removed))
        return (cols)

    def loadDataset(self, df, date_to_int=True, use_woe=True):
        """Return pandas dataframe.

        The purpose of this function is to put our dataframe into a form that is 
        as close as possible to our R dataframe that we've used in our data 
        processing steps.
        
        Additionally, we have calculated dates, added an "is_weekday" dummy variable 
        and chosen to use or not use the Weight of Evidence variables.

        Parameters
        ----------
        df: a pandas dataframe (either the train or test set)

        date_to_int: a logical value to decide whether to convert dates to integers 
                     based on the epoch date of January 1st, 2013.

        use_woe: a logical value whether to use Weight of Evidence converted 
                 variables or use the original variables as k-1 dummies.

        """

        # remove NA
        df.fillna(-99, inplace=True)
        # Convert Dates
        df.order_date = pd.to_datetime(df.order_date, format='%Y-%m-%d')
        df.account_creation_date = pd.to_datetime(df.account_creation_date,
                                                  format='%Y-%m-%d')
        df.deliverydate_estimated = pd.to_datetime(df.deliverydate_estimated,
                                                   format='%Y-%m-%d')
        df.deliverydate_actual = pd.to_datetime(df.deliverydate_actual,
                                                format='%Y-%m-%d')
        # Create weekday dummy for order_date
        df['x_order_date_is_weekday'] = df.order_date.dt.dayofweek < 5
        if date_to_int:
            epoch_date = pd.Timestamp("2013-01-01")
            df.order_date = (df.order_date -
                             epoch_date).astype('timedelta64[D]').astype(int)
            df.account_creation_date = (
                df.account_creation_date -
                epoch_date).astype('timedelta64[D]').astype(int)
            df.deliverydate_estimated = (
                df.deliverydate_estimated -
                epoch_date).astype('timedelta64[D]').astype(int)
            df.deliverydate_actual = (
                df.deliverydate_actual -
                epoch_date).astype('timedelta64[D]').astype(int)
        # Convert Categories (factors in R lingo)
        cols_to_categorize = [
            "model", "form_of_address", "email_domain", "postcode_invoice",
            "postcode_delivery", "payment", "advertising_code",
            "x_order_date_yearweek"
        ]
        # Categorize _bin columns
        cols = df.columns
        cols_to_categorize.extend(
            cols[cols.str.contains("_bin")].values.tolist())
        for col_to_cat in cols_to_categorize:
            #print(col_to_cat)
            if (col_to_cat in df.columns.values):
                df[col_to_cat] = df[col_to_cat].astype('category')

        return (df)

    def create_datasets(self,
                        use_woe=False,
                        fp_train="output/train_cleaned_woe.csv",
                        fp_test="output/test_cleaned_woe.csv"):
        """Load datasets.

        This is a convenience function that loads both the training and testing 
        datasets and implements any feature selection that we've decided to use.  
        Additionally, we impose the column structure of the train set on the test 
        set.  Implicitly, this adds and removes appropriate "factor levels" and 
        gives any added factor level a default of 0.

        Parameters
        ----------
        use_woe: a logical value whether to use Weight of Evidence converted 
                 variables or use the original variables as k-1 dummies.

        fp_train: a string of the train set CSV file

        fp_test: a string of the test set CSV file

        """

        train = pd.read_csv(fp_train, sep=";", decimal=',', index_col="ID")
        train = self.loadDataset(train)
        # Create Feature List
        features_to_use = train.columns.values.tolist()
        features_to_use.remove("return_customer")
        cols_woe_removal = [col for col in features_to_use if "x_woe_" in col]
        if use_woe:
            cols_woe_removal = [
                col.replace("x_woe_", "") for col in cols_woe_removal
            ]
        self.manual_features_to_remove.extend(cols_woe_removal)

        for ftr in self.manual_features_to_remove:
            if ftr in features_to_use:
                features_to_use.remove(ftr)
            elif "x_" + ftr in features_to_use:
                features_to_use.remove("x_" + ftr)
        # remove dates if not converted to ints
        for date_feature, v in train.dtypes.items():
            if v == "datetime64[ns]":
                features_to_use.remove(date_feature)
        train = train[features_to_use + ["return_customer"]]
        # Visualize Correlation before splitting out dummy variables
        if self.feature_correlation_removal:
            sns.heatmap(train.drop("return_customer", axis=1).corr())
            plt.show()
        # Split out dummy variables
        train = pd.get_dummies(train)
        # feature Correlation Removal
        if self.feature_correlation_removal:
            print("Removing correlated features...")
            noncorr_cols = self.find_corr_features(
                train.drop("return_customer", axis=1),
                self.feature_correlation_threshold)
            train = train[noncorr_cols + ["return_customer"]]
        # set train datasets
        self.X_train, self.y_train = train.drop(
            "return_customer", axis=1).values, train["return_customer"].values
        self.column_names = train.columns

        test = pd.read_csv(fp_test, sep=";", decimal=',', index_col="ID")
        test = self.loadDataset(test)
        test = pd.get_dummies(test)
        # The following line gives the test set the same columns as the training set.
        # This simultaneously adds columns to the test set and sets the values in those columns to 0 and
        # drops any columns in the test set that did not exist in the training set.
        print("Imposing train column structure on test...")
        test = test.reindex(columns=self.column_names, fill_value=0)
        test.drop("return_customer", axis=1, inplace=True)
        # set test dataset
        self.X_test = test.values
        self.X_train_cv, self.X_valid_cv, self.y_train_cv, self.y_valid_cv = train_test_split(
            self.X_train,
            self.y_train,
            test_size=self.cv_validation_frac,
            stratify=self.y_train,
            random_state=self.rs)

    def oversample(self):
        """Oversample datasets.

        Simple: This just normalizes the number of data points to make the two 
                classes equal sizes.  Samples are duplicated at random with 
                replacement.
                
        SMOTE: SMOTE oversampling on the minority class to an equal weight as the 
               majority class.
        
        SMOTE+Tomek: This option oversamples the minority class and then removes 
                     data points which are determined to be Tomek links.
        """
        if self.oversample_method == "simple":
            # oversampling with replacement of the minority group to equalize the size of the minority and
            # majority group
            print("Simple oversampling...")
            # Create the Hyper-Parameter Cross-Validation train and test sets
            ret_cust_idx_cv = self.simple_oversample_idx(self.y_train_cv)
            self.X_train_cv, self.y_train_cv = self.X_train_cv[
                ret_cust_idx_cv, :], self.y_train_cv[ret_cust_idx_cv]
            # Create the full train and test sets
            ret_cust_idx = self.simple_oversample_idx(self.y_train)
            self.X_train, self.y_train = self.X_train[
                ret_cust_idx, :], self.y_train[ret_cust_idx]
        elif self.oversample_method == "SMOTE":
            # https://www.jair.org/media/953/live-953-2037-jair.pdf
            from imblearn.over_sampling import SMOTE

            print("SMOTE oversampling...")
            sm = SMOTE(random_state=self.rs)
            # Create the Hyper-Parameter Cross-Validation train and test sets
            self.X_train_cv, self.y_train_cv = sm.fit_resample(
                self.X_train_cv, self.y_train_cv)
            # Create the full train and test sets
            self.X_train, self.y_train = sm.fit_resample(
                self.X_train, self.y_train)
        elif self.oversample_method == "SMOTETomek":
            from imblearn.combine import SMOTETomek

            print("SMOTE + Tomek Links oversampling...")
            sm = SMOTETomek(random_state=self.rs)
            # Create the Hyper-Parameter Cross-Validation train and test sets
            self.X_train_cv, self.y_train_cv = sm.fit_resample(
                self.X_train_cv, self.y_train_cv)
            # Create the full train and test sets
            self.X_train, self.y_train = sm.fit_resample(
                self.X_train, self.y_train)
        else:
            print("No oversampling...")

    def automagic_feature_selection(self):
        """Prune data sets based on algorithmic feature selection.

        We use a particular threshold to keep certain columns based on the 
        "feature importances" of tree-based classifiers (i.e. random forest or 
        gradient boosted trees)
        
        """
        if self.automatic_feature_selection:
            print("Starting automatic feature selection...")
            # this takes about 10 minutes to run
            self.clf.set_params(**self.automatic_feature_selection_params)
            self.clf.fit(self.X_train, self.y_train)
            important_features = np.where(
                self.clf.feature_importances_ >
                self.automatic_feature_threshold)[0].tolist()
            important_features_labels = self.column_names[important_features]
            print("High Importance Features:",
                  ", ".join(important_features_labels.tolist()))
            np.savetxt("output/optimal_features.csv",
                       important_features_labels.values,
                       fmt="%s",
                       delimiter=";")

            self.X_train = self.X_train[:, important_features]
            self.X_test = self.X_test[:, important_features]
            self.X_train_cv = self.X_train_cv[:, important_features]
            self.X_valid_cv = self.X_valid_cv[:, important_features]

        else:
            print("No automatic feature selection...")

    def run_model(self, fp_output="output/test_return_customer.csv"):
        """Do hyperparameter search, if desired, and then make prediction on test set.

        We do our hyper parameter search and make our prediction on the test set.  
        At this time, we print out diagnostics and results throughout the process.  
        
        """
        self.thresholds = []
        if self.do_cv:
            # this can take a LONG time
            print("Searching for best parameters with CV search...")
            self.clf_cv = RandomizedSearchCV(self.clf,
                                             self.cv_param_grid,
                                             scoring=make_scorer(
                                                 self.score_func,
                                                 needs_proba=True),
                                             cv=self.cv_num_folds,
                                             n_iter=self.cv_rs_iters,
                                             random_state=self.rs,
                                             verbose=1)
            self.clf_cv.fit(self.X_train_cv, self.y_train_cv)
            #clf_rf_cv.cv_results_
            joblib.dump(self.clf_cv.cv_results_,
                        'output/clf_rf_cv.results.pkl')
            print("Cross Valdiation Report:")
            print("Best Params:", self.clf_cv.best_params_)
            print("Best Score:", self.clf_cv.best_score_)
            # Plot Expected ROI per Customer
            plt.errorbar(range(self.cv_rs_iters),
                         self.clf_cv.cv_results_["mean_test_score"],
                         yerr=self.clf_cv.cv_results_["std_test_score"],
                         fmt="o")
            plt.title("Errorbar Plot of Hyper Parameter Search")
            plt.ylabel("Average ROI")
            plt.xlabel("Iteration (See Table Below)")
            plt.margins(0.03)
            plt.show()

            print(pd.DataFrame(list(self.clf_cv.cv_results_['params'])))

            # Train and Validate a random forest classifier with the best parameters
            yhat_valid_prob = self.clf_cv.predict_proba(self.X_valid_cv)

            params_star = self.clf_cv.best_params_
            self.clf.set_params(**params_star)
        else:
            self.clf.set_params(**self.clf_default_params)
            self.clf.fit(self.X_train_cv, self.y_train_cv)
            yhat_valid_prob = self.clf.predict_proba(self.X_valid_cv)

        print("Validation Summary:")
        print("Calculate Optimal Threshold")
        thresholds = np.linspace(0.01, 0.99, 197)
        costs = [
            self.bads_costs(self.y_valid_cv, yhat_valid_prob[:, 1] > threshold)
            for threshold in thresholds
        ]
        threshold_star = thresholds[np.argmax(costs)]
        # Plot
        plt.plot(thresholds, costs)
        plt.title("Threshold Search")
        plt.ylabel("Average ROI")
        plt.xlabel("Threshold of return_customer = 1")
        plt.show()
        print("Threshold:", threshold_star)
        yhat_valid = yhat_valid_prob[:, 1] > threshold_star
        print("Average ROI:", self.cost_func(self.y_valid_cv, yhat_valid))
        print("ROC Score:",
              roc_auc_score(self.y_valid_cv, yhat_valid_prob[:, 1]))
        print("Validation Return Customers: {} of {} ({}%)".format(
            np.sum(yhat_valid), len(yhat_valid),
            np.round(100 * np.sum(yhat_valid) / len(yhat_valid), 2)))
        print(confusion_matrix(self.y_valid_cv, yhat_valid))
        # Train model with all data and use on the Test set
        self.clf.fit(self.X_train, self.y_train)
        yhat_test_proba = self.clf.predict_proba(self.X_test)
        yhat_test = yhat_test_proba[:, 1] > threshold_star
        preds = pd.DataFrame(np.c_[np.arange(51885, 51885 +
                                             yhat_test.shape[0]),
                                   yhat_test.astype(int)],
                             columns=["ID", "return_customer"])
        preds.to_csv(fp_output, index=False)
        preds_probs = pd.DataFrame(np.c_[np.arange(51885, 51885 +
                                                   yhat_test.shape[0]),
                                         yhat_test_proba[:, 1]],
                                   columns=["ID", "return_customer"])
        preds_probs.to_csv(fp_output.split(".")[0] + "_probs.csv", index=False)
        print("Testing Return Customers: {} of {} ({}%)".format(
            np.sum(yhat_test), len(yhat_test),
            np.round(100 * np.sum(yhat_test) / len(yhat_test), 2)))
        self.yhat = yhat_test
        if self.save_model:
            joblib.dump(self.clf, 'output/model_final.pkl')
            #clf_rf = joblib.load('output/model_final.pkl')

    def pca_analysis(self, X, y, num_PC=5, recalc_PC=True):
        """Visualize data with predictions with Principal Component Analysis.

        Due to the high dimensionality of our data, we found it hard to conceptualize
        without reducing the dimensionality.  We do a principal component analysis
        and create a scatter matrix of the results.  Importantly, we allow the PCA
        to be fitted with the training data and then applied to the testing data,
        so we will see the two data sets rotated in the same manner.  A minimal
        standalone sketch of this fit-on-train / transform-on-test pattern follows
        the class.
        
        Parameters
        ----------
        X: numpy array of train or test data

        y: numpy vector of true target values or model predictions

        num_PC: number of principal components to use

        recalc_PC: a logical value
            used to decide whether to recalculate the principal components, including
            the rotation vectors.  If False, the data is rotated using the previously
            calculated principal components.

        """
        # PCA Analysis of Results
        from sklearn.decomposition import PCA
        from sklearn.preprocessing import scale
        train_scaled = scale(X)

        if recalc_PC:
            self.pca = PCA(n_components=num_PC)
            self.pca.fit(train_scaled)
        else:
            print("Using previous eigenvectors to rotate data...")
        train_rotated = self.pca.transform(train_scaled)
        df_train = pd.DataFrame(train_rotated)
        df_train["colors"] = [
            "returning" if y_i else "non-returning" for y_i in y
        ]
        sns.pairplot(df_train,
                     hue="colors",
                     diag_kind="kde",
                     vars=range(num_PC))
        plt.show()
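The fit-on-train / transform-on-test pattern implemented by pca_analysis, as a minimal standalone sketch; the demo arrays and shapes here are illustrative, not taken from the class above:

# Sketch: eigenvectors come from the training data only; the same rotation
# is then applied to the test data.
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

rng = np.random.RandomState(0)
X_train_demo = rng.normal(size=(200, 10))  # stand-in for the real train matrix
X_test_demo = rng.normal(size=(50, 10))    # stand-in for the real test matrix

pca = PCA(n_components=5)
pca.fit(scale(X_train_demo))                       # fit on train only
train_rotated = pca.transform(scale(X_train_demo))
test_rotated = pca.transform(scale(X_test_demo))   # same rotation on test
print(train_rotated.shape, test_rotated.shape)     # (200, 5) (50, 5)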
Example #24
0
gc.collect()

param = {  # init the hyperparams of GBDT
    'learning_rate': 0.2,
    'n_estimators': 100,  # number of trees here
    'max_depth': 8,  # set max_depth of a tree
    'min_samples_split': 20,
    'min_samples_leaf': 10,
    'subsample': 0.01,
    'max_leaf_nodes': None,  # set max leaf nodes of a tree
    'random_state': 1,
    'verbose': 0
}

gbdt_model = GradientBoostingClassifier()
gbdt_model.set_params(**param)

## fitting
gbdt_model.fit(X_train_gbdt, y_train_gbdt)

## log-loss of training
y_pred_gbdt = gbdt_model.predict_proba(X_train_gbdt)[:, 1]
log_loss_gbdt = log_loss(y_train_gbdt, y_pred_gbdt)
print('log loss of GBDT on train set: %.5f' % log_loss_gbdt)

y_pred_gbdt = gbdt_model.predict_proba(X_valid)[:, 1]
log_loss_gbdt = log_loss(y_valid, y_pred_gbdt)
print('log loss of GBDT on valid set: %.5f' % log_loss_gbdt)

## store the pre-trained gbdt_model
pickle.dump(gbdt_model, open(fp_gbdt_model, 'wb'))
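For completeness, a hedged sketch of reloading the dumped model later; fp_gbdt_model, X_valid and y_valid are assumed to be the same objects as above:

# Sketch: restore the pre-trained GBDT from disk and reuse it.
import pickle
from sklearn.metrics import log_loss

with open(fp_gbdt_model, 'rb') as f:
    gbdt_restored = pickle.load(f)
y_pred_restored = gbdt_restored.predict_proba(X_valid)[:, 1]
print('log loss of restored GBDT on valid set: %.5f' %
      log_loss(y_valid, y_pred_restored))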
Example #25
0
train_data = data.loc[rindex, :]
test_data = data.drop(rindex)
train_label = train_data.target
train_data = train_data.drop(['target', 'id'], axis=1)
test_label = test_data.target
test_data = test_data.drop(['target', 'id'], axis=1)

gb_otto = GradientBoostingClassifier(n_estimators=100, verbose=1, warm_start=1)
gb_otto.fit(train_data, train_label)
test_prob = gb_otto.predict_proba(test_data)
train_prob = gb_otto.predict_proba(train_data)
print('The logloss score of test data:', logloss(test_label, test_prob))
print('The logloss score of train data:', logloss(train_label, train_prob))

gb_otto.set_params(n_estimators=150)
gb_otto.fit(train_data, train_label)
test_prob = gb_otto.predict_proba(test_data)
train_prob = gb_otto.predict_proba(train_data)
print('The logloss score of test data:', logloss(test_label, test_prob))
print('The logloss score of train data:', logloss(train_label, train_prob))

gb_otto.set_params(n_estimators=200)
gb_otto.fit(train_data, train_label)
test_prob = gb_otto.predict_proba(test_data)
train_prob = gb_otto.predict_proba(train_data)
print('The logloss score of test data:', logloss(test_label, test_prob))
print('The logloss score of train data:', logloss(train_label, train_prob))

gb_otto.set_params(n_estimators=250)
gb_otto.fit(train_data, train_label)
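The staged refits above can be expressed as a loop; a hedged sketch under the same assumptions (a freshly constructed gb_otto with warm_start enabled, and the logloss helper and data splits defined in this snippet):

# Sketch: the same warm-started staging as a loop over ensemble sizes.
gb_otto = GradientBoostingClassifier(n_estimators=100, verbose=1, warm_start=True)
for n in (100, 150, 200, 250):
    gb_otto.set_params(n_estimators=n)
    gb_otto.fit(train_data, train_label)  # warm start: only the new trees are fitted
    print('n_estimators =', n)
    print('The logloss score of test data:',
          logloss(test_label, gb_otto.predict_proba(test_data)))
    print('The logloss score of train data:',
          logloss(train_label, gb_otto.predict_proba(train_data)))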
Example #26
0
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If exp_rand_tree_size=True,
                        this will be the mean number of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used to produce each tree.
                        FP 2004 (Sec. 2)
        max_rules:      approximate total number of rules generated for fitting. Note that actual
                        number of rules will usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new tree when
                        sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary classification.
        lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
                        by multiplying the winsorised variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear
                        terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different maximum number of
                        terminal nodes based on an exponential distribution about tree_size.
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both rules and linear terms
        random_state:   Integer to initialise random objects and provide repeatability.
        tree_generator: Optional: this object will be used as provided to generate the rules.
                        This will override almost all the other properties above.
                        Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)
        tol:            The tolerance for the optimization for LassoCV or LogisticRegressionCV:
                        if the updates are smaller than `tol`, the optimization code checks the dual
                        gap for optimality and continues until it is smaller than `tol`.
        max_iter:       The maximum number of iterations for LassoCV or LogisticRegressionCV.
        n_jobs:         Number of CPUs to use during the cross validation in LassoCV or
                        LogisticRegressionCV. None means 1 unless in a joblib.parallel_backend
                        context. -1 means using all processors.

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """
    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 tol=0.0001,
                 max_iter=None,
                 n_jobs=None,
                 random_state=None):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.winsorizer = Winsorizer(trim_quantile=lin_trim_quantile)
        self.friedscale = FriedScale(self.winsorizer)
        self.stddev = None
        self.mean = None
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.tol = tol
        # LassoCV's default max_iter is 1000, LogisticRegressionCV's is 100.
        self.max_iter = max_iter if max_iter is not None else (
            1000 if self.rfmode == 'regress' else 100)
        self.n_jobs = n_jobs
        self.Cs = Cs

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names
        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:  # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:  # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = np.asarray([
                    2 + np.floor(tree_sizes[i_])
                    for i_ in np.arange(len(tree_sizes))
                ],
                                        dtype=int)
                i = int(len(tree_sizes) / 4)
                while np.sum(tree_sizes[0:i]) < self.max_rules:
                    i = i + 1
                tree_sizes = tree_sizes[0:i]
                self.tree_generator.set_params(warm_start=True)
                curr_est_ = 0
                for i_size in np.arange(len(tree_sizes)):
                    size = tree_sizes[i_size]
                    self.tree_generator.set_params(n_estimators=curr_est_ + 1)
                    self.tree_generator.set_params(max_leaf_nodes=size)
                random_state_add = self.random_state if self.random_state else 0
                self.tree_generator.set_params(
                    random_state=i_size + random_state_add
                )  # warm_start=True seems to reset random_state, so that the trees are highly correlated unless we manually change the random_state here.
                    self.tree_generator.fit(np.copy(X, order='C'),
                                            np.copy(y, order='C'))
                    curr_est_ = curr_est_ + 1
                self.tree_generator.set_params(warm_start=False)
            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          RandomForestRegressor) or isinstance(
                              self.tree_generator, RandomForestClassifier):
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(X)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:

            ## standard deviation and mean of winsorized features
            self.winsorizer.train(X)
            winsorized_X = self.winsorizer.trim(X)
            self.stddev = np.std(winsorized_X, axis=0)
            self.mean = np.mean(winsorized_X, axis=0)

            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                alphas = 1. / self.Cs
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                max_iter=self.max_iter,
                                tol=self.tol,
                                n_jobs=self.n_jobs,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             max_iter=self.max_iter,
                                             tol=self.tol,
                                             n_jobs=self.n_jobs,
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def predict(self, X):
        """Predict outcome for X

        """
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            rule_coefs = self.coef_[-len(self.rule_ensemble.rules):]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def predict_proba(self, X):
        """Predict outcome probability for X, if model type supports probability prediction method

        """

        if 'predict_proba' not in dir(self.lscv):

            error_message = '''
            Probability prediction using predict_proba not available for
            model type {lscv}
            '''.format(lscv=self.lscv)
            raise ValueError(error_message)

        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            rule_coefs = self.coef_[-len(self.rule_ensemble.rules):]
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(X, coefs=rule_coefs)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict_proba(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False, subregion=None):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero.

        subregion: If None (default) returns global importances (FP 2004 eq. 28/29), else returns importance over
                           subregion of inputs (FP 2004 eq. 30/31/32).

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule, 'coef' holds
               the coefficients and 'support' the support of the rule in the training
               data set (X)
        """

        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            if subregion is None:
                importance = abs(coef) * self.stddev[i]
            else:
                subregion = np.array(subregion)
                importance = sum(
                    abs(coef) *
                    abs([x[i] for x in self.winsorizer.trim(subregion)] -
                        self.mean[i])) / len(subregion)
            output_rules += [(self.feature_names[i], 'linear', coef, 1,
                              importance)]

        ## Add rules
        for i in range(0, len(self.rule_ensemble.rules)):
            rule = rule_ensemble[i]
            coef = self.coef_[i + n_features]

            if subregion is None:
                importance = abs(coef) * (rule.support *
                                          (1 - rule.support))**(1 / 2)
            else:
                rkx = rule.transform(subregion)
                importance = sum(
                    abs(coef) * abs(rkx - rule.support)) / len(subregion)

            output_rules += [(rule.__str__(), 'rule', coef, rule.support,
                              importance)]
        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support", "importance"])
        if exclude_zero_coef:
            rules = rules.loc[rules.coef != 0]
        return rules
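A hedged usage sketch of the RuleFit class above, on synthetic data; it assumes the supporting classes from the same module (RuleEnsemble, Winsorizer, FriedScale) are available, and the demo arrays and feature names are illustrative:

# Sketch: fit RuleFit on a small synthetic regression problem and inspect the rules.
import numpy as np

rng = np.random.RandomState(42)
X_demo = rng.normal(size=(300, 5))
y_demo = 2.0 * X_demo[:, 0] + 3.0 * (X_demo[:, 1] > 0) + rng.normal(size=300)

rf = RuleFit(rfmode='regress', max_rules=200, random_state=42)
rf.fit(X_demo, y_demo, feature_names=['f0', 'f1', 'f2', 'f3', 'f4'])
rules = rf.get_rules(exclude_zero_coef=True)
print(rules.sort_values('importance', ascending=False).head(10))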
#----- parameter tuning of GBDT -----#

param = {  # init the hyperparams of GBDT
    'learning_rate': 0.2,
    'n_estimators': 100,  # number of trees here
    'max_depth': 8,  # set max_depth of a tree
    'min_samples_split': 20, 
    'min_samples_leaf': 10,
    'subsample': 0.01, 
    'max_leaf_nodes': None,  # set max leaf nodes of a tree
    'random_state': 1,
    'verbose': 0
    }

gbdt_model = GradientBoostingClassifier()
gbdt_model.set_params(**param)

'''
#----- parameter tuning of GBDT -----#
### n_estimators
log_loss_train = []
log_loss_valid = []
n_estimators = [10,20,30,40,50,60,70,80,90,100,120,140]
for nt in n_estimators:
    print('training: n_estimators = ', nt)
    
    param['n_estimators'] = nt
    gbdt_model.set_params(**param)
    gbdt_model.fit(X_train_gbdt, y_train_gbdt)

    # scores
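The commented-out sweep above is cut off mid-loop; a hedged reconstruction of how such a sweep typically finishes (the score bookkeeping after the fit is an assumption, not recovered source; X_train_gbdt, y_train_gbdt, X_valid, y_valid and log_loss are as used earlier):

# Sketch: full n_estimators sweep with train/valid log-loss bookkeeping.
log_loss_train, log_loss_valid = [], []
for nt in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 120, 140]:
    param['n_estimators'] = nt
    gbdt_model.set_params(**param)
    gbdt_model.fit(X_train_gbdt, y_train_gbdt)
    log_loss_train.append(
        log_loss(y_train_gbdt, gbdt_model.predict_proba(X_train_gbdt)[:, 1]))
    log_loss_valid.append(
        log_loss(y_valid, gbdt_model.predict_proba(X_valid)[:, 1]))
print('best valid log loss:', min(log_loss_valid),
      'at sweep index', log_loss_valid.index(min(log_loss_valid)))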
Example #28
0
    return gsearch.best_params_


if __name__ == '__main__':
    train, test = load_data()
    gbdt_model = GradientBoostingClassifier(learning_rate=0.1,
                                            n_estimators=100,
                                            subsample=1.0,
                                            min_samples_split=2,
                                            min_samples_leaf=1,
                                            max_depth=3,
                                            max_features='sqrt')
    model_fit(train, {'n_estimators': 100})

    param_search = {'n_estimators': range(50, 220, 30)}
    param_find = grid_search(gbdt_model, param_search, train)

    gbdt_model.set_params(**param_find)
    param_search = {
        'max_depth': range(3, 14, 2),
        'min_samples_split': range(2, 301, 50)
    }
    param_find = grid_search(gbdt_model, param_search, train)

    gbdt_model.set_params(**param_find)
    param_search = {'min_samples_leaf': range(1, 101, 20)}
    param_find = grid_search(gbdt_model, param_search, train)

    gbdt_model.set_params(**param_find)
    gbdt_model = model_fit(train, gbdt_model.get_params())
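load_data, model_fit and grid_search are not shown in this snippet; a hypothetical grid_search compatible with the calls above might look like the following (the 'label' column name and the scoring choice are assumptions):

# Hypothetical helper, consistent with how grid_search is called above.
from sklearn.model_selection import GridSearchCV

def grid_search(model, param_search, train, label_col='label'):
    X = train.drop(label_col, axis=1)  # 'label' column name is an assumption
    y = train[label_col]
    gsearch = GridSearchCV(model, param_search, scoring='roc_auc', cv=5, n_jobs=-1)
    gsearch.fit(X, y)
    print(gsearch.best_params_, gsearch.best_score_)
    return gsearch.best_params_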
Example #29
0
from sklearn.ensemble import GradientBoostingClassifier
gbmodel = GradientBoostingClassifier(random_state=0)
from sklearn.model_selection import GridSearchCV
param_grid = {
    'max_depth': [2, 3, 4],
    'subsample': [0.8],
    'n_estimators': [100],
    'learning_rate': [0.2]
}
CV_gbmodel = GridSearchCV(estimator=gbmodel, param_grid=param_grid, cv=cross_validation_number_of_folds)
CV_gbmodel.fit(X_train, Y_train)
print("\n \n \n Gradient Boosting Classifier  \n")
print(CV_gbmodel.best_params_)
#use the best parameters
gbmodel = gbmodel.set_params(**CV_gbmodel.best_params_)
gbmodel.fit(X_train, Y_train)
Y_test_pred = gbmodel.predict(X_test)
accte = accuracy_score(Y_test, Y_test_pred)
report_x.loc[len(report_x)] = ['Gradient Boosting (grid)',
                          CV_gbmodel.cv_results_['mean_test_score'][CV_gbmodel.best_index_],
                          CV_gbmodel.cv_results_['std_test_score'][CV_gbmodel.best_index_], accte]
print(report_x.loc[len(report_x)-1])

print(CV_gbmodel.cv_results_['mean_test_score'])

plt.plot(range(int(param_grid['max_depth'][0]), int(param_grid['max_depth'][-1]) + 1), CV_gbmodel.cv_results_['mean_test_score'])
plt.xlim(int(param_grid['max_depth'][0]), int(param_grid['max_depth'][-1]))
plt.xticks(range(int(param_grid['max_depth'][0]), int(param_grid['max_depth'][-1]) + 1))
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
class RuleFitCustom(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_size:      Number of terminal nodes in generated trees. If exp_rand_tree_size=True, 
                        this will be the mean number of terminal nodes.
        sample_fract:   fraction of randomly chosen training observations used to produce each tree. 
                        FP 2004 (Sec. 2)
        max_rules:      approximate total number of rules generated for fitting. Note that actual
                        number of rules will usually be lower than this due to duplicates.
        memory_par:     scale multiplier (shrinkage factor) applied to each new tree when 
                        sequentially induced. FP 2004 (Sec. 2)
        rfmode:         'regress' for regression or 'classify' for binary classification.
        lin_standardise: If True, the linear terms will be standardised as per Friedman Sec 3.2
                        by multiplying the winsorised variable by 0.4/stdev.
        lin_trim_quantile: If lin_standardise is True, this quantile will be used to trim linear 
                        terms before standardisation.
        exp_rand_tree_size: If True, each boosted tree will have a different maximum number of 
                        terminal nodes based on an exponential distribution about tree_size. 
                        (Friedman Sec 3.3)
        model_type:     'r': rules only; 'l': linear terms only; 'rl': both rules and linear terms
        random_state:   Integer to initialise random objects and provide repeatability.
        tree_generator: Optional: this object will be used as provided to generate the rules. 
                        This will override almost all the other properties above. 
                        Must be GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """
    def __init__(self,
                 tree_size=4,
                 sample_fract='default',
                 max_rules=2000,
                 memory_par=0.01,
                 tree_generator=None,
                 rfmode='regress',
                 lin_trim_quantile=0.025,
                 lin_standardise=True,
                 exp_rand_tree_size=True,
                 model_type='rl',
                 Cs=None,
                 cv=3,
                 random_state=None,
                 simple_rules=False):
        self.tree_generator = tree_generator
        self.rfmode = rfmode
        self.lin_trim_quantile = lin_trim_quantile
        self.lin_standardise = lin_standardise
        self.friedscale = FriedScale(trim_quantile=lin_trim_quantile)
        self.exp_rand_tree_size = exp_rand_tree_size
        self.max_rules = max_rules
        self.sample_fract = sample_fract
        self.memory_par = memory_par
        self.tree_size = tree_size
        self.random_state = random_state
        self.model_type = model_type
        self.cv = cv
        self.Cs = Cs
        self.simple_rules = simple_rules  # TODO: make this a global parameter; we want to be able to set it in fit so as to keep the same nodes, which makes comparisons easier

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        ## Enumerate features if feature names not provided
        N = X.shape[0]
        if feature_names is None:
            self.feature_names = [
                'feature_' + str(x) for x in range(0, X.shape[1])
            ]
        else:
            self.feature_names = feature_names
        if 'r' in self.model_type:
            ## initialise tree generator
            if self.tree_generator is None:
                n_estimators_default = int(
                    np.ceil(self.max_rules / self.tree_size))
                self.sample_fract_ = min(0.5, (100 + 6 * np.sqrt(N)) / N)
                if self.rfmode == 'regress':
                    self.tree_generator = GradientBoostingRegressor(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)
                else:
                    self.tree_generator = GradientBoostingClassifier(
                        n_estimators=n_estimators_default,
                        max_leaf_nodes=self.tree_size,
                        learning_rate=self.memory_par,
                        subsample=self.sample_fract_,
                        random_state=self.random_state,
                        max_depth=100)

            if self.rfmode == 'regress':
                if type(self.tree_generator) not in [
                        GradientBoostingRegressor, RandomForestRegressor
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingRegressor"
                    )
            else:
                if type(self.tree_generator) not in [
                        GradientBoostingClassifier, RandomForestClassifier
                ]:
                    raise ValueError(
                        "RuleFit only works with RandomForest and BoostingClassifier"
                    )

            ## fit tree generator
            if not self.exp_rand_tree_size:  # simply fit with constant tree size
                self.tree_generator.fit(X, y)
            else:  # randomise tree size as per Friedman 2005 Sec 3.3
                np.random.seed(self.random_state)
                tree_sizes = np.random.exponential(
                    scale=self.tree_size - 2,
                    size=int(np.ceil(self.max_rules * 2 / self.tree_size)))
                tree_sizes = np.asarray([
                    2 + np.floor(tree_sizes[i_])
                    for i_ in np.arange(len(tree_sizes))
                ],
                                        dtype=int)
                i = int(len(tree_sizes) / 4)
                while np.sum(tree_sizes[0:i]) < self.max_rules:
                    i = i + 1
                tree_sizes = tree_sizes[0:i]
                self.tree_generator.set_params(warm_start=True)
                curr_est_ = 0
                for i_size in np.arange(len(tree_sizes)):
                    size = tree_sizes[i_size]
                    self.tree_generator.set_params(n_estimators=curr_est_ + 1)
                    self.tree_generator.set_params(max_leaf_nodes=size)
                random_state_add = self.random_state if self.random_state else 0
                self.tree_generator.set_params(
                    random_state=i_size + random_state_add
                )  # warm_start=True seems to reset random_state, so that the trees are highly correlated unless we manually change the random_state here.
                    self.tree_generator.fit(np.copy(X, order='C'),
                                            np.copy(y, order='C'))
                    curr_est_ = curr_est_ + 1
                self.tree_generator.set_params(warm_start=False)
            tree_list = self.tree_generator.estimators_
            if isinstance(self.tree_generator,
                          RandomForestRegressor) or isinstance(
                              self.tree_generator, RandomForestClassifier):
                tree_list = [[x] for x in self.tree_generator.estimators_]

            ## extract rules
            self.rule_ensemble = RuleEnsemble(tree_list=tree_list,
                                              feature_names=self.feature_names)

            ## concatenate original features and rules
            X_rules = self.rule_ensemble.transform(
                X, weigh_rules=self.simple_rules)
            self.X_rules = X_rules

            #if self.simple_rules:
            #    for i in range(0, X_rules.shape[1]):
            #        X_rules[:, i] = X_rules[:, i]/len(self.rule_ensemble.rules[i].conditions)

        ## standardise linear variables if requested (for regression model only)
        if 'l' in self.model_type:
            if self.lin_standardise:
                self.friedscale.train(X)
                X_regn = self.friedscale.scale(X)
            else:
                X_regn = X.copy()

        ## Compile Training data
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            X_concat = np.concatenate((X_concat, X_regn), axis=1)
        if 'r' in self.model_type:
            if X_rules.shape[0] > 0:
                X_concat = np.concatenate((X_concat, X_rules), axis=1)
        self.X_concat = X_concat

        ## fit Lasso
        if self.rfmode == 'regress':
            if self.Cs is None:  # use defaults
                n_alphas = 100
                alphas = None
            elif hasattr(self.Cs, "__len__"):
                n_alphas = None
                alphas = 1. / self.Cs
            else:
                n_alphas = self.Cs
                alphas = None
            self.lscv = LassoCV(n_alphas=n_alphas,
                                alphas=alphas,
                                cv=self.cv,
                                random_state=self.random_state)
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_
            self.intercept_ = self.lscv.intercept_
        else:
            Cs = 10 if self.Cs is None else self.Cs
            self.lscv = LogisticRegressionCV(Cs=Cs,
                                             cv=self.cv,
                                             penalty='l1',
                                             random_state=self.random_state,
                                             solver='liblinear')
            self.lscv.fit(X_concat, y)
            self.coef_ = self.lscv.coef_[0]
            self.intercept_ = self.lscv.intercept_[0]

        return self

    def predict(self, X):
        """Predict outcome for X

        """
        X_concat = np.zeros([X.shape[0], 0])
        if 'l' in self.model_type:
            if self.lin_standardise:
                X_concat = np.concatenate((X_concat, self.friedscale.scale(X)),
                                          axis=1)
            else:
                X_concat = np.concatenate((X_concat, X), axis=1)
        if 'r' in self.model_type:
            rule_coefs = self.coef_[-len(
                self.rule_ensemble.rules
            ):]  # bug correction. upstreamed at https://github.com/christophM/rulefit/issues/23
            if len(rule_coefs) > 0:
                X_rules = self.rule_ensemble.transform(
                    X, coefs=rule_coefs, weigh_rules=self.simple_rules)
                if X_rules.shape[0] > 0:
                    X_concat = np.concatenate((X_concat, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=False):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True, returns only the rules with an estimated
                           coefficient not equal to zero.

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule, 'coef' holds
               the coefficients and 'support' the support of the rule in the training
               data set (X)
        """

        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            if self.lin_standardise:
                coef = self.coef_[i] * self.friedscale.scale_multipliers[i]
            else:
                coef = self.coef_[i]
            output_rules += [(self.feature_names[i], 'linear', coef, 1, 0)
                             ]  # TODO REMOVE, for debugging
        ## Add rules
        for i in range(0, len(self.rule_ensemble.rules)):
            rule = rule_ensemble[i]
            coef = self.coef_[i + n_features]
            output_rules += [(rule.__str__(), 'rule', coef, rule.support, i)
                             ]  # TODO REMOVE, for debugging
        rules = pd.DataFrame(
            output_rules,
            columns=["rule", "type", "coef", "support",
                     "rule_number"])  # TODO REMOVE, for debugging
        if exclude_zero_coef:
            rules = rules.loc[rules.coef != 0]
        return rules

    def rules_complexity(self):
        n_features = len(self.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)

        res = 0
        for i in range(0, len(self.rule_ensemble.rules)):
            rule = rule_ensemble[i]
            coef = self.coef_[i + n_features]
            if (coef != 0):
                res += len(rule.conditions)

        return res
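A hedged usage sketch of RuleFitCustom exercising the simple_rules flag and the rules_complexity helper; the demo data are illustrative, and the supporting classes (a RuleEnsemble with weigh_rules support, FriedScale) are assumed importable from the same module:

# Sketch: classify a synthetic problem and measure the complexity of active rules.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(300, 4))
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 0).astype(int)

rfc = RuleFitCustom(rfmode='classify', max_rules=100,
                    simple_rules=True, random_state=0)
rfc.fit(X_demo, y_demo)
print(rfc.get_rules(exclude_zero_coef=True).head())
print('total conditions in active rules:', rfc.rules_complexity())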
Example #31
0
    pickle.dump(feature_map, f, pickle.HIGHEST_PROTOCOL)
del feature_map

label_train = ds_train['click']
label_valid = ds_valid['click']
ds_train = ds_train.drop(['click'], axis=1).values
ds_valid = ds_valid.drop(['click'], axis=1).values

print('build gbdt model ...')
gbdt = GradientBoostingClassifier(loss='deviance',
                                  n_estimators=1000,
                                  learning_rate=0.1,
                                  max_depth=10,
                                  subsample=0.8,
                                  min_samples_split=2000,
                                  min_samples_leaf=1000,
                                  random_state=0,
                                  verbose=1,
                                  warm_start=True)

for i in range(200):
    gbdt.set_params(n_estimators=(i + 1) * 100)
    print('fit model ...', i)
    gbdt.fit(ds_train, label_train)

    print('predict...')
    proba = gbdt.predict_proba(ds_valid)
    print('valid score', log_loss(label_valid, proba))
    print('dump model to output')
    joblib.dump(gbdt, '/output/gbdt' + str(i) + '.pkl')
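A hedged variant of the loop above that stops growing the ensemble once the validation log loss stops improving; the patience logic is an addition, not from the source:

# Sketch: warm-started growth with early stopping on validation log loss.
best_loss, patience, bad_rounds = float('inf'), 3, 0
for i in range(200):
    gbdt.set_params(n_estimators=(i + 1) * 100)
    gbdt.fit(ds_train, label_train)
    loss = log_loss(label_valid, gbdt.predict_proba(ds_valid))
    print('round', i, 'valid score', loss)
    if loss < best_loss:
        best_loss, bad_rounds = loss, 0
    else:
        bad_rounds += 1
        if bad_rounds >= patience:  # stop once the score has worsened repeatedly
            break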
Example #32
0
class Model:
    def __init__(self, datainfo, timeinfo):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation. 
        '''
        # Just print some info from the datainfo variable
        print("The Budget for this data set is: %d seconds" %
              datainfo['time_budget'])

        print(
            "Loaded %d time features, %d numerical Features, %d categorical features and %d multi valued categorical variables"
            % (datainfo['loaded_feat_types'][0],
               datainfo['loaded_feat_types'][1],
               datainfo['loaded_feat_types'][2],
               datainfo['loaded_feat_types'][3]))
        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]
        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        #self.clf=svm.SVC()
        #self.clf = SGDClassifier(loss="hinge", penalty="l2")
        #self.clf = linear_model.SGDClassifier()
        self.clf = GradientBoostingClassifier(n_estimators=5,
                                              verbose=1,
                                              random_state=1,
                                              min_samples_split=10,
                                              warm_start=False)
        # Here you may have parameters and hyper-parameters

    def fit(self, F, y, datainfo, timeinfo):
        '''
        This function should train the model parameters.
        Here we do nothing in this example...
        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.
        Both inputs are numpy arrays.
        If fit is called multiple times on incremental data (train, test1, test2, etc.)
        you should warm-start your training from the pre-trained model. Past data will
        NOT be available for re-training.
        '''
        # get the raw categorical and categorical multivalued variables in case you want to process them, in this baseline we simply ignore them
        MV = F['MV']
        CAT = F['CAT']

        # only get numerical variables
        X = F['numerical']

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # get numerical variables, concatenate them with categorical variables
        # catnumeric_dataset=np.array(CAT)
        # X= np.concatenate((F['numerical'],catnumeric_dataset),axis=1).astype(np.float64).copy(order='C')

        # convert NaN to zeros
        X = data_converter.replace_missing(X)
        #print "This batch of data has: "
        self.num_train_samples = X.shape[0]
        if X.ndim > 1: self.num_feat = X.shape[1]
        #print("FIT: dim(X)= [{:d}, {:d}]").format(self.num_train_samples, self.num_feat)
        num_train_samples = y.shape[0]
        if y.ndim > 1: self.num_labels = y.shape[1]
        #print("FIT: dim(y)= [{:d}, {:d}]").format(num_train_samples, self.num_labels)
        # subsample the data for efficient processing
        removeperc = 0.9
        if removeperc > 0:
            rem_samples = int(num_train_samples * removeperc)
            skip = sorted(
                random.sample(range(num_train_samples),
                              num_train_samples - rem_samples))
            num_train_samples = num_train_samples - rem_samples

            X = X[skip, :]
            y = y[skip, :]
            self.num_train_samples = X.shape[0]

        if self.is_trained:
            _ = self.clf.set_params(n_estimators=self.clf.n_estimators + 1,
                                    warm_start=True)
            self.DataX = X
            self.DataY = y
        else:
            self.DataX = X
            self.DataY = y
        print("The whole available data is: ")
        print(
            ("Real-FIT: dim(X)= [{:d}, {:d}]").format(self.DataX.shape[0],
                                                      self.DataX.shape[1]))
        print(
            ("Real-FIT: dim(y)= [{:d}, {:d}]").format(self.DataY.shape[0],
                                                      self.num_labels))
        #print "fitting with ..."
        #print self.clf.n_estimators
        self.clf.fit(self.DataX, np.ravel(self.DataY))

        #print "Model fitted.."
        if (self.num_train_samples != num_train_samples):
            print("ARRGH: number of samples in X and y do not match!")
        self.is_trained = True

    def predict(self, F, datainfo, timeinfo):
        '''
        This function should provide predictions of labels on (test) data.
        Here we just return random values...
        Make sure that the predicted values are in the correct format for the scoring
        metric. For example, binary classification problems often expect predictions
        in the form of a discriminant value (if the area under the ROC curve is the
        metric) rather than predictions of the class labels themselves.
        The predict function can eventually return probabilities or continuous values.
        '''
        # get the raw categorical multivalued variables in case you want to process them, in this baseline we simply ignore them
        MV = F['MV']
        CAT = F['CAT']

        overall_spenttime = time.time() - timeinfo[0]
        dataset_spenttime = time.time() - timeinfo[1]

        print("[***] Overall time spent %5.2f sec" % overall_spenttime)
        print("[***] Dataset time spent %5.2f sec" % dataset_spenttime)

        # only get numerical variables
        X = F['numerical']

        # get numerical variables, concatenate them with categorical variables
        # catnumeric_dataset=np.array(CAT)
        # X= np.concatenate((F['numerical'],catnumeric_dataset),axis=1).astype(np.float64).copy(order='C')

        # convert NaN to zeros
        X = data_converter.replace_missing(X)

        num_test_samples = X.shape[0]
        if X.ndim > 1: num_feat = X.shape[1]
        print(("PREDICT: dim(X)= [{:d}, {:d}]").format(num_test_samples,
                                                       num_feat))
        if (self.num_feat != num_feat):
            print(
                "ARRGH: number of features in X does not match training data!")
        print(("PREDICT: dim(y)= [{:d}, {:d}]").format(num_test_samples,
                                                       self.num_labels))
        y = self.clf.decision_function(X)
        y = np.transpose(y)
        return y

    def save(self, path="./"):
        pickle.dump(self, open(path + '_model.pickle', "w"))

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile) as f:
                self = pickle.load(f)
            print("Model reloaded from: " + modelfile)
        return self
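A hedged sketch of how this Model is typically driven across successive batches; the datainfo/timeinfo values, array shapes, and dict keys are assumptions, and the module's own imports (data_converter, random) are assumed to be in scope:

# Illustrative driver: each extra fit() call adds one warm-started tree.
import time
import numpy as np

datainfo = {'time_budget': 300, 'loaded_feat_types': [0, 10, 0, 0]}  # assumed layout
timeinfo = (time.time(), time.time())
model = Model(datainfo, timeinfo)

rng = np.random.RandomState(0)
for batch in range(3):  # successive data batches (train, test1, test2, ...)
    F = {'numerical': rng.normal(size=(1000, 10)), 'CAT': None, 'MV': None}
    y = rng.randint(0, 2, size=(1000, 1))
    model.fit(F, y, datainfo, timeinfo)
scores = model.predict(F, datainfo, timeinfo)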
def gbdt_plus_liner_classifier_grid_search(stack_setting_,
                                           upper_param_keys=None, upper_param_vals=None,
                                           lower_param_keys=None, lower_param_vals=None,
                                           num_proc=None):

    """
     upper model is GBDT or Random Forest
     lower model is Linear Classifier
    """
    if stack_setting_ is None:
        sys.stderr.write('You have no setting Json file\n')
        sys.exit()

    if num_proc is None:
        num_proc = 6


    # 1. upper model
    if upper_param_keys is None:
        upper_param_keys = ['model_type', 'n_estimators', 'loss', 'random_state', 'subsample', 'max_features', 'max_leaf_nodes', 'learning_rate', 'max_depth', 'min_samples_leaf']

    if upper_param_vals is None:
        upper_param_vals = [[GradientBoostingClassifier], [100], ['deviance'], [0], [0.1], [5], [20], [0.1], [2], [8]]


    # grid search for upper model : GBDT or Random Forest
    # ExperimentL1 is model-agnostic; the data is fixed
    exp = ExperimentL1(data_folder = stack_setting_['0-Level']['folder'],
                       train_fname = stack_setting_['0-Level']['train'], 
                       test_fname = stack_setting_['0-Level']['test'])

    # GridSearch holds a single model; the model is determined by its params
    #gs = GridSearch(SklearnModel, exp, upper_param_keys, upper_param_vals,
    #                cv_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['folder'],
    #                cv_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_out'], 
    #                cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['cv_pred_out'], 
    #                refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['upper']['cv']['refit_pred_out'])
    #upper_best_param, upper_best_score = gs.search_by_cv()


    model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_train_fname = os.path.join(Config.get_string('data.path'), 
                                     model_folder, 
                                     model_train_fname)
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    model_test_fname = os.path.join(Config.get_string('data.path'), 
                                    model_folder, 
                                    model_test_fname)
    upper_param_dict = dict(zip(upper_param_keys, upper_param_vals))
    if not os.path.isfile(model_train_fname) and \
            not os.path.isfile(model_test_fname):
        #upper_param_dict['model_type'] == [GradientBoostingClassifier]
        del upper_param_dict['model_type']        
        clf = GradientBoostingClassifier()
        clf_cv = GridSearchCV(clf, upper_param_dict, 
                              verbose = 10, 
                              scoring = "f1",#scoring = "precision" or "recall"
                              n_jobs = num_proc, cv = 5)
        
        X_train, y_train = exp.get_train_data()
        clf_cv.fit(X_train, y_train)
        upper_best_params = clf_cv.best_params_
        print(upper_best_params)
        del clf_cv
        clf.set_params(**upper_best_params)
        clf.fit(X_train, y_train)
        train_loss = clf.train_score_
        test_loss = np.empty(len(clf.estimators_))
        X_test, y_test = exp.get_test_data()
        for i, pred in enumerate(clf.staged_predict(X_test)):
            test_loss[i] = clf.loss_(y_test, pred)

        graph_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['folder']
        graph_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['graph']['name']
        graph_fname = os.path.join(Config.get_string('data.path'), 
                                   graph_folder, 
                                   graph_fname)
        gs = GridSpec(2,2)
        ax1 = plt.subplot(gs[0,1])
        ax2 = plt.subplot(gs[1,1])
        ax3 = plt.subplot(gs[:,0])

        ax1.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
        ax1.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
        ax1.set_xlabel('Number of weak learners (boosting iterations)')
        ax1.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax1.legend(loc="best")       

        # dump the transformed features
        clf = TreeTransform(GradientBoostingClassifier(),
                            best_params_ = upper_best_params)
        if type(X_train) == pd.core.frame.DataFrame:
            clf.fit(X_train.as_matrix().astype(np.float32), y_train)
        elif type(X_train) == np.ndarray:
            clf.fit(X_train.astype(np.float32), y_train)

        # train result
        train_loss = clf.estimator_.train_score_
        test_loss = np.zeros((len(clf.estimator_.train_score_),), dtype=np.float32)

        if type(X_train) == pd.core.frame.DataFrame:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.as_matrix().astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        elif type(X_train) == np.ndarray:
            for iter, y_pred in enumerate(clf.estimator_.staged_decision_function(X_test.astype(np.float32))):
                test_loss[iter] = clf.estimator_.loss_(y_test, y_pred)
        ax2.plot(train_loss, label="train_loss")
        ax2.plot(test_loss, label="test_loss")
        ax2.set_xlabel('Boosting Iterations')
        ax2.set_ylabel('%s Loss' % (upper_best_params.get('loss','RMSE')))
        ax2.legend(loc="best")

        # tree ensembles: keep only features whose importance is above the
        # given quantile of the non-zero importance scores
        score_threshold=0.8
        index2feature = dict(zip(np.arange(len(X_train.columns.values)), X_train.columns.values))
        feature_importances_index = [str(j) for j in clf.estimator_.feature_importances_.argsort()[::-1]]
        feature_importances_score = [clf.estimator_.feature_importances_[int(j)] for j in feature_importances_index]
        fis = pd.DataFrame(
            {'name':[index2feature.get(int(key),'Null') for key in feature_importances_index],
             'score':feature_importances_score}
            )
        score_threshold = fis['score'][fis['score'] > 0.0].quantile(score_threshold)
        # where_str = 'score > %f & score > %f' % (score_threshold, 0.0)
        where_str = 'score >= %f' % (score_threshold)
        fis = fis.query(where_str)
        sns.barplot(x = 'score', y = 'name',
                    data = fis,
                    ax=ax3,
                    color="blue")
        ax3.set_xlabel("Feature importance", fontsize=10)
        plt.tight_layout()
        plt.savefig(graph_fname)
        plt.close()

        #print clf.toarray().shape
        # >(26049, 100)
        #input_features = 26049, weak_learners = 100
        #print len(one_hot.toarray()[:,0]), one_hot.toarray()[:,0]
        #print len(one_hot.toarray()[0,:]), one_hot.toarray()[0,:]

        ## feature transformation : get test data from train trees
        #print transformated_train_features.shape, X_train.shape
        #print transformated_test_features.shape, X_test.shape

        transformated_train_features = clf.one_hot_encoding
        if type(X_test) == pd.core.frame.DataFrame:
            transformated_test_features = clf.transform(X_test.as_matrix().astype(np.float32), 
                                                        y_test)
        elif type(X_test) == np.ndarray:
            transformated_test_features = clf.transform(X_test, y_test)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        #model_train_fname = os.path.join(Config.get_string('data.path'), 
        #                                 model_folder, 
        #                                 model_train_fname)
        with gzip.open(model_train_fname, "wb") as gf:
            cPickle.dump([transformated_train_features, y_train], 
                         gf,
                         cPickle.HIGHEST_PROTOCOL)

        #model_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder']
        #model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        #model_test_fname = os.path.join(Config.get_string('data.path'), 
        #                                model_folder, 
        #                                model_test_fname)
        with gzip.open(model_test_fname, "wb") as gf:
            cPickle.dump([transformated_test_features, y_test],
                         gf,
                         cPickle.HIGHEST_PROTOCOL)


    """
    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    if lower_param_dict['model_type'] == [LogisticRegression]:

        # grid search for lower model : Linear Classifier
        # ExperimentL1_1 has model free. On the other hand, data is fix
        model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
        model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
        exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                             train_fname = model_train_fname, 
                             test_fname = model_test_fname)
        # GridSearch has a single model. model is dertermined by param
        gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                        cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                        cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], 
                        cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], 
                        refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
        lower_best_param, lower_best_score = gs.search_by_cv()
        print lower_best_param
    

        # get meta_feature
        exp.write2csv_meta_feature(
            model = LogisticRegression(),
            meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
            meta_train_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'],
            meta_test_fname = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'],
            meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
            best_param_ = lower_best_param
            )
    """

    # 2. lower model
    if lower_param_keys is None:
        lower_param_keys = ['model_type', 'n_neighbors', 'weights',
                            'algorithm', 'leaf_size', 'metric', 'p', 'n_jobs']

    if lower_param_vals is None:
        lower_param_vals = [[KNeighborsClassifier], [1, 2, 4, 8, 16, 24, 32, 64], ['uniform', 'distance'],
                            ['ball_tree'], [30], ['minkowski'], [2], [4]]

    lower_param_dict = dict(zip(lower_param_keys, lower_param_vals))
    clf_lower_model = None
    clf_lower_mname = None

    # grid search for lower model : Linear Classifier
    # ExperimentL1_1 is model-agnostic; the data is fixed
    if lower_param_dict['model_type'] == [LogisticRegression]:
        # Logistic Regression
        clf_lower_model = LogisticRegression()
        clf_lower_mname = 'LR'

    elif lower_param_dict['model_type'] == [SVM]:
        # SVM
        clf_lower_model = LinearSVC()
        clf_lower_mname = 'SVM'

    else:
        sys.stderr.write("You must specify a lower linear model\n")
        sys.exit()

    model_train_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['train']
    model_test_fname = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['test']
    exp = ExperimentL1_1(data_folder = stack_setting_['1-Level']['gbdt_linear']['upper']['gbdt']['folder'],
                         train_fname = model_train_fname, 
                         test_fname = model_test_fname)
    # GridSearch holds a single model; the model is determined by its params
    gs = GridSearch(SklearnModel, exp, lower_param_keys, lower_param_vals,
                    cv_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['folder'],
                    cv_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_out'], 
                    cv_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['cv_pred_out'], 
                    refit_pred_out = stack_setting_['1-Level']['gbdt_linear']['lower']['cv']['refit_pred_out'])
    lower_best_param, lower_best_score = gs.search_by_cv()
    print(lower_best_param)

    # get meta_feature
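    # the model name is spliced in before the extension, e.g. a hypothetical
    # "meta_train.csv" becomes "meta_train_LR.csv"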
    meta_train_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['train'].split(".")[-1]
        )
    meta_test_fname_ = "%s_%s.%s" % (
        ".".join(stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[:-1]),
        clf_lower_mname,
        stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['test'].split(".")[-1]
        )
    exp.write2csv_meta_feature(
        model = clf_lower_model,
        meta_folder = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['folder'],
        meta_train_fname = meta_train_fname_,
        meta_test_fname = meta_test_fname_,
        meta_header = stack_setting_['1-Level']['gbdt_linear']['lower']['meta_feature']['header'],
        best_param_ = lower_best_param
        )

    ## best parameters for GBDT and another sklearn classifier
    #return best_param, best_score
    return upper_best_params, lower_best_param
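
TreeTransform itself is not shown in this example. If the goal is the usual GBDT-to-linear-model feature hand-off, the same leaf-index one-hot features can be produced with plain scikit-learn via apply() and OneHotEncoder; a minimal sketch under that assumption (all names below are illustrative):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=300, random_state=0)
gbdt = GradientBoostingClassifier(n_estimators=20, random_state=0).fit(X, y)
# apply() returns the leaf index each sample falls into for every tree:
# shape (n_samples, n_estimators, 1) for binary classification
leaves = gbdt.apply(X).reshape(X.shape[0], -1)
one_hot = OneHotEncoder(handle_unknown='ignore').fit_transform(leaves)
print(one_hot.shape)  # sparse matrix: (n_samples, total number of leaves)
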
        # estimator : ensemble learner

        # cv : if train : get best parameter
        if phase == "train":
            clf = GradientBoostingClassifier()
            gscv = GridSearchCV(clf, parameters, 
                                verbose = 10, 
                                scoring = "f1",#scoring = "precision" or "recall"
                                n_jobs = n_jobs, cv = cv_k_fold)
            gscv.fit(X_train, y_train)
            self.best_params = gscv.best_params_
            
            clf.set_params(**gscv.best_params_)
            clf.fit(X_train, y_train)
            train_loss = clf.train_score_
            test_loss = np.empty(len(clf.estimators_))
            for i, pred in enumerate(clf.staged_predict(X_test)):
                test_loss[i] = clf.loss_(y_test, pred)
            plt.plot(np.arange(len(clf.estimators_)) + 1, test_loss, label='Test')
            plt.plot(np.arange(len(clf.estimators_)) + 1, train_loss, label='Train')
            plt.xlabel('Number of weak learners (boosting iterations)')
            plt.ylabel('Loss')
            plt.legend(loc="best")
            plt.savefig("loss_cv.png")
            plt.close()

        estimator.set_params(**gscv.best_params_)
        self.estimator = estimator
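
clf.loss_, used above, is an attribute of older scikit-learn releases. In recent versions the same per-iteration test curve can be computed directly; a sketch using log_loss over staged_predict_proba (variable names follow the fragment above):

import numpy as np
from sklearn.metrics import log_loss

def staged_test_loss(clf, X_test, y_test):
    # per-iteration test deviance for a fitted GradientBoostingClassifier
    losses = np.empty(len(clf.estimators_))
    for i, proba in enumerate(clf.staged_predict_proba(X_test)):
        losses[i] = log_loss(y_test, proba)
    return losses
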
Beispiel #35
0
def main(trainXFile='/home/kiran/kdd/trainXall.csv',
         trainYFile='/home/kiran/kdd/trainY.csv',
         validXFile='/home/kiran/kdd/validXall.csv',
         validYFile='/home/kiran/kdd/validY.csv',
         testXFile='/home/kiran/kdd/testXall.csv',
         n_estimators=150,
         n_estimators_step=150,
         learning_rate=0.005,
         max_features=30,
         max_depth=11,
         verbose=0,
         dump_file='/home/kiran/kdd/pymodels/gbm_all_0.005_30_11.pkl',
         outputFile='prediction.txt',
         max_trees=300,
         random_state=11):
    actual = np.loadtxt(validYFile, delimiter=",")
    trainY = np.loadtxt(trainYFile, delimiter=",")
    train = pd.read_csv(trainXFile)
    valid = pd.read_csv(validXFile)
    #trainY = pd.read_csv (trainYFile)
    #trainY = trainY.ix [:,'x']
    #validY = pd.read_csv (validYFile)
    #actual = validY.ix [:,'x']
    #validWeights = pd.read_csv (validFileWeights)
    #validWeights = validWeights.ix [:,'x']
    #actual = actual.get_values ()

    gbm = GradientBoostingClassifier(n_estimators=n_estimators,
                                     learning_rate=learning_rate,
                                     max_features=max_features,
                                     max_depth=max_depth,
                                     random_state=random_state,
                                     verbose=verbose)
    gbm.fit(train, trainY)
    prediction_valid = gbm.predict_proba(valid)[:, 1]

    #gbm = joblib.load ( '/home/kiran/kdd/pymodels/gbmmore.pkl')
    #myAUC = kdd_metrics (actual, prediction_valid, validWeights)
    myAUC = metrics.roc_auc_score(actual, prediction_valid)
    bestAUC = 0
    bestPrediction = prediction_valid  # fallback in case the loop exits early

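    # Grow the ensemble in fixed-size steps: with warm_start=True, each fit()
    # keeps the trees already built and only adds n_estimators_step new ones,
    # stopping once validation AUC plateaus or max_trees is reached.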
    while myAUC >= bestAUC:
        n_estimators = n_estimators + n_estimators_step
        gbm.set_params(n_estimators=n_estimators, warm_start=True)
        gbm.fit(train, trainY)
        prediction_valid = gbm.predict_proba(valid)[:, 1]
        myAUC = metrics.roc_auc_score(actual, prediction_valid)
        print "bestAUC: %f myAUC: %f" % (bestAUC, myAUC)
        improvement = myAUC - bestAUC
        if improvement < 0.0000000001:
            break
        if n_estimators > max_trees:
            break
        bestAUC = myAUC
        bestPrediction = prediction_valid
        joblib.dump(gbm, dump_file)
        print "bestAUC: %f improvement: %f" % (bestAUC, improvement)

    myAUC = metrics.roc_auc_score(actual, bestPrediction)
    print("AUC: %f improvement: %f" % (myAUC, improvement))
    test = pd.read_csv(testXFile)
    prediction_test1 = gbm.predict_proba(test)[:, 1]
    prediction_total = np.concatenate((bestPrediction, prediction_test1),
                                      axis=0)
    np.savetxt(outputFile, prediction_total, delimiter=",")

gb = GradientBoostingClassifier(subsample=0.8,
                                random_state=42
                                )

# #### 1. Tune learning_rate and n_estimators

parameter_grid = {
                    'learning_rate': np.arange(0.001, 0.003, 0.0005),
                    'n_estimators': np.arange(1000, 3000, 500)
                    }
grid_gradient = GridSearchCV(gb, parameter_grid, cv=cv_splitter, n_jobs=-1)
grid_gradient.fit(X_1, y)

gradient_best_param = grid_gradient.best_params_
gradient_best_param
# best parameter values to be used in the stack model

# update gb with the optimal parameters
gb.set_params(**gradient_best_param)

# #### 2. Tune max_depth and min_samples_split

parameter_grid = {
                    'max_depth': np.arange(1, 5),
                    'min_samples_split': np.arange(2, 6, 1)
                    }
grid_gradient = GridSearchCV(gb, parameter_grid, cv=cv_splitter, n_jobs=-1)
grid_gradient.fit(X_1, y)

grid_gradient.best_params_

gradient_best_param.update(grid_gradient.best_params_)
gradient_best_param
# update best parameter values to be used in the stack model
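
The two manual stages above can also be rolled into a single loop; a sketch that assumes the cv_splitter, X_1 and y objects defined earlier in this example:

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

stages = [  # same grids as the manual steps above
    {'learning_rate': np.arange(0.001, 0.003, 0.0005),
     'n_estimators': np.arange(1000, 3000, 500)},
    {'max_depth': np.arange(1, 5),
     'min_samples_split': np.arange(2, 6)},
]
gb = GradientBoostingClassifier(subsample=0.8, random_state=42)
gradient_best_param = {}
for grid in stages:
    search = GridSearchCV(gb, grid, cv=cv_splitter, n_jobs=-1).fit(X_1, y)
    gradient_best_param.update(search.best_params_)
    gb.set_params(**gradient_best_param)  # freeze the winners of each stage
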
Beispiel #37
0
              arrowprops=dict(arrowstyle="<->"))
ax.text(810, 0.25, 'train-test gap')



# regularization
def fmt_params(params):
    return ", ".join("{0}={1}".format(key, val) for key, val in params.items())

fig = plt.figure(figsize=(8, 5))
ax = plt.gca()
for params, (test_color, train_color) in [({}, ('#d7191c', '#2c7bb6')),
                                      ({'min_samples_leaf': 3},
                                       ('#fdae61', '#abd9e9'))]:
    est = GradientBoostingRegressor(n_estimators=n_estimators, max_depth=1, learning_rate=1.0)
    est.set_params(**params)
    est.fit(X_train, y_train)

    test_dev, ax = deviance_plot(est, X_test, y_test, ax=ax, label=fmt_params(params),
                             train_color=train_color, test_color=test_color)

ax.annotate('Higher bias', xy=(900, est.train_score_[899]), xycoords='data',
        xytext=(600, 0.3), textcoords='data',
        arrowprops=dict(arrowstyle="->", connectionstyle="arc"),
        )
ax.annotate('Lower variance', xy=(900, test_dev[899]), xycoords='data',
        xytext=(600, 0.4), textcoords='data',
        arrowprops=dict(arrowstyle="->", connectionstyle="arc"),
        )
plt.legend(loc='upper right')
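
deviance_plot is assumed by this fragment but not defined in it; below is a minimal sketch of one plausible implementation, computing the test curve from staged_predict (the helper in the original tutorial may differ):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

def deviance_plot(est, X_test, y_test, ax=None, label='',
                  train_color='#2c7bb6', test_color='#d7191c'):
    # train curve is recorded by the estimator; test curve is computed
    # from staged predictions, one point per boosting iteration
    n_estimators = len(est.estimators_)
    test_dev = np.empty(n_estimators)
    for i, pred in enumerate(est.staged_predict(X_test)):
        test_dev[i] = mean_squared_error(y_test, pred)  # squared-error deviance
    if ax is None:
        ax = plt.gca()
    x = np.arange(n_estimators) + 1
    ax.plot(x, test_dev, color=test_color, label='Test %s' % label)
    ax.plot(x, est.train_score_, color=train_color, label='Train %s' % label)
    ax.set_xlabel('n_estimators')
    ax.set_ylabel('Error')
    return test_dev, ax
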
Beispiel #38
0
train_data = data.loc[rindex,:]
test_data = data.drop(rindex)
train_label = train_data.target
train_data = train_data.drop(['target', 'id'], axis=1)
test_label = test_data.target
test_data = test_data.drop(['target', 'id'], axis=1)
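# logloss is assumed to be defined elsewhere in this example; sklearn's
# log_loss is a drop-in replacement for the multi-class case:
from sklearn.metrics import log_loss as logloss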

gb_otto = GradientBoostingClassifier(n_estimators = 100, verbose = 1, warm_start = 1)
gb_otto.fit(train_data, train_label)
test_prob = gb_otto.predict_proba(test_data)
train_prob = gb_otto.predict_proba(train_data)
print('The logloss score of test data:', logloss(test_label, test_prob))
print('The logloss score of train data:', logloss(train_label, train_prob))

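# warm_start=1 above makes each set_params/fit below add trees to the
# existing ensemble instead of refitting from scratch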
gb_otto.set_params(n_estimators = 150)
gb_otto.fit(train_data, train_label)
test_prob = gb_otto.predict_proba(test_data)
train_prob = gb_otto.predict_proba(train_data)
print('The logloss score of test data:', logloss(test_label, test_prob))
print('The logloss score of train data:', logloss(train_label, train_prob))

gb_otto.set_params(n_estimators = 200)
gb_otto.fit(train_data, train_label)
test_prob = gb_otto.predict_proba(test_data)
train_prob = gb_otto.predict_proba(train_data)
print('The logloss score of test data:', logloss(test_label, test_prob))
print('The logloss score of train data:', logloss(train_label, train_prob))


gb_otto.set_params(n_estimators = 250)
train_coup3_y = train_df_coup3_y.values
test_coup3_X = test_df_coup3.values
std= StandardScaler()
train_coup3_X= std.fit_transform(train_coup3_X)
test_coup3_X= std.transform(test_coup3_X)  # reuse training statistics; do not refit on test data
X_train14,X_test14,y_train14,y_test14 = train_test_split(train_coup3_X,train_coup3_y,test_size = 0.20)
gbc14 = GradientBoostingClassifier(n_estimators=3000)
param_grid14 = {'max_depth': [3,4,6],#tree depths
              'min_samples_leaf': [5,9,12], #no. of samples to be at leaf nodes
              'learning_rate': [0.1,0.01,0.05,0.001]## Shrinkage
              #'max_features': [1.0, 0.3] #no.of features before finding best split node #stochastic gradient 
              }
#loss
gs_cv14 = GridSearchCV(gbc14, param_grid14, cv=3,scoring='accuracy',n_jobs=-1).fit(X_train14, y_train14)
print('Best hyperparameters: %r' % gs_cv14.best_params_)
gbc14.set_params(**gs_cv14.best_params_)
gbc14.fit(X_train14,y_train14)

# Models Ensembling

def ensambling(X,Y):

    
    # The DEV SET will be used for all training and validation purposes
    # The TEST SET will never be used for training, it is the unseen set.
    dev_cutoff = int(len(Y) * 4 / 5)  # integer cutoff; plain / yields a float in Python 3
    X_dev = X[:dev_cutoff]
    Y_dev = Y[:dev_cutoff]
    X_test = X[dev_cutoff:]
    Y_test = Y[dev_cutoff:]
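    # NOTE: this is a positional 80/20 split, so X and Y are assumed to be
    # pre-shuffled; otherwise the held-out set is not representative.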