Example #1
def find_best_xgb_estimator(X, y, cv, param_comb):
    # Random search over specified parameter values for XGBoost.
    # Exhaustive search takes many more cycles w/o much benefit.
    # Returns optimized XGBoost estimator.
    # Ref: https://www.kaggle.com/tilii7/hyperparameter-grid-search-with-xgboost
    print('\n Finding best XGBoost estimator...')
    param_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
    # `xgb` is assumed to be `from xgboost import XGBClassifier as xgb`.
    init_est = xgb(learning_rate=0.02, n_estimators=600, objective='multi:softprob',
        verbosity=1, n_jobs=1)
    random_search = RandomizedSearchCV(estimator=init_est, param_distributions=param_grid,
        n_iter=param_comb, n_jobs=4, cv=cv,  # `iid` was removed in scikit-learn 0.24
        verbose=1, random_state=RANDOM_SEED)
    random_search.fit(X, y)
    #print('\n All results:')
    #print(random_search.cv_results_)
    print('\n Best estimator:')
    print(random_search.best_estimator_)
    print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' %
        (cv, param_comb))  # assumes `cv` is an integer fold count, as passed above
    print(random_search.best_score_ * 2 - 1)  # normalized Gini = 2 * AUC - 1
    print('\n Best hyperparameters:')
    print(random_search.best_params_)
    return random_search.best_estimator_
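A minimal, hypothetical driver for the helper above; the alias xgb and the constant RANDOM_SEED are assumptions the snippet leaves implicit:

# Hypothetical usage of find_best_xgb_estimator on synthetic data.
from sklearn.datasets import make_classification
from sklearn.model_selection import RandomizedSearchCV  # needed by the helper
from xgboost import XGBClassifier as xgb

RANDOM_SEED = 42
X, y = make_classification(n_samples=300, n_classes=3, n_informative=6,
                           random_state=RANDOM_SEED)
best_est = find_best_xgb_estimator(X, y, cv=3, param_comb=5)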
Example #2
def xg_boost(f_train, l_train, f_test):
    from xgboost import XGBClassifier as xgb
    clf = xgb(n_estimators=100)
    clf.fit(f_train, l_train)
    pred = clf.predict_proba(f_test)
    #print(pred)
    return pred
Example #3
    def XGBoost(self, args):  ## Gradient Boosting

        logger.info("Running Gradient Boosting ... ")

        if args.predictor.lower() == 'classifier':
            from xgboost import XGBClassifier as xgb
        elif args.predictor.lower() == 'regressor':
            from xgboost import XGBRegressor as xgb

        from scipy.stats import logistic  # assumed import; used below to squash regressor scores

        xg_regression_model = xgb(objective='binary:logistic',
                                  n_estimators=20000,
                                  colsample_bytree=0.6,
                                  max_depth=6)

        ## Fit the regressor to the training set
        xg_regression_model.fit(self.X_train, self.y_train)

        ## Predict the labels
        self.y_pred = xg_regression_model.predict(self.X_data)
        if args.predictor.lower() == 'regressor':
            self.y_pred = logistic.cdf(self.y_pred)
        self.data['boosting_score'] = self.y_pred
        self.model = xg_regression_model

        return self
Example #4
def best_model(xt, xv, yt, yv):
	models = []

	name_dt = "DecisionTreeRegressor"
	model_dt = dtr(random_state=1) # decision tree
	model_dt.fit(xt, yt)
	models.append({'name': name_dt, 'model': model_dt, 'mae': get_mae(model_dt, xv, yv)})

	name_rf = "RandomForestRegressor"
	model_rf = rfr(random_state=1) # random forest
	model_rf.fit(xt, yt)
	models.append({'name': name_rf, 'model': model_rf, 'mae': get_mae(model_rf, xv, yv)})

	name_xgb = "XGBRegressor"
	model_xgb = xgb(random_state=1, n_estimators=10000, learning_rate=0.01) # xgboost
	model_xgb.fit(xt, yt, early_stopping_rounds=10, eval_set=[(xv, yv)], verbose=False)
	models.append({'name': name_xgb, 'model': model_xgb, 'mae': get_mae(model_xgb, xv, yv)})
	
	print("\n")
	for m in models:
		print("Model {} has MAE {}".format(m.get('name'), m.get('mae')))

	min_mae = min(i['mae'] for i in models)
	best_model = [m for m in models if m.get('mae') == min_mae]
	print("\nBest model pick: ", best_model[0].get('name'))
	print("\n")

	return best_model[0].get('model')
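The helper get_mae and the aliases dtr, rfr, xgb above are defined elsewhere in the source file; a plausible minimal stand-in (an assumption, not the author's code):

# Hypothetical support code for best_model; get_mae's signature is inferred from the calls above.
from sklearn.tree import DecisionTreeRegressor as dtr
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor as xgb

def get_mae(model, xv, yv):
    # Mean absolute error of an already-fitted model on the validation split.
    return mean_absolute_error(yv, model.predict(xv))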
Example #5
def xg_boost():
    global features_train, labels_train, features_test
    from xgboost import XGBClassifier as xgb
    clf = xgb()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    return pred
Example #6
    def trainXgboost(self, x_train, y_train, user_test_data):
        X_train, x_valid, y_train, y_valid = train_test_split(
            x_train, y_train, test_size=0.2, random_state=4242)

        modelXGB = xgb(max_depth=4, n_estimators=500, learning_rate=0.05)
        modelXGB.fit(X_train, y_train.values.ravel())

        # predictions_probaXGB = modelXGB.predict_proba(x_valid)
        # predictionsXGB = modelXGB.predict(x_valid)
        # predictions = [round(value) for value in predictionsXGB]
        #
        #
        # # predictionsXGB=modelXGB.predict(text)
        # log_loss_score_XGB = log_loss(y_valid, predictions_probaXGB)
        # acc_XGB = accuracy_score(y_valid, predictions)
        # f1_XGB = f1_score(y_valid, predictions)
        #
        # print("XGBoost Classifier ")
        # print('Log loss: %.5f' % log_loss_score_XGB)
        # print('Acc: %.5f' % (acc_XGB * 100.0))
        # print('F1: %.5f' % f1_XGB)

        predictions_test = modelXGB.predict(user_test_data)
        predictions_test_binary = [round(value) for value in predictions_test]

        print('predictions for test data:', predictions_test_binary)

        return predictions_test_binary
Example #7
def test_meta_classifier():
    print("Start step of classes prediction")
    df_meta = pd.read_csv('meta_added_class.csv')
    X = df_meta.to_numpy()[:, 2:-1]
    y = df_meta.to_numpy()[:, -1].astype(float)
    y_predicted = np.zeros(y.shape[0], dtype=float)
    for i in range(X.shape[0]):
        classifier = xgb()
        mask = np.ones(X.shape[0], dtype=bool)  # leave-one-out: hold out row i
        mask[i] = False

        X_all = X[mask, :]
        y_all = y[mask]
        classifier.fit(X_all, y_all)
        X_to_predict = np.zeros((1, X.shape[1]))
        X_to_predict[0] = X[i]  # data set in row i
        y_predicted[i] = classifier.predict(X_to_predict)
    np.savetxt('y_predicted.csv', y_predicted)

    ACC, TPR, FPR, PPV, AUC_roc, AUC_pr = common.test_measurements(y, y_predicted)
    precision, recall, _ = common.precision_recall_curve(y, y_predicted)
    AUC_pr = common.auc(recall, precision)
    meta_results = pd.DataFrame(columns=['ACC', 'TPR', 'FPR', 'PPV', 'AUC_roc', 'AUC_pr'])
    meta_results.loc[len(meta_results)] = [ACC, TPR, FPR, PPV, AUC_roc, AUC_pr]
    meta_results.to_csv('meta_results.csv')
    print("Finished step of classes prediction")
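The loop above is a hand-rolled leave-one-out prediction; scikit-learn can express the same thing directly (a sketch under the same X and y as loaded above, not the author's code):

# Equivalent leave-one-out predictions via scikit-learn.
from sklearn.model_selection import LeaveOneOut, cross_val_predict
from xgboost import XGBClassifier as xgb

y_predicted = cross_val_predict(xgb(), X, y, cv=LeaveOneOut())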
Example #8
def voting_pitchers(to_predict_pitchers, pitcher_predictions, x_pitchers,
                    xgb_pitchers_params, rforest_pitchers_params,
                    logreg_pitchers_params, svm_pitchers_params):
    # pitchers voting classifier using the optimal parameters

    accuracy_list_voting_pitchers = []
    accuracy_list_voting_pitchers_ERA = []
    accuracy_list_voting_pitchers_K = []
    accuracy_list_voting_pitchers_W = []
    accuracy_list_voting_pitchers_WHIP = []

    i = 0
    col_list = ['correct_ERA', 'correct_K', 'correct_W', 'correct_WHIP']
    for col in col_list:
        svm_pitchers_params[i][col]['probability'] = True
        i += 1

    for i in range(10):
        j = 0
        for col in to_predict_pitchers:
            y = pitcher_predictions[col].tolist()
            clf1 = xgb(**xgb_pitchers_params[j][col])
            clf2 = RandomForestClassifier(**rforest_pitchers_params[j][col])
            clf3 = linear_model.LogisticRegression(
                **logreg_pitchers_params[j][col])
            #clf5 = QuadraticDiscriminantAnalysis(**qda_pitchers_params[j][col])
            clf4 = svm.SVC(**svm_pitchers_params[j][col])
            eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3), ('svm', clf4)],
                                    voting='soft')
            #eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
            scores = cross_val_score(eclf,
                                     x_pitchers,
                                     y,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1)

            acc = scores.mean()
            accuracy_list_voting_pitchers.append(acc)

            if col == 'correct_ERA':
                accuracy_list_voting_pitchers_ERA.append(acc)
            elif col == 'correct_K':
                accuracy_list_voting_pitchers_K.append(acc)
            elif col == 'correct_W':
                accuracy_list_voting_pitchers_W.append(acc)
            elif col == 'correct_WHIP':
                accuracy_list_voting_pitchers_WHIP.append(acc)
            j += 1

    print("%-15s" % 'overall average', np.mean(accuracy_list_voting_pitchers))
    print("%-15s" % 'correct_ERA', np.mean(accuracy_list_voting_pitchers_ERA))
    print("%-15s" % 'correct_K', np.mean(accuracy_list_voting_pitchers_K))
    print("%-15s" % 'correct_W', np.mean(accuracy_list_voting_pitchers_W))
    print("%-15s" % 'correct_WHIP', np.mean(accuracy_list_voting_pitchers_WHIP))
Example #9
def tree_model(train_data, train_labels, test_data, test_labels):

    clf = xgb()
    clf.fit(train_data, train_labels)
    preds = clf.predict(test_data)

    print('XGB Accuracy {}'.format(
        (preds == test_labels).sum() / len(test_labels)))

    confusion(preds, test_labels)
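The confusion helper is not shown in the snippet; a plausible stand-in (an assumption) using scikit-learn:

# Hypothetical definition of the `confusion` helper used above.
from sklearn.metrics import confusion_matrix

def confusion(preds, labels):
    # Print the confusion matrix for the hard predictions.
    print(confusion_matrix(labels, preds))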
Example #10
def voting_hitters(to_predict_hitters, hitter_predictions, x_hitters,
                   xgb_hitters_params, rforest_hitters_params,
                   logreg_hitters_params, qda_hitters_params):
    accuracy_list_voting_hitters = []
    accuracy_list_voting_hitters_AVG = []
    accuracy_list_voting_hitters_HR = []
    accuracy_list_voting_hitters_R = []
    accuracy_list_voting_hitters_RBI = []
    accuracy_list_voting_hitters_SB = []

    for i in range(10):
        j = 0
        for col in to_predict_hitters:
            y = hitter_predictions[col].tolist()
            clf1 = xgb(**xgb_hitters_params[j][col])
            clf2 = RandomForestClassifier(**rforest_hitters_params[j][col])
            clf3 = linear_model.LogisticRegression(
                **logreg_hitters_params[j][col])
            clf4 = QuadraticDiscriminantAnalysis(**qda_hitters_params[j][col])
            #clf4 = svm.SVC(**svm_hitters_params[j][col])
            eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2),
                                                ('gnb', clf3), ('qda', clf4)],
                                    voting='soft')

            scores = cross_val_score(eclf,
                                     x_hitters,
                                     y,
                                     cv=5,
                                     scoring='accuracy',
                                     n_jobs=-1)

            acc = scores.mean()
            accuracy_list_voting_hitters.append(acc)

            if col == 'correct_AVG':
                accuracy_list_voting_hitters_AVG.append(acc)
            elif col == 'correct_HR':
                accuracy_list_voting_hitters_HR.append(acc)
            elif col == 'correct_R':
                accuracy_list_voting_hitters_R.append(acc)
            elif col == 'correct_RBI':
                accuracy_list_voting_hitters_RBI.append(acc)
            elif col == 'correct_SB':
                accuracy_list_voting_hitters_SB.append(acc)
            j += 1

    print("%-15s" % 'overall average', np.mean(accuracy_list_voting_hitters))
    print("%-15s" % 'correct_AVG', np.mean(accuracy_list_voting_hitters_AVG))
    print("%-15s" % 'correct_HR', np.mean(accuracy_list_voting_hitters_HR))
    print("%-15s" % 'correct_R', np.mean(accuracy_list_voting_hitters_R))
    print("%-15s" % 'correct_RBI', np.mean(accuracy_list_voting_hitters_RBI))
    print("%-15s" % 'correct_SB', np.mean(accuracy_list_voting_hitters_SB))
def train_model(train_x, train_y, model_type):
    model = None
    if model_type == 'XGB':
        model = xgb(max_depth=50,
                    n_estimators=80,
                    learning_rate=0.1,  # `eta` is an alias of learning_rate; set only one
                    colsample_bytree=.7,
                    gamma=0,
                    reg_alpha=4,
                    objective='binary:logistic',
                    subsample=0.8)
        model.fit(train_x, train_y)
    return model
def xg_hitters_params(to_predict_hitters, x_hitters, hitter_predictions):
    best_params = []

    for col in to_predict_hitters:

        y = hitter_predictions[col].tolist()
        x_train, x_test, y_train, y_test = train_test_split(x_hitters, y)

        xgb_classifier = xgb()
        parameters = {'max_depth': [3, 5, 9], 'learning_rate': [.1, .4],
                      'n_estimators': [250, 350], 'reg_lambda': [1, 4]}
        clf = GridSearchCV(xgb_classifier, parameters)
        clf.fit(x_train, y_train)
        best_params.append({col: clf.best_params_})

    return best_params
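For orientation, the voting ensembles above index this result as xgb_hitters_params[j][col], which matches the list of single-key dicts returned here (a usage sketch; the data arguments are the caller's):

# Hypothetical call wiring the grid-search output into voting_hitters.
xgb_hitters_params = xg_hitters_params(to_predict_hitters, x_hitters, hitter_predictions)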
Example #14
    def trainforAllModel(self, x_train, y_train):
        X_train, X_test, y_train, y_test = train_test_split(x_train,
                                                            y_train,
                                                            test_size=0.4,
                                                            random_state=4242)

        model = RandomForestClassifier(50, n_jobs=8)
        model.fit(X_train, y_train.values.ravel())
        predictions_proba = model.predict_proba(X_test)
        predictions = model.predict(X_test)

        log_loss_score = log_loss(y_test, predictions_proba)
        acc = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions)

        print("RandomForest Classifier ")
        print('Log loss: %.5f' % log_loss_score)
        print('Acc: %.5f' % acc)
        print('F1: %.5f' % f1)

        modelXGB = xgb(n_estimators=500)
        modelXGB.fit(X_train, y_train.values.ravel())
        predictions_probaXGB = modelXGB.predict_proba(X_test)
        predictionsXGB = modelXGB.predict(X_test)

        log_loss_score_XGB = log_loss(y_test, predictions_probaXGB)
        acc_XGB = accuracy_score(y_test, predictionsXGB)
        f1_XGB = f1_score(y_test, predictionsXGB)

        print("XGBoost Classifier ")
        print('Log loss: %.5f' % log_loss_score_XGB)
        print('Acc: %.5f' % acc_XGB)
        print('F1: %.5f' % f1_XGB)

        clf = GaussianNB()
        clf.fit(X_train, y_train.values.ravel())
        predictions_probaNB = clf.predict_proba(X_test)
        predictionsNB = clf.predict(X_test)

        log_loss_score_NB = log_loss(y_test, predictions_probaNB)
        acc_NB = accuracy_score(y_test, predictionsNB)
        f1_NB = f1_score(y_test, predictionsNB)

        print("Naive Bayes Classifier ")
        print('Log loss: %.5f' % log_loss_score_NB)
        print('Acc: %.5f' % acc_NB)
        print('F1: %.5f' % f1_NB)
def xg_pitchers_params(to_predict_pitchers, x_pitchers, pitcher_predictions):
    ### running a grid search cross validation on XGBoost for pitchers to obtain the best parameters

    best_params = []

    for col in to_predict_pitchers:
        y = pitcher_predictions[col].tolist()
        x_train, x_test, y_train, y_test = train_test_split(x_pitchers, y)

        xgb_classifier = xgb()
        parameters = {'max_depth': [3, 5, 9], 'learning_rate': [.05, .1],
                      'n_estimators': [250, 350], 'reg_lambda': [1, 3, 6]}
        clf = GridSearchCV(xgb_classifier, parameters)
        clf.fit(x_train, y_train)
        best_params.append({col: clf.best_params_})

    return best_params
Example #18
def gradient_boosting(df):
    '''XGBoost model using a subset of features that have already been engineered to work;
     applies standard scaling, trains the model, serializes the model and scaler to disk with pickle and outputs metrics'''
    FILENAME = 'model'
    OUTFILE = open(FILENAME, 'wb')
    SCALE = 'scaler'
    SCALER = open(SCALE, 'wb')

    # Create df for model training
    y = df[['price']]
    x = df[['accommodates', 'bedrooms', 'bathrooms', 'cleaning_fee', 'distance', 'size']]

    # Typically at this stage we would conduct some form of feature exploration, selection
    # and feature engineering. For the sake of time during the recording we have already
    # performed minor feature analysis and selection. We have also already conducted
    # hyper-parameter tuning using grid search cross-validation and will be hard coding
    # those params for our xgboost model. We could extend this project and improve the
    # model's accuracy by performing further feature engineering using various NLP techniques
    # but will stick to using some basic int data types as features to predict the target
    # variable price. With our pre-defined feature set we jump into model training.

    # Create training/test set for training
    sc = StandardScaler()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    x_train = sc.fit_transform(x_train)
    pickle.dump(sc, SCALER)
    SCALER.close()
    x_test = sc.transform(x_test)

    # Xgboost parameters hard coded after grid-search cross validation
    booster = xgb(n_estimators=200, random_state=4, gamma=0.2, max_depth=6,
                  learning_rate=0.1, colsample_bytree=0.7)

    # Fit model make predictions on test set and output the metrics
    booster.fit(x_train, y_train)
    pickle.dump(booster, OUTFILE)
    OUTFILE.close()

    # Validate model is predicting
    y_preds = booster.predict(x_test)
    for i in y_preds:
        print("$", round(i, 2), "/ night")

    return y_preds
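Since the function pickles both the scaler and the booster, later inference needs to load and apply them in the same order (a sketch; the feature row is invented):

# Hypothetical inference using the artifacts written by gradient_boosting().
import pickle

with open('scaler', 'rb') as f:
    sc = pickle.load(f)
with open('model', 'rb') as f:
    booster = pickle.load(f)

row = [[2, 1.0, 1.0, 30.0, 5.2, 48.0]]  # accommodates..size, made-up values
print("$", round(float(booster.predict(sc.transform(row))[0]), 2), "/ night")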
Example #19
def xgb_cv(max_depth, gamma, colsample_bytree, data, targets):
    estimator = xgb(
        n_estimators=250,
        learning_rate=0.08,
        n_jobs=4,
        max_depth=int(max_depth),  # cast: optimizers typically propose floats
        gamma=gamma,
        colsample_bytree=colsample_bytree,
    )

    ##    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    #    scores=cross_val_score(classifier, X, Y, cv=cv)
    cval = cross_val_score(estimator,
                           data,
                           targets,
                           scoring='neg_log_loss',
                           cv=5)
    return cval.mean()
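The signature (hyperparameters as arguments, mean CV score as return) is the shape expected by Bayesian-optimization drivers; a hypothetical one using the bayes_opt package (the bounds are assumptions):

# Hypothetical driver for xgb_cv with bayes_opt (pip install bayesian-optimization).
from functools import partial
from bayes_opt import BayesianOptimization

def optimize_xgb(data, targets):
    func = partial(xgb_cv, data=data, targets=targets)
    optimizer = BayesianOptimization(
        f=func,
        pbounds={'max_depth': (3, 10), 'gamma': (0, 5), 'colsample_bytree': (0.3, 1.0)},
        random_state=1,
    )
    optimizer.maximize(init_points=3, n_iter=10)
    return optimizer.max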
Example #20
def meta_classifier():
    print("Start step of features importance")
    df_meta = pd.read_csv('meta_added_class.csv')
    feature_importance_measures = ['gain', 'weight', 'cover']
    importance_results = pd.DataFrame(columns=feature_importance_measures)
    for importance_key in feature_importance_measures:
        classifier = xgb(importance_type=importance_key)
        X = df_meta.to_numpy()[:, 2:-1] #X includes all features
        y = df_meta.to_numpy()[:, -1] #y includes class (=algorithm)
        classifier.fit(X, y)
        importance_results[importance_key] = classifier.feature_importances_

    dmat = DMatrix(X)
    shap_values = classifier.get_booster().predict(dmat, pred_contribs=True)
    np.savetxt('shap.csv', shap_values)

    importance_results.to_csv('importance results.csv')
    print("Finished step of features importance")
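The pred_contribs output has one column per feature plus a trailing bias column; a short sketch of collapsing it into a global ranking (assumes shap_values from the function above is in scope):

# Hypothetical aggregation of the per-row contributions saved above.
import numpy as np
mean_abs_shap = np.abs(shap_values[:, :-1]).mean(axis=0)  # drop the bias column
print(mean_abs_shap)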
Example #22
def Get_xgboostScore(X, y):
    N_ESTIMATORS = 300
    CV_FOLD = 5
    loss_list = []
    for i in range(CV_FOLD):
        test_idx_start = int(X.shape[0] / CV_FOLD) * i
        test_idx_end = int(X.shape[0] / CV_FOLD) * (i + 1)
        X_train = pd.concat([X[:test_idx_start], X[test_idx_end:]], axis=0)
        y_train = pd.concat([y[:test_idx_start], y[test_idx_end:]], axis=0)
        X_test = X[test_idx_start:test_idx_end]
        y_test = y[test_idx_start:test_idx_end]
        # print( X_train.shape, y_train.shape, X_test.shape, y_test.shape )
        model = xgb(max_depth=5, n_estimators=N_ESTIMATORS)
        model.fit(X_train, y_train)

        loss_list.append(get_scores(model, X_train, y_train, X_test, y_test))

    print('Average Error:{:.6f}'.format(np.mean(loss_list)))
    return
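The manual split above is a contiguous (unshuffled) 5-fold scheme; scikit-learn's KFold closely matches it, differing only in how remainder rows are distributed (a sketch; get_scores is the author's helper with its signature inferred from the call above):

# Equivalent contiguous k-fold evaluation via scikit-learn.
import numpy as np
from sklearn.model_selection import KFold
from xgboost import XGBRegressor as xgb

def get_xgboost_score_kfold(X, y, get_scores, n_estimators=300, folds=5):
    losses = []
    for train_idx, test_idx in KFold(n_splits=folds, shuffle=False).split(X):
        model = xgb(max_depth=5, n_estimators=n_estimators)
        model.fit(X.iloc[train_idx], y.iloc[train_idx])
        losses.append(get_scores(model, X.iloc[train_idx], y.iloc[train_idx],
                                 X.iloc[test_idx], y.iloc[test_idx]))
    print('Average Error:{:.6f}'.format(np.mean(losses)))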
Example #23
    def _feature_selection(self, X, y, Xv, yv):
        '''_FEATURE_SELECTION

            Apply XGBoost to do feature selection.

            Inputs:
            -------

            - X: numpy ndarray, features of training set.
            - y: numpy ndarray, labels of training set.
            - Xv: numpy ndarray, features of validation set.
            - yv: numpy ndarray, labels of validation set.

            Outputs:
            --------

            - clf: instance of XGBClassifier, trained model.
            - fs_idx: list, indices of selected features.
            - importance: list, importance of selected features.

        '''

        # Train XGBoost classifier
        clf = xgb(**self.xgb_paras)
        clf.fit(X,
                y,
                eval_set=[(X, y), (Xv, yv)],
                eval_metric="error",
                verbose=False)

        # Extract indices of important features
        importance = clf.feature_importances_
        fs_idx = np.where(importance > self.threshold)[0]
        importance = importance[fs_idx]
        print("Number of important features: ", len(fs_idx))

        return clf, fs_idx, importance
Example #24
    def stat_pct(stat):
        """
        Inputs:
            stat (str): The statistic of interest - AVG, HR, R, RBI, SB for hitters, ERA, K, W, WHIP for pitchers
        Returns:
            vals ((# of players, 2) ndarray): first column is the predicted value for the given
                                              statistic, second column is the probability that the prediction is correct
            unique_names (ndarray): the corresponding player names
        """
        # check that values given are valid
        if stat not in all_stats:
            print("Not an acceptable stat")
            return 'FAILED'

        if stat in hit_stats:
            # get a list of the names of all the hitters in the order that they appear in x_hitters2017
            name_list = hitter_predictions_2017['Name'].tolist()
            # run the model
            if models[stat] == 'XGBoost':
                # run XGBoost with best params for the stat
                xgbc = xgb(**best_params_all['correct_' + stat])
                model = xgbc.fit(x_hitters, y_vals[stat])
                preds = model.predict_proba(x_hitters2017)[:, 1]
            if models[stat] == 'Random Forest':
                # run Random Forest with best params for the stat
                rf = RandomForestClassifier(**best_params_all['correct_' +
                                                              stat])
                model = rf.fit(x_hitters, y_vals[stat])
                preds = model.predict_proba(x_hitters2017)[:, 1]
            # array to store stat predictions and pct probabilities
            vals = np.empty((len(np.unique(name_list)), 2))
            # get list of unique names
            unique_names = np.unique(name_list)
            # create lists to store percent probabilities and stat predictions
            pcts = np.zeros(len(unique_names))
            stats = np.zeros(len(unique_names))
            # loop through each player
            for j in range(len(unique_names)):
                # get indices of player
                idxs = []
                for x in range(len(name_list)):
                    if name_list[x] == unique_names[j]:
                        idxs.append(x)
                # find highest probability for given player, store the index and probability value
                vals_dict = dict((i, preds[i]) for i in idxs)
                b = collections.defaultdict(list)
                for key, value in vals_dict.items():
                    b[value].append(key)
                pcts[j] = max(b.items())[0]
                #find corresponding value of stat
                stats[j] = hitter_predictions_2017[stat][max(b.items())[1][0]]
            vals[:, 0] = stats
            vals[:, 1] = pcts
            return vals, unique_names

        else:
            # get a list of the names of all pitchers in the order that they appear in x_pitchers2017
            name_list = pitcher_predictions_2017['Name'].tolist()
            # run the model
            if models[stat] == 'XGBoost':
                # run XGBoost with best params for the stat
                xgbc = xgb(**best_params_all['correct_' + stat])
                model = xgbc.fit(x_pitchers, y_vals[stat])
                preds = model.predict_proba(x_pitchers2017)[:, 1]
            if models[stat] == 'Random Forest':
                # run Random Forest with best params for the stat
                rf = RandomForestClassifier(**best_params_all['correct_' +
                                                              stat])
                model = rf.fit(x_pitchers, y_vals[stat])
                preds = model.predict_proba(x_pitchers2017)[:, 1]

            # array to store stat predictions and pct probabilities
            vals = np.empty((len(np.unique(name_list)), 2))
            # get list of unique names
            unique_names = np.unique(name_list)
            # create lists to store percent probabilities and stat predictions
            pcts = np.zeros(len(unique_names))
            stats = np.zeros(len(unique_names))
            # loop through each player
            for j in range(len(unique_names)):
                # get indices of player
                idxs = []
                for x in range(len(name_list)):
                    if name_list[x] == unique_names[j]:
                        idxs.append(x)
                # find highest probability for given player, store the index and probability value
                vals_dict = dict((i, preds[i]) for i in idxs)
                b = collections.defaultdict(list)
                for key, value in vals_dict.items():
                    b[value].append(key)
                pcts[j] = max(b.items())[0]
                #find corresponding value of stat
                stats[j] = pitcher_predictions_2017[stat][max(b.items())[1][0]]
            vals[:, 0] = stats
            vals[:, 1] = pcts
            return vals, unique_names

Example #25
"""**************************************************************************************************"""
"""     4) XGBoost         """

    """ Bag of Words Features """
# Splitting my data into train and test
train_bow = bow[:31962,:]
test_bow = bow[31962:,:]

# Splitting my data into train and validation data
X_train, X_valid, y_train, y_valid = train_test_split(train_bow, train["label"], test_size=0.3, random_state=0)

""" Instantiating the XGBoost classifier """
from xgboost import XGBClassifier as xgb
classifier = xgb(n_estimators=2000, max_depth=6)
classifier.fit(X_train, y_train)
# Getting the F1 score
y_pred = classifier.predict(X_valid)

f1Score = f1_score(y_valid, y_pred)
print(f1Score*100)

    """ TFIDF """
# Splitting my data into train and test
train_idf = tfidf[:31962,:]
test_idf = tfidf[31962:,:]

# Splitting my data into train and validation data
X_train, X_valid, y_train, y_valid = train_test_split(train_idf, train["label"], test_size=0.3, random_state=0)
Example #26
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import train_test_split as tts
from sklearn.impute import SimpleImputer  # `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22

path_tr = 'C:/Users/satyam/Desktop/kaggle/House_Prices/train.csv'
train = pd.read_csv(path_tr)  #Data
path_te = 'C:/Users/satyam/Desktop/kaggle/House_Prices/test.csv'
test = pd.read_csv(path_te)

my_imputer = SimpleImputer()
target = train.SalePrice
data = pd.concat([train.drop(['SalePrice'], axis=1), test])
numeric = data.select_dtypes(exclude=['object'])

filled_data = my_imputer.fit_transform(
    numeric)  #Using Imputer to fill up missing values
train_f = filled_data[:1460]
test_f = filled_data[1460:]

model = xgb(n_estimators=1000, learning_rate=0.05)  # XGBoost with tuned estimator count and learning rate
model.fit(train_f,
          target,
          early_stopping_rounds=5,
          eval_set=[(train_f, target)],  # note: monitors training loss, not a held-out set
          verbose=False)

predictions = model.predict(test_f)  #Making predictions
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
my_submission.to_csv('XGBoost+Imputer+est+LR+ESR.csv', index=False)
Example #27
Sys_X_model, Sys_X_test, Sys_Y_model, Sys_Y_test = train_test_split(
    Sys_X, Sys_Y, test_size=size, random_state=seed)

S1_X_train, S1_X_valid, S1_Y_train, S1_Y_valid = train_test_split(
    S1_X_model, S1_Y_model, test_size=size, random_state=seed)
S2_X_train, S2_X_valid, S2_Y_train, S2_Y_valid = train_test_split(
    S2_X_model, S2_Y_model, test_size=size, random_state=seed)
S3_X_train, S3_X_valid, S3_Y_train, S3_Y_valid = train_test_split(
    S3_X_model, S3_Y_model, test_size=size, random_state=seed)
S4_X_train, S4_X_valid, S4_Y_train, S4_Y_valid = train_test_split(
    S4_X_model, S4_Y_model, test_size=size, random_state=seed)
Sys_X_train, Sys_X_valid, Sys_Y_train, Sys_Y_valid = train_test_split(
    Sys_X_model, Sys_Y_model, test_size=size, random_state=seed)

#Use XGBoost to show feature importance per station
model1 = xgb().fit(S1_X_train, S1_Y_train)
model2 = xgb().fit(S2_X_train, S2_Y_train)
model3 = xgb().fit(S3_X_train, S3_Y_train)
model4 = xgb().fit(S4_X_train, S4_Y_train)

#Shows the XGBoost-derived feature importances in graph form.
plot_importance(model1)
plot_importance(model2)
plot_importance(model3)
plot_importance(model4)
pyplot.show()

#use sort to find the thresholds for SelectFromModel
thresholdS1 = sort(model1.feature_importances_)
thresholdS2 = sort(model2.feature_importances_)
thresholdS3 = sort(model3.feature_importances_)
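The sorted importances are meant to feed SelectFromModel, per the comment above; a sketch of that next step for station 1 (an assumption based on the comment, not the author's code):

# Hypothetical continuation: prune station-1 features at each importance threshold.
from sklearn.feature_selection import SelectFromModel

for thresh in thresholdS1:
    selection = SelectFromModel(model1, threshold=thresh, prefit=True)
    select_X_train = selection.transform(S1_X_train)
    print("thresh=%.3f, n=%d" % (thresh, select_X_train.shape[1]))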
Example #28
File: test.py Project: Echo-uu/KDD
        list(data_train[col].astype(str).values) +
        list(data_test_a[col].astype(str).values))
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding complete')

features = [
    f for f in data_train.columns
    if f not in ['id', 'issueDate', 'isDefault'] and '_outliers' not in f
]
x_train = data_train[features]
x_valid = data_test_a[features]
y_train = data_train['isDefault']

trn_x, val_x, trn_y, val_y = train_test_split(x_train, y_train)

clf = xgb()
clf.fit(trn_x, trn_y)
pre = clf.predict(val_x)  # hard labels; predict_proba would give a better-ranked AUC
print(roc_auc_score(val_y, pre))

lgb = LGBMClassifier()
lgb.fit(trn_x, trn_y)
pre = lgb.predict(val_x)
print(roc_auc_score(val_y, pre))

cat = CatBoostRegressor()
cat.fit(trn_x, trn_y)
pre = cat.predict(val_x)
print(roc_auc_score(val_y, pre))
Example #29
}, {
    'C': [1, 10, 100, 1000],
    'kernel': ['rbf'],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}]
grid_search = GridSearchCV(estimator=classifier,
                           param_grid=parameters,
                           scoring='accuracy',
                           cv=10,
                           n_jobs=-1)
grid_search = grid_search.fit(iv_train, dv_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

# In[ ]:

#finally trying with advanced XGBOOST algorithm
# Fitting XGBoost to the Training set
from xgboost import XGBClassifier as xgb  # xgboost exposes XGBClassifier, not a top-level `xgb`
classifier = xgb()
classifier.fit(iv_train, dv_train)

# Predicting the Test set results
y_pred = classifier.predict(iv_test)

# In[ ]:

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(dv_test, y_pred)
Example #30
        "geoNetwork_country",
        "flight_day",
        "TRIPTYPEDESC",
        "SALESCHANNEL",
    ]
)
y = df["INS_FLAG"]

## train test split size, random seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=22
)

## Model 1 (baseline) - XGBoost Classifier

xgr = xgb(n_estimators=3000, max_depth=7)
# fit the model to train data set
xgr.fit(X_train, y_train, eval_metric="auc", verbose=200)
predictions = xgr.predict(X_test)
# print(predictions)

## overall model accuracy
from sklearn import metrics

predictions = xgr.predict(X_test)

print(
    "Accuracy:", metrics.accuracy_score(y_test, predictions)
)  # 83% accuracy on imbalanced dataset
print("Precision:", metrics.precision_score(y_test, predictions))
print("Recall:", metrics.recall_score(y_test, predictions))
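Given the imbalance noted above, a common follow-up is to weight the positive class in the baseline model (a sketch; assumes the binary INS_FLAG target from the split above):

# Hypothetical class-weighted variant of the baseline XGBoost classifier.
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
xgr_weighted = xgb(n_estimators=3000, max_depth=7, scale_pos_weight=neg / pos)
xgr_weighted.fit(X_train, y_train)
print("Recall:", metrics.recall_score(y_test, xgr_weighted.predict(X_test)))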
Example #31
def process(object):
    train_X, train_y, test_X, test_y = object['train_X'], object[
        'train_y'], object['test_X'], object['test_y']
    # init sample method
    # sample_methods = ['random', 'SMOTE', 'Sparse SMOTE', 'SMOTEBorderline-1', 'SMOTEBorderline-2',
    #                   'SVMSMOTE', 'ADASYN', 'No Sample']
    sample_methods = ['Sparse SMOTE']
    # sample_methods = ['SMOTE']
    # sample_methods = ['random', 'smote', 'adasyn', 'mwmote']
    metrics_dict = {}
    time_info = {}
    for sample_method in sample_methods:
        # before
        before_time = datetime.now()
        # over sample
        X_resampled, y_resampled = oversample(train_X,
                                              train_y,
                                              method=sample_method)
        statistics_sample_num(train_X, train_y, X_resampled, y_resampled,
                              sample_method)
        # after
        over_time = datetime.now()
        process_time = (over_time - before_time).total_seconds()  # includes whole seconds, not just the microsecond field
        # print(process_time)
        time_info[sample_method] = "%.3f" % process_time
        # create model
        gbm = xgb(max_depth=3, n_estimators=300, learning_rate=0.01)
        # gbm = xgb(max_depth=3, n_estimators=300, learning_rate=0.01, max_delta_step=0.1)
        # train model
        gbm.fit(X_resampled, y_resampled, eval_metric='auc')
        # evaluate on test set
        precision, recall, f1, gmean, auc_roc, auc_pr, fpr, tpr = evaluate(
            test_X, test_y, gbm)
        roc_auc = auc(fpr, tpr)
        if SHOW_AUC_ROC_PLOT:
            plt.plot(fpr,
                     tpr,
                     lw=1,
                     alpha=0.3,
                     label='%s (AUC = %0.2f)' % (sample_method, roc_auc))
        metrics_dict[sample_method] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "gmean": gmean,
            "auc_roc": auc_roc,
            "auc_pr": auc_pr
        }
    df = pd.DataFrame(metrics_dict)
    # df.set_index(['precision', 'recall', 'gmean', 'f1'], inplace=True)
    df = df.T
    # print(df)
    if SHOW_METRICS:
        for index, row in df.iterrows():
            print("&" + index + "&", end=" ")
            # output auc_roc, auc_pr, precision, recall, f1, gmean
            for metric in ["auc_roc", "auc_pr", "precision", "recall", "f1"]:
                if row[metric] >= df[metric].max():
                    print(r"\textbf{%.3f" % row[metric] + "}&", end=" ")
                else:
                    print("%.3f" % row[metric] + "&", end=" ")
            if row["gmean"] >= df["gmean"].max():
                print(r"\textbf{%.3f" % row["gmean"] + r"}\\")
            else:
                print("%.3f" % row["gmean"] + r"\\")
    # evaluate(X, y, "No Sample", gbm)
    return time_info
Example #32
                                  min_samples_leaf=2,
                                  n_estimators=100,
                                  subsample=0.8)
gbdt.fit(iris.data, iris.target)
gbdt.predict(iris.data)
gbdt.predict_proba(iris.data)


# ## xgboost  test
def scorebyself(self, X, y):
    from sklearn.metrics import roc_auc_score
    probas = self.predict_proba(X)[:, 1]  # positive-class column; binary AUC expects 1-D scores
    auc = roc_auc_score(y, probas)
    return auc


from xgboost import XGBClassifier as xgb

params = {
    'n_estimators': [1000, 500, 100],
    'subsample': [0.5, 0.8],
    'learning_rate': [0.01, 0.05]
}
gsmodel = xgb()
xgbmodel0 = GridSearchCV(gsmodel, params, cv=5, n_jobs=5)
xgbmodel0.fit(iris.data, iris.target)
xgbest = xgb(learning_rate=0.01, n_estimators=1000, subsample=0.5, max_depth=3)
xgbest.fit(iris.data, iris.target)
xgbest.predict(iris.data)
xgbest.predict_proba(iris.data)
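The hand-coded xgbest above mirrors the grid-search winner; the fitted search object exposes it directly (a usage note):

# Inspect the cross-validated winner from xgbmodel0.
print(xgbmodel0.best_params_, xgbmodel0.best_score_)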
Example #34
    def __init__(self, clfile='xgb_classifier_1.pickle', *args, **kwargs):
        """
		Initialize the classifier object with optimised parameters.

		Parameters:
			clfile (str): saved classifier file.
			n_estimators (int): number of boosted trees in the ensemble.
			max_depth (int): maximum depth of each tree in the ensemble.
			learning_rate: boosting learning rate.
			reg_alpha: L1 regularization on the features.
			objective: learning objective of the algorithm.
			booster: booster used in the tree.
			eval_metric: Evaluation metric.

		.. codeauthor:: Refilwe Kgoadi <*****@*****.**>
		"""

        # Initialize the parent class:
        super().__init__(*args, **kwargs)

        # Attributes of this classifier:
        self.classifier = None
        self.classifier_file = None
        self.featdir = None

        if clfile is not None:
            self.classifier_file = os.path.join(self.data_dir, clfile)

        if self.features_cache is not None:
            self.featdir = os.path.join(self.features_cache, 'xgb_features')
            os.makedirs(self.featdir, exist_ok=True)

        if self.classifier_file is not None and os.path.exists(
                self.classifier_file):
            # Load pre-trained classifier
            self.load(self.classifier_file)
            self.trained = True  # Assume any classifier loaded is already trained
        else:
            # Create new untrained classifier:
            self.classifier = xgb(
                booster='gbtree',
                colsample_bytree=0.7,
                eval_metric='mlogloss',
                gamma=7.5,
                learning_rate=0.1,
                max_depth=6,
                min_child_weight=1,
                n_estimators=500,
                objective='multi:softmax',
                random_state=self.random_seed,  # XGBoost uses misleading names
                reg_alpha=1e-5,
                subsample=0.8,
                use_label_encoder=False)
            self.trained = False

        # List of feature names used by the classifier:
        self.features_names = [
            'skewness', 'kurtosis', 'shapiro_wilk', 'eta', 'PeriodLS',
            'Freq_amp_0', 'Freq_ampratio_21', 'Freq_ampratio_31',
            'Freq_phasediff_21', 'Freq_phasediff_31', 'Rcs', 'psi_Rcs'
        ]
Example #35
    def XGBoost(self, args):  ## Gradient Boosting

        logger.info("Running Gradient Boosting ... ")
        from scipy.stats import logistic  # assumed import; used below to squash regressor scores

        if args.predictor.lower() == 'classifier':
            from xgboost import XGBClassifier as xgb

            if args.snps:
                penalty = (float(
                    len(self.y_data[self.y_data == 0]) /
                    len(self.y_data[self.y_data == 1])))  #np.sqrt
                xg_model = xgb(objective='binary:logistic',
                               max_depth=6,
                               colsample_bytree=0.6,
                               scale_pos_weight=penalty)
            elif args.indels:
                penalty = float(
                    len(self.y_data[self.y_data == 0]) /
                    len(self.y_data[self.y_data == 1]))  #np.sqr
                xg_model = xgb(objective='binary:logistic',
                               n_estimators=200,
                               eta=0.001,
                               n_jobs=-1)

        elif args.predictor.lower() == 'regressor':
            if args.snps:
                penalty = (float(
                    len(self.y_data[self.y_data == 0]) /
                    len(self.y_data[self.y_data == 1])))  #np.sqrt
                from xgboost import XGBRegressor as xgb
                xg_model = xgb(n_estimators=40000,
                               max_depth=6,
                               colsample_bytree=0.6,
                               scale_pos_weight=penalty,
                               reg_lambda=0.001)
            elif args.indels:
                penalty = float(
                    len(self.y_data[self.y_data == 0]) /
                    len(self.y_data[self.y_data == 1]))  #np.sqr
                from xgboost import XGBRegressor as xgb
                if penalty > 2:

                    ##stomach
                    xg_model = xgb(objective='binary:logistic',
                                   colsample_bytree=0.6,
                                   max_depth=8,
                                   min_child_weight=5,
                                   importance_type='gain',
                                   reg_lambda=10,
                                   subsample=0.05,
                                   min_split_loss=100)

                else:

                    ##Improved: Works on GIAB, Bone, Breast, K562
                    xg_model = xgb(objective='binary:logistic',
                                   colsample_bytree=0.6,
                                   min_child_weight=5,
                                   importance_type='gain',
                                   max_depth=8,
                                   reg_lambda=10)

        ## Fit the regressor to the training set
        xg_model.fit(self.X_train, self.y_train)

        ## Predict the labels
        self.y_pred = xg_model.predict(self.X_data)
        if args.predictor.lower() == 'regressor':
            self.y_pred = logistic.cdf(self.y_pred)
        self.data['boosting_score'] = self.y_pred
        self.model = xg_model

        return self
Example #36
#TODO
# try the golden attribute!

train.sort_values(by='Estimated_Insects_Count', inplace=True)
test.sort_values(by='Estimated_Insects_Count', inplace=True)

train.Number_Weeks_Used.fillna(method='ffill', inplace=True)
test.Number_Weeks_Used.fillna(method='ffill', inplace=True)

mms = MinMaxScaler()
X = mms.fit_transform(train.loc[:, train.columns != 'Crop_Damage'].values)  # `.ix` was removed from pandas
y = train.Crop_Damage.values

X_test = mms.transform(test.values)

X = X.astype('float32')
X_test = X_test.astype('float32')

train_x, test_x, train_y, test_y = train_test_split(X, y)

print('Training Classifiers')
clf1 = xgb(nthread=3, learning_rate=0.3, n_estimators=1000)
cccv1 = cccv(clf1, method='isotonic', cv=5)

cccv1.fit(train_x, train_y)
pred1 = cccv1.predict(test_x)
print(classification_report(test_y, pred1))

pred = cccv1.predict(X_test)
pd.DataFrame({'Crop_Damage': pred}, index=test.index).to_csv('final_sub.csv')