Example #1
    def runxgBoostClassifier(self, bDetailReport=False):
        print("m_X_train  size", len(self.m_X_train))

        boosters = ['gbtree', 'gblinear']

        for depth in range(3, 4):
            for rate in (range(2, 3, 1)):
                for estimator in (range(220, 240, 20)):
                    #                    for bster in boosters:
                    clf = XGBClassifier(max_depth=depth,
                                        learning_rate=(float(rate) / 10),
                                        n_estimators=estimator,
                                        silent=True,
                                        objective='binary:logistic',
                                        seed=400)
                    clf.fit(self.m_X_train, self.m_y_train)
                    y = clf.predict(self.m_X_test)
                    print(
                        "\nxgBoostClassifier depth={} rate={} estimator={}\n".
                        format(depth, (float(rate) / 10), estimator))
                    print(classification_report(self.m_y_test, y))
                    print(clf.feature_importances_)
                    # plot
                    pyplot.bar(range(len(clf.feature_importances_)),
                               clf.feature_importances_)
                    pyplot.show()

                    plot_importance(clf)
                    if (bDetailReport):
                        self.ClassifierDetailReport(self.m_y_test, y)
Example #2
 def eval_fn(params):
     model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate, seed=seed)
     score = 0
     n_estimators = 0
     for tr, va in skf:
         X_tr, y_tr = X_train[tr], y_train[tr]
         X_va, y_va = X_train[va], y_train[va]
         model.set_params(**params)
         model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                   early_stopping_rounds=50, verbose=False)
         score += model.best_score
         n_estimators += model.best_iteration
     score /= n_folds
     n_estimators /= n_folds
     n_estimators_lst.append(n_estimators)
     result_str = "train:%.4f ntree:%5d  " % (score, n_estimators)
     if X_valid is not None:
         model.n_estimators = n_estimators
         model.fit(X_train, y_train)
         pr = model.predict_proba(X_valid)[:,1]
         sc_valid = log_loss(y_valid, pr)
         score_valid.append(sc_valid)
         result_str += "valid:%.4f" % sc_valid
     if verbose:
         print(result_str)
     return score
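
Note: eval_fn above depends on names defined outside it (skf, n_folds, n_estimators_max, learning_rate, seed, X_train, y_train, X_valid, y_valid, n_estimators_lst, score_valid, verbose). Below is a minimal sketch of how such an objective is commonly wired up, here driven with hyperopt's fmin; the toy data, fold setup, and search space are illustrative assumptions, not the original author's values.

import numpy as np
from hyperopt import fmin, hp, tpe
from sklearn.datasets import make_classification
from sklearn.metrics import log_loss          # used inside eval_fn
from sklearn.model_selection import StratifiedKFold, train_test_split
from xgboost import XGBClassifier             # used inside eval_fn

# Toy data standing in for the (assumed) original train/validation split.
X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=0)

n_folds = 5
skf = list(StratifiedKFold(n_splits=n_folds, shuffle=True,
                           random_state=0).split(X_train, y_train))
n_estimators_max, learning_rate, seed = 500, 0.05, 0
n_estimators_lst, score_valid, verbose = [], [], True

# Illustrative search space; eval_fn receives one sampled dict per trial.
space = {
    'max_depth': hp.choice('max_depth', [3, 4, 5, 6]),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
}
best = fmin(fn=eval_fn, space=space, algo=tpe.suggest, max_evals=20)
print(best)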
Example #3
def leaveoneout(dataset, labels):
    '''Classifier: xgboost; cross-validation: leave-one-out'''
    leaveoo = LeaveOneOut()
    #    Y_true = []
    #    Y_pre  = []
    # xgboost parameters fall into three groups:
    '''1. General parameters
       2. Booster parameters: control the booster at each step
       3. Learning-task parameters: control what the training objective optimizes'''
    for train_index, test_index in leaveoo.split(dataset):
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train, y_test = [labels[i] for i in train_index
                           ], [labels[i] for i in test_index]
        estimator = XGBClassifier(
            silent=0,  # 1 suppresses run-time messages; 0 (print messages) is usually preferred
            min_child_weight=1,
            gamma=0,  # minimum loss reduction required to make a further partition on a leaf node; larger is more conservative, typically around 0.1-0.2
            max_delta_step=1,  # maximum delta step allowed for each tree's weight estimate
            colsample_bytree=0.8,  # column subsampling ratio when building each tree
            nthread=4,
            objective='binary:logistic',  # loss function to minimize; logistic regression for binary classification, returns probabilities (not classes)
            reg_lambda=1,  # L2 regularization term on weights; larger values make the model less prone to overfitting
            scale_pos_weight=1,
            n_estimators=200,  # number of trees
            seed=1000  # random seed
        )
        estimator.fit(x_train, y_train)
        print(estimator.get_params())
        y_true, y_pre = y_test, list(estimator.predict(x_test))
    print("Accuracy : %.6g" % metrics.accuracy_score(y_true, y_pre))
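
In the loop above, y_true and y_pre are overwritten on every leave-one-out fold, so the final accuracy is computed from the last single-sample fold only. A minimal sketch (illustrative, not the author's code) that instead accumulates every fold's prediction, as the commented-out Y_true / Y_pre lists suggest:

import numpy as np
from sklearn import metrics
from sklearn.model_selection import LeaveOneOut
from xgboost import XGBClassifier

def leaveoneout_accuracy(dataset, labels):
    '''Leave-one-out cross-validation with xgboost, accumulating every fold's prediction.'''
    dataset, labels = np.asarray(dataset), np.asarray(labels)
    y_true, y_pre = [], []
    for train_index, test_index in LeaveOneOut().split(dataset):
        estimator = XGBClassifier(n_estimators=200, seed=1000)
        estimator.fit(dataset[train_index], labels[train_index])
        y_true.extend(labels[test_index])
        y_pre.extend(estimator.predict(dataset[test_index]))
    acc = metrics.accuracy_score(y_true, y_pre)
    print("Accuracy : %.6g" % acc)
    return acc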
Example #4
def do_simple_xgboost_regression(x_train, y_train, x_test, y_test):
    xg_reg = XGBClassifier(silent=False,
                           scale_pos_weight=1,
                           learning_rate=0.01,
                           colsample_bytree=0.4,
                           subsample=0.8,
                           objective='binary:logistic',
                           n_estimators=1000,
                           reg_alpha=0.3,
                           max_depth=4,
                           gamma=10)

    eval_set = [(x_train, y_train), (x_test, y_test)]
    eval_metric = ["auc", "error"]
    xg_reg.fit(x_train,
               y_train,
               eval_metric=eval_metric,
               eval_set=eval_set,
               verbose=True)
    train_accuracy = compute_accuracy(xg_reg, x_train, y_train)
    test_accuracy = compute_accuracy(xg_reg, x_test, y_test)

    print('train set accuracy: {}'.format(train_accuracy))
    print('test set accuracy: {}'.format(test_accuracy))

    y_score = xg_reg.predict(x_test)

    score = metrics.roc_auc_score(y_test, y_score)
    print('score {}'.format(score))
Example #5
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target):
    param = {
        'n_estimators':10,
        'learning_rate': 0.01,
        }

    adj_params = {
        'n_estimators':[10],
        'learning_rate': [0.01],
    #    'n_estimators':[10,50,100,200,300,400,500,1000],
    #    'learning_rate': [0.01, 0.1, 1] 
    }

    xgbt = XGBClassifier(**param)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    cscv = GridSearchCV(xgbt, adj_params, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    cscv.fit(train_input, train_target)
    print("cv_results_:",cscv.cv_results_)
    print("best_params_: ",cscv.best_params_)
    xgbt= XGBClassifier(**cscv.best_params_)
    xgbt.fit(train_input,train_target.ravel())  
    predicted = xgbt.predict(test_input) 
    xgbt_base_rmse = np.sqrt(metrics.mean_squared_error(test_target, predicted))
    print("xgbt_base_rmse: ", xgbt_base_rmse)
    #print ("RMSE:", np.sqrt(metrics.mean_squared_error(test_target, predicted))) 
    return xgbt_base_rmse
Example #6
def get_ntree():
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        xgb_base = XGBClassifier(objective='binary:logistic',
                                 n_estimators=ntree,
                                 random_state=1234,
                                 silent=0,
                                 booster='gbtree',
                                 subsample=0.8,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 reg_lambda=0,
                                 learning_rate=0.1,
                                 max_depth=6)

        print('current ntree = %s' % ntree)
        xgb_base.fit(X_t, y_t)
        y_t_pre = xgb_base.predict(X_t)
        y_v_pre = xgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        myfile = open('D:\\workspace python\\contest\\accu_save\\' +
                      'xgbbase_810_1.txt',
                      'a',
                      encoding='utf-8')
        print(f1_t_each, ',', f1_v_each, file=myfile)
        myfile.close()
    return f1_t_total, f1_v_total
Example #7
def model_train(xtrain, ytrain):
    X_train, X_test, y_train, y_test = train_test_split(xtrain,
                                                        ytrain,
                                                        test_size=0.2,
                                                        random_state=0)
    cls = XGBClassifier()
    start_time = time.time()
    cls.fit(X_train, y_train)
    end_time = time.time()
    print('It took %d seconds to train the model!' % (end_time - start_time))
    print()
    y_pred = cls.predict(X_test)
    print("Model and model parameters:")
    print(str(cls))
    print("Model evaluation:")
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('F1 score:', f1_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('\n classification report:\n', classification_report(y_test, y_pred))
    print('\n confusion matrix:\n', confusion_matrix(y_test, y_pred))

    # save the model
    model_name = "./model/" + "xgb_model"
    joblib.dump(cls, model_name)
Example #8
 def fit_model(self, X_train, y_train, X_test, y_test):
     clf = XGBClassifier(learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=self.nthread,
                         scale_pos_weight=self.scale_pos_weight,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         seed=self.seed)
     clf.fit(X_train, y_train)
     y_pre = clf.predict(X_test)
     y_pro = clf.predict_proba(X_test)[:, 1]
     print("pred_leaf=T  AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
     print("pred_leaf=T  Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))
     new_feature = clf.apply(X_train)
     X_train_new = self.mergeToOne(X_train, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     print("Training set sample number remains the same")
     return X_train_new, y_train, X_test_new, y_test
Example #9
def train_xgb_classifier(dat, predictors, target_col, params):

    if params['train_frac'] < 1.0:
        if params['seed'] is not None:
            np.random.seed(params['seed'])
        else:
            np.random.seed(123)

        dat.sort_values(['GAME_DATE'], inplace=True)
        samp_size = int(params['train_frac'] * dat.shape[0])

        # sample_ind = np.random.choice(dat.shape[0], size=int(np.floor(params['train_frac']*dat.shape[0])),replace=False)
        sample_ind = list(range(samp_size))

        train_dat = dat.iloc[sample_ind, :].copy()
        calib_ind = list(set(range(dat.shape[0])) - set(sample_ind))
        calib_dat = dat.iloc[calib_ind, :].copy()
        calib_dat_x = calib_dat[predictors]
        calib_dat_x.columns = ['f' + str(i) for i in range(len(predictors))]
    else:
        train_dat = dat.copy()

    train_dat_x = train_dat[predictors]
    train_dat_x.columns = ['f' + str(i) for i in range(len(predictors))]

    mod = XGBClassifier(**params)
    mod.fit(train_dat_x, train_dat[target_col])

    if params['train_frac'] < 1.0:
        mod_final = CalibratedClassifierCV(mod, method='sigmoid', cv='prefit')
        mod_final.fit(calib_dat_x, calib_dat[target_col])
    else:
        mod_final = mod

    return (mod_final)
Example #10
def Create_Model(X_train, X_test, y_train, y_test, learning_rate, n_estimators,
                 max_depth, min_child_weight, gamma, subsample,
                 colsample_bytree, reg_alpha, eval_metric):

    ROCforest = XGBClassifier(learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              max_depth=max_depth,
                              min_child_weight=min_child_weight,
                              gamma=gamma,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              reg_alpha=reg_alpha,
                              objective='binary:logistic',
                              nthread=4,
                              seed=12)

    cv_folds = 5

    eval_metric = eval_metric

    xgb_param = ROCforest.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=ROCforest.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics=eval_metric)

    ROCforest.set_params(n_estimators=cvresult.shape[0])

    ROCforest.fit(X_train, y_train)

    return ROCforest
Example #11
class XGBoosting:
    def __init__(self, x_train, y_train, problemtype='regression', cv=5):
        self.x_train = x_train
        self.y_train = y_train
        self.cv = cv

        if problemtype == 'regression':
            self.clf = XGBRegressor()
        elif problemtype == 'classification':
            self.clf = XGBClassifier()

    def classify(self):
        self.clf.fit(self.x_train, self.y_train)

    def regress(self):
        self.clf.fit(self.x_train, self.y_train)

    def show_cross_val_score(self):
        cv_score = cross_val_score(estimator=self.clf,
                                   X=self.x_train,
                                   y=self.y_train,
                                   cv=self.cv,
                                   n_jobs=-1)
        print('XGB Cross Validated Score...')
        print(np.mean(cv_score))
        print('\n')

    def optimise(self):
        pass
Example #12
def runXGBoost(x_train, y_train, x_test, y_test, p):

    # Here we instantiate the extreme gradient boosting (XGBoost) classifier
    clf = XGBClassifier()
    clf.set_params(**p)

    clf.fit(x_train, y_train)

    # now, make the predictions using our classifier
    xgb_predictions = clf.predict(x_test)

    # now we have to compute the classification accuracy
    # think about what two variables we have to compare
    xgb_score = accuracy_score(y_test, xgb_predictions)
    print("XGB classification accuracy on test data is " + str(xgb_score),
          file=sys.stderr)

    etc_predictions = clf.predict(x_test)
    dt_score = accuracy_score(y_test, etc_predictions)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)
    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)

    return (train_score, dt_score)
Example #13
    def fit_model(self, X_train, y_train):

        print('Model Fitting started: ', datetime.now())

        start_train_date = pd.Timestamp(
            year=X_train['Year'].iloc[0],
            month=X_train['Month'].iloc[0],
            day=X_train['DayofMonth'].iloc[0]).date()

        end_train_date = pd.Timestamp(
            year=X_train['Year'].iloc[-1],
            month=X_train['Month'].iloc[-1],
            day=X_train['DayofMonth'].iloc[-1]).date()

        print('Fit model with data from {} to {}'.format(
            start_train_date, end_train_date))

        model_name = '{}_{}_{}_{}_{}'.format(self.strategy_name,
                                             start_train_date.year,
                                             start_train_date.month,
                                             end_train_date.year,
                                             end_train_date.month)

        start_time = time.time()

        classifier = XGBClassifier(n_jobs=8, n_estimators=1000, verbosity=1)
        classifier.fit(X_train, y_train)
        pickle.dump(classifier,
                    open("models/{}.pickle.dat".format(model_name), 'wb'))

        print('Duration Fitting: ', (time.time() - start_time))

        self.prediction_model = classifier
Example #14
def job_function(params):
	learning_rate = params[0]
	max_depth = params[1]
	ss_cs = params[2]
	gamma = params[3]
	min_child_weight = params[4]
	reg_lambda = params[5]
	reg_alpha = params[6]

	early_stopping_rounds = 25
	if learning_rate >= 0.3:
		early_stopping_rounds = 5
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	scores = []
	for i in range(iterations_per_job):
		X_train = Xy[i][0]
		X_test = Xy[i][1]
		y_train = Xy[i][2]
		y_test = Xy[i][3]
		
		y_train2 = le.transform(y_train)   
		y_test2 = le.transform(y_test)   

		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
		y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
		score = calculate_score(y_predicted, y_test2)
		scores.append(score)

	avg_score = np.array(scores).mean()
	print(avg_score, params)
	return avg_score
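
job_function above also leans on outer names: Xy (one (X_train, X_test, y_train, y_test) tuple per iteration), le (a fitted LabelEncoder), iterations_per_job, and the custom metrics calculate_score / calculate_score_2. A minimal sketch of those pieces under stated assumptions; in particular, calculate_score_2 is written here in xgboost's feval form, (predictions, DMatrix) -> (name, value), and the multi-class log loss used in both helpers is only a stand-in for whatever score the original used.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

iterations_per_job = 3
X_all, y_all = make_classification(n_samples=3000, n_features=20, n_informative=10,
                                   n_classes=4, random_state=0)
le = LabelEncoder().fit(y_all)
# One (X_train, X_test, y_train, y_test) split per iteration, as job_function unpacks them.
Xy = [train_test_split(X_all, y_all, test_size=0.3, random_state=i)
      for i in range(iterations_per_job)]

def calculate_score(y_predicted, y_true):
    # Stand-in score: multi-class log loss, lower is better.
    labels = np.asarray(y_true).astype(int)
    probs = np.clip(np.asarray(y_predicted).reshape(len(labels), -1), 1e-15, 1 - 1e-15)
    return float(-np.mean(np.log(probs[np.arange(len(labels)), labels])))

def calculate_score_2(y_predicted, dtrain):
    # feval-style wrapper: xgboost passes (predictions, DMatrix) and expects (name, value).
    labels = dtrain.get_label().astype(int)
    return 'mlogloss', calculate_score(y_predicted, labels)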
Example #15
    def _distributor(self, label, cv, param, eval_metric, early_stopping_rounds=50):
        start = time()

        if self.is_classifier:
            label = 'XGBClassifier'
            rs = XGBClassifier(**param)
        else:
            label = 'XGBRegressor'
            rs = XGBRegressor(**param)

        X_visible, X_blind, y_visible, y_blind = \
            train_test_split(
                self.X_train, self.y_train, random_state=1301, stratify=self.y_train, test_size=0.4)

        rs.fit(X_visible, y_visible, eval_metric=eval_metric, early_stopping_rounds=early_stopping_rounds,
               eval_set=[(X_visible, y_visible), (X_blind, y_blind)])

        self.result[label] = {}
        self.result[label]['clf'] = rs
        # self.result[label]['score'] = rs.best_score_
        self.result[label]['time'] = time() - start
        # self.result[label]['set'] = ('n_iter: %s cv: %s' % (n_iter, cv))

        pprint.pprint(self.result[label])
        # pprint.pprint(rs.grid_scores_)

        out_result = open(self.result_address, 'wb')
        pickle.dump(self.result, out_result)
        out_result.close()
Example #16
    def compute_cv_metric(split, cross_val_data, bayes_trials_results):

        #Create classifier for cross validation results
        clf = XGBClassifier(random_state=0,
                            n_jobs=-1,
                            **bayes_trials_results[0]['params'])

        train_x = cross_val_data[split][0]
        train_y = cross_val_data[split][1]
        test_x = cross_val_data[split][2]
        test_y = cross_val_data[split][3]

        clf.fit(train_x, train_y)

        y_pred_cv = clf.predict(test_x)
        y_pred_prob_cv = clf.predict_proba(test_x)

        tn = confusion_matrix(test_y, y_pred_cv)[0, 0]
        tp = confusion_matrix(test_y, y_pred_cv)[1, 1]
        fp = confusion_matrix(test_y, y_pred_cv)[0, 1]
        fn = confusion_matrix(test_y, y_pred_cv)[1, 0]

        npv = tn / (tn + fn)
        specificity = tn / (tn + fp)

        precision = tp / (tp + fp)
        recall = tp / (tp + fn)

        roc_auc_cv = roc_auc_score(test_y, y_pred_prob_cv[:, 1])

        f1_cv = 2 * (precision * recall) / (precision + recall)

        return npv, specificity, precision, recall, roc_auc_cv, f1_cv, y_pred_prob_cv
Example #17
def xxgboost(training, cv, testing):
    xgb = XGBClassifier(max_depth=6,
                        n_estimators=25,
                        objective='multi:softprob',
                        subsample=0.5,
                        colsample_bytree=0.5)

    xgb.fit(training, cv.ravel())
    XGBtrainscore = xgb.score(training, cv.ravel())  #Train Score

    kf = KFold(len(cv), n_folds=5)  # 5 folder cross validation
    scores = cross_val_score(xgb, training, cv.ravel(), cv=kf)
    XGBvalidation = abs(scores.mean())

    XGBy_pred = xgb.predict_proba(testing)

    le = LabelEncoder()
    y = le.fit_transform(labels)

    idlist = []  #id list
    listcty = []  #countries list

    for i in range(len(testid)):
        idi = testid[i]
        idlist += [idi] * 5
        listcty += le.inverse_transform(np.argsort(
            XGBy_pred[i])[::-1])[:5].tolist()

    XGBsub = pd.DataFrame(np.column_stack((idlist, listcty)),
                          columns=['id', 'country'])
    XGBsub.to_csv('XGsub_%s.csv' % csvname, index=False)
    print("XGBtrainscore", XGBtrainscore)
    print("XGBvalidation", XGBvalidation)
Example #18
def xgboost_classifier(train_x, train_y):
    from xgboost.sklearn import XGBClassifier
    # model = XGBClassifier()
    model = XGBClassifier(silent=1,
                          learning_rate=0.1,
                          n_estimators=60,
                          max_depth=6,
                          min_child_weight=0.4,
                          gamma=0.5,
                          subsample=0.4,
                          colsample_bytree=1,
                          objective='binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=1000)
    #max_depth=[2,3,4,5,6,7]
    #learning_rate = [0.01,0.05,0.1,0.2,0.4,0.8,1]
    #n_estimators = [30,60, 80, 100, 150, 200]
    #param_grid = dict()
    #kfold = StratifiedKFold(n_splits=10,shuffle=True,random_state=7)
    #grid_search = GridSearchCV(model,param_grid,scoring='neg_log_loss',n_jobs=-1,cv=kfold)
    #grid_result = grid_search.fit(np.array(train_x), np.array(train_y))
    #print grid_result.best_score_,'***********',grid_result.best_params_

    model.fit(train_x, train_y)
    #
    # from xgboost import plot_importance
    # from matplotlib import pyplot
    # plot_importance(model)
    # pyplot.show()

    return model
Example #19
def XGBoost(returns, factRet):
    """
    :param returns: asset returns
    :param factRet: factor returns
    """
    [timeN, factorN] = factRet.shape
    [timeN, assetN] = returns.shape
    f_bar = []
    for i in range(factorN):
        f_bar.append(np.prod(factRet.iloc[:, i] + 1)**(1 / timeN) - 1)

    colName = list(factRet.columns)
    f_bar = pd.DataFrame(f_bar).T
    f_bar.columns = colName
    xgb = XGBClassifier(learning_rate=0.1,
                        n_estimators=10,
                        max_depth=7,
                        min_child_weight=2,
                        gamma=0.2,
                        subsample=0.8,
                        colsample_bytree=0.6,
                        objective='reg:linear',
                        scale_pos_weight=1,
                        seed=10)
    mu = []
    for i in range(assetN):
        xgb.fit(factRet, returns.iloc[:, i])
        mu.append(float(xgb.predict(f_bar)))
    mu = np.array(mu)
    Q = np.array(returns.cov())
    return mu, Q
Example #20
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(**params)
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']
    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=xgb_param['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print(cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
        params['n_estimators'] = cvresult.shape[0]
        print("best tree size is {}".format(cvresult.shape[0]))
    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    y_pred = alg.predict(pX)
    accuracy = metrics.accuracy_score(py, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_pred))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)
    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
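
modelfit() assumes several globals: data (a DataFrame with a 'LABEL' column), X and y, xgtrain, params, plus xgb and sklearn's metrics already imported. A minimal sketch of that setup follows; the toy data and parameter values are illustrative assumptions, not the original configuration.

import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn.datasets import make_classification
from xgboost import XGBClassifier

# Toy stand-in for the original dataset with a 'LABEL' target column.
features, target = make_classification(n_samples=5000, n_features=20, random_state=0)
data = pd.DataFrame(features, columns=['f%d' % i for i in range(features.shape[1])])
data['LABEL'] = target

X = data.drop('LABEL', axis=1)
y = data['LABEL']
xgtrain = xgb.DMatrix(X, label=y)

# Starting point; xgb.cv inside modelfit() then shrinks n_estimators to the best round.
params = {'learning_rate': 0.1, 'n_estimators': 500, 'max_depth': 5,
          'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8,
          'objective': 'binary:logistic', 'seed': 27}

alg = modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50)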
Example #21
def xgb_no_feature_select(train: pd.DataFrame,
                          test: pd.DataFrame,
                          y_train,
                          cv=False):
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'colsample_bytree': 0.9604,
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict
Example #22
 def fit_model_split(self, X_train, y_train, X_test, y_test, x_pre):
     # X_train_1 is used to build the model; X_train_2 is combined with the new features to form the new training set
     X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
         X_train, y_train, test_size=0.6, random_state=0)
     clf = XGBClassifier(learning_rate=self.learning_rate,
                         n_estimators=self.n_estimators,
                         max_depth=self.max_depth,
                         min_child_weight=self.min_child_weight,
                         gamma=self.gamma,
                         subsample=self.subsample,
                         colsample_bytree=self.colsample_bytree,
                         objective=self.objective,
                         nthread=self.nthread,
                         scale_pos_weight=self.scale_pos_weight,
                         reg_alpha=self.reg_alpha,
                         reg_lambda=self.reg_lambda,
                         seed=self.seed)
     clf.fit(X_train_1, y_train_1)
     y_pre = clf.predict(X_train_2)
     y_pro = clf.predict_proba(X_train_2)[:, 1]
     print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_train_2, y_pro))
     print("pred_leaf=T  Accuracy : %.4g" % metrics.accuracy_score(y_train_2, y_pre))
     new_feature = clf.apply(X_train_2)
     X_train_new2 = self.mergeToOne(X_train_2, new_feature)
     new_feature_test = clf.apply(X_test)
     X_test_new = self.mergeToOne(X_test, new_feature_test)
     new_feature_pre = clf.apply(x_pre)
     X_pre_new = self.mergeToOne(x_pre, new_feature_pre)
     print("Training set sample size is 0.4 smaller than before")
     return X_train_new2, y_train_2, X_test_new, y_test, X_pre_new
Example #23
class Classifier(object):
    def __init__(self, conf, task, train=None, test=None):
        self.conf = conf
        self.task = task
        self.train_ = train
        self.test_ = test
        self.features = [
            "hasWith", "hasIn", "simiBucket", "textPos", "hasOf", "hasAnd",
            "startEntity", "distance", "hasFrom", "endEntity", "similarity",
            "hasThan", "hasVerb"
        ]
        self.labels = ["relation"]
        self.num_round = 500
        self.eval_set = list()
        self.early_stopping_rounds = 20
        self.classifier = XGBClassifier(max_depth=4,
                                        learning_rate=0.1,
                                        n_estimators=1000,
                                        gamma=4,
                                        verbosity=1,
                                        objective='multi:softmax',
                                        num_class=6,
                                        booster='gbtree',
                                        n_jobs=4,
                                        seed=27)

    def train(self):
        train_X, test_X, train_y, test_y = train_test_split(
            self.train_[self.features],
            self.train_[self.labels],
            test_size=0.4,
            random_state=42)

        self.eval_set = [(train_X.values, train_y.values),
                         (test_X.values, test_y.values)]
        self.classifier.fit(train_X.values,
                            train_y.values,
                            eval_metric='merror',
                            eval_set=self.eval_set,
                            early_stopping_rounds=self.early_stopping_rounds,
                            verbose=True)

        self.classifier.save_model(self.conf.model_path.format(self.task))
        return 'Model has been saved!'

    def test(self):
        test_set = self.test_[self.features].values
        self.classifier.load_model(self.conf.model_path.format(self.task))
        self.classifier._le = LabelEncoder().fit([
            'USAGE', 'TOPIC', 'MODEL-FEATURE', 'PART_WHOLE', 'RESULT',
            'COMPARE'
        ])
        pred = self.classifier.predict(test_set)
        predictions = pd.concat([
            self.test_[self.features],
            pd.DataFrame(pred, columns=["relation"])
        ],
                                axis=1)

        return predictions
Example #24
def train():

    trainDf = pd.read_csv("data_train.csv")
    testDf = pd.read_csv("data_test.csv")
    goal = "interested"
    predictors = [
        "invited", "user_reco", "evt_p_reco", "evt_c_reco", "user_pop",
        "frnd_infl", "evt_pop"
    ]

    clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=5,
                        min_child_weight=1,
                        gamma=0,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

    X_train, X_test, y_train, y_test = train_test_split(trainDf[predictors],
                                                        trainDf[goal],
                                                        random_state=0)
    clf.fit(X_train,
            y_train,
            early_stopping_rounds=10,
            eval_metric="auc",
            eval_set=[(X_test, y_test)])
    return clf
Example #25
def XGB_class_evaluation(individual):
    N_SPLITS = N_splits
    kf = KFold(n_splits=N_SPLITS)
    fc = XGBClassifier(learning_rate=individual[0],
                       n_estimators=individual[5],
                       silent=True,
                       nthread=-1,
                       gamma=0,
                       min_child_weight=individual[1],
                       max_depth=individual[2],
                       subsample=individual[3],
                       colsample_bylevel=individual[4],
                       seed=0)
    M_pos = 0
    M_mid = 0
    M_neg = 0
    for train, test in kf.split(trainX):
        fc.fit(trainX[train, :], trainY[train])
        testY_pre = fc.predict(trainX[test, :])
        Ind_pos = (trainY[test] == 1)
        Ind_mid = (trainY[test] == 0)
        Ind_neg = (trainY[test] == -1)
        M_pos += len(np.where(np.array(testY_pre[Ind_pos]) == 1)[0]) / len(
            np.where(Ind_pos)[0])
        M_mid += len(np.where(np.array(testY_pre[Ind_mid]) == 0)[0]) / len(
            np.where(Ind_mid)[0])
        M_neg += len(np.where(np.array(testY_pre[Ind_neg]) == -1)[0]) / len(
            np.where(Ind_neg)[0])

    correct = map(lambda x: x / N_SPLITS, [M_pos, M_mid, M_neg])
    return (tuple(correct))
Example #26
def train_model(mall_id):
    # start training the model
    random_state = 10
    metrix, tar = utils.get_data(mall_id)
    x_train, x_test, y_train, y_test = train_test_split(
        metrix, tar, test_size=0.1, random_state=random_state)
    # xgboost method, based on boosting trees
    # with these parameter settings training is slow
    clf_name = "xgboost"
    save_dir = "./model/" + clf_name + "_" + mall_id + "_model.m"
    n_est = 50
    clf = XGBClassifier(
        learning_rate=0.1,  # learning rate, typically 0.01-0.2
        n_estimators=n_est,
        max_depth=5,  # maximum tree depth, usually 3-10
        min_child_weight=1,  # minimum sum of instance weight in a child; larger avoids overfitting, too large underfits
        gamma=0,  # minimum loss reduction required to split a node; the larger, the more conservative
        subsample=0.8,  # row sampling ratio per tree; smaller is more conservative, too small underfits; typical 0.5-1
        colsample_bytree=0.8,  # column sampling ratio per tree
        objective='binary:logistic',  # binary classification
        nthread=4,  # number of threads
        scale_pos_weight=1,  # set to a positive value when classes are very imbalanced to help convergence
        seed=0)  # random seed, set for reproducible results
    print(utils.get_time(), ' ', mall_id, ' starts...')
    train_time = time.time()
    clf.fit(x_train, y_train)
    train_time = time.time() - train_time
    score = clf.score(x_test, y_test)
    joblib.dump(clf, save_dir)
    print(utils.get_time(), ' saved a model for ', mall_id, ' score: ', score,
          '  train time : ', train_time)
    train_time = int(train_time)
    return (score, n_est, train_time)
Example #27
def xgb_result(x, y, testx, testy, para):
    print("----- Working on 'xgb' method...")
    #dtrain = xgb.DMatrix(x, label=y)
    #dtest = xgb.DMatrix(testx, label=testy)
    xgb0 = XGBClassifier(**para)
    #    with open('xgb.pickle','rb') as f:
    #        xgb0 = pickle.load(f)
    time0 = time.time()
    #bst = xgb.train(dtrain=dtrain,**para)
    xgb0.fit(x, y)
    train_time = time.time() - time0
    confusion, test_time = Errmodel(xgb0,
                                    x,
                                    y,
                                    testx,
                                    testy,
                                    ntree_limit=xgb0.booster().best_iteration)
    print(confusion, '\n', train_time, '\n', test_time)
    importance = sorted(xgb0.booster().get_score().items(), key=lambda x: x[1])
    result = {
        'model': xgb0,
        'confusion': confusion,
        'train_time': train_time,
        'test_time': test_time,
        'importance': importance,
        'best_iter': xgb0.booster().best_iteration
    }
    print("best_iter", xgb0.booster().best_iteration)
    return result
Example #28
def train_classify(X_train, y_train):
    """
    Train with XGBClassifier
    :param X_train:
    :param y_train:
    :return:
    """
    print("Training with XGBClassifier")
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=80,  # number of trees
        max_depth=6,  # tree depth
        min_child_weight=1,  # minimum leaf-node weight
        gamma=0.,  # coefficient of the penalty on the number of leaf nodes
        subsample=0.8,  # randomly sample 80% of rows to build each tree
        colsample_bytree=0.8,  # randomly sample 80% of features to build each tree
        objective='multi:softmax',  # objective (loss) function
        scale_pos_weight=1,  # handle class imbalance
        random_state=27  # random seed
    )

    model.fit(X_train,
              y_train,
              eval_set=[(X_train, y_train)],
              eval_metric="mlogloss",
              early_stopping_rounds=10,
              verbose=True)

    return model
Example #29
def sampleTrain():
    LABEL = 'LABEL'
    dpath = 'dan_train_{}.csv'
    data4 = pd.read_csv(dpath.format(201804),index_col=ID_COLUMN)
    data5 = pd.read_csv(dpath.format(201805),index_col=ID_COLUMN)
    a1 = data4[data4.LABEL == 1]
    del data4
    b1 = data5[data5.LABEL == 1]
    b0 = data5[data5.LABEL == 0].sample(n=(a1.shape[0]+b1.shape[0])*35)
    del data5
    data = b0.append(a1).append(b1).sort_index()
    X = data.drop(columns=LABEL)
    y = data[LABEL]
    params = {'learning_rate': 0.01,
     'n_estimators': 1000,
     'max_depth': 8,
     'min_child_weight': 0,
     'gamma': 0.4,
     'subsample': 0.9,
     'colsample_bytree': 0.6,
     'scale_pos_weight': 10,
     'n_jobs': 50,
     'objective': 'binary:logistic',
     'reg_alpha': 1,
     'reg_lambda': 1}
    model = XGBClassifier(**params)
    model.fit(X, y, eval_metric=metrics.f1_score)
    del X
    del y
    del data
    joblib.dump(model, 'CDanCdmaModel_{}.pkl'.format(format(datetime.now().strftime('%d%H%M'))))
    data, X_test, y_test = get_transformed_data(month='201806', frac=1)
    print_evaluate(model, X_test, y_test)
Example #30
def modeling_RF():
    estimator = None
    try:
        df1 = pd.read_csv('last_total.csv', encoding='cp949')

        df_dummy = pd.get_dummies(df1)
        train, test = train_test_split(df_dummy,
                                       test_size=0.2,
                                       random_state=1234)
        train_x = train.drop('target_bool', axis=1)
        train_y = train['target_bool']
        test_x = test.drop('target_bool', axis=1)
        test_y = test['target_bool']

        xgb = XGBClassifier(random_state=1234,
                            learning_rate=0.6000000000000001,
                            max_depth=9,
                            n_estimators=200)
        xgb.fit(train_x, train_y)
        abc = xgb.score(train_x, train_y)

    except Exception as e:
        print(e)
    finally:
        pass

    return abc
Example #31
    def pred(self, X):
        """
        Computes the Xgboost and gradient boost predictions for given data.
        :param X: pre-processed data
        :return: None
        """
        Y = X['isFraud']
        X = X.drop(['nameOrig', 'nameDest', 'isFlaggedFraud', 'isFraud'], axis=1)
        # hot-encoding of transaction type
        X.loc[X.type == 'TRANSFER', 'type'] = 0
        X.loc[X.type == 'CASH_OUT', 'type'] = 1
        X.type = X.type.astype(int)
        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
        wts = sum((Y == 0)) / sum(1.0 * (Y == 1))
        # Grid search -- checking of best params
        # uncomment to compute params
        # print("Grid searching....")
        # self.param_tuning(x_train, y_train, wts)

        clf = XGBClassifier(max_depth=1, gamma=0.1, scale_pos_weight=wts, n_jobs=4)
        print("-----------------------------TRAINING XGBOOST------------------------------------")
        probs = clf.fit(x_train, y_train).predict_proba(x_test)
        probY = probs[:, 1]
        self.plot_roc(y_test, probY)

        print("-----------------------------TRAINING Gradient Boosting------------------------------------")
        clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
        probs = clf.fit(x_train, y_train).predict(x_test)
        self.plot_roc(y_test, probs)
Example #32
def train_model(train, test):
    x_train, y_train = get_fea_lab(train)
    x_test, y_test = get_fea_lab(test)
    xgb = XGBClassifier()
    print(xgb)

    paras = {
        'max_depth': range(1, 3),
        'min_child_weight': [i / 10 for i in range(0, 10)],
        'scale_pos_weight': range(10, 100, 10)
    }
    gscv = GridSearchCV(estimator=xgb,
                        param_grid=paras,
                        cv=5,
                        scoring='roc_auc')
    gscv.fit(x_train, y_train)
    print(gscv.best_params_)
    print(gscv.best_score_)
    print(gscv.score(x_test, y_test))
    result = gscv.predict(x_test)
    print(confusion_matrix(y_test, result))
    print(classification_report(y_test, result))

    xgb.fit(x_train, y_train)
    test_result = xgb.predict(x_test)
    print(confusion_matrix(y_test, test_result))
    print(classification_report(y_test, test_result))
Example #33
    def get_leaf(self):
        self.get_data("oneHot")
        n_estimators = 300
        clf_xgb = XGBClassifier(max_depth=4,
                                learning_rate=0.0125,
                                n_estimators=300,
                                subsample=0.6,
                                colsample_bytree=0.7,
                                seed=4)
        #clf_xgb = XGBClassifier(max_depth=4, n_estimators=300)
        clf_xgb.fit(self.x_train, self.y_train)

        leafes_train = list(clf_xgb.apply(self.x_train))
        leafes_test = list(clf_xgb.apply(self.x_test))

        # append the max and min values so the one-hot encoding is consistent between train and test
        max_train = np.array(leafes_train).max()
        min_train = np.array(leafes_train).min()

        max_test = np.array(leafes_test).max()
        min_test = np.array(leafes_test).min()
        max_value = max(max_train, max_test)
        min_value = min(min_train, min_test)
        for i in range(min_value, max_value + 1):
            leafes_train.append([i] * n_estimators)

        enc = OneHotEncoder()
        enc.fit(leafes_train)
        # drop the appended values
        leafes_train_feature = enc.transform(
            leafes_train).toarray()[:-(max_value - min_value + 1), :]
        print(leafes_train_feature.shape, len(leafes_train))
        return leafes_train_feature, self.y_train, enc.transform(
            leafes_test).toarray(), self.y_test
Example #34
def extract_leaf_feature(features, targets, train_indexes, params):
    model = XGBClassifier(**params)
    model.fit(features[train_indexes], targets[train_indexes])
    booster = model.booster()
    dmatrix = xgb.DMatrix(features)
    leaf = booster.predict(dmatrix, pred_leaf=True)
    encoder = sklearn.preprocessing.OneHotEncoder()
    leaf_feature = encoder.fit_transform(leaf)
    return leaf_feature
Example #35
def main(training_data, test_data):
    # Merging data to ensure consistent cleaning. Putting marker variable to separate later.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()

    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, saving the key for later transformation
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(training_data['country_destination']), index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='multi:softprob',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(pred_df.as_matrix(), target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.as_matrix())

    # Selecting the top 5 most likely for each respondent and stacking. 
    # This section is VERY slow and could use being optimized
    model_probs = pd.DataFrame(preds, index=test_pred.index, columns=labels.classes_)

    stacked_probs = pd.Series()
    for i in model_probs.index:
        temp = model_probs.loc[i, :]
        temp_sort = pd.DataFrame(temp.sort_values(ascending=False)[:5].index)

        temp_sort['id'] = i
        temp_sort.columns = ['country', 'id']

        stacked_probs = pd.concat([stacked_probs, temp_sort])

    # # Selecting classes with highest probabilities, compiling into list
    # ids = []
    # cts = []
    # test_ids = pd.Series(test_data.index)
    # for i in range(len(test_ids)):
    #     idx = test_data.index[i]
    #     ids += [idx] * 5
    #     cts += labels.inverse_transform(np.argsort(model_probs[i])[::-1])[:5].tolist()
    #
    # predictions = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output
Example #36
def main():
    data_train = pd.read_csv(args.train_dataset)
    X_train = data_train.drop(['Id', 'Class'], axis=1)
    y_train = data_train.loc[:, 'Class']
    data_test = pd.read_csv(args.test_dataset)
    X_test = data_test.drop(['Id'], axis=1)
    Id = data_test.loc[:, 'Id']
    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)
    prediction = clf.predict_proba(X_test)
    columns = ['Prediction'+str(i) for i in range(1, 10)]
    prediction = pd.DataFrame(prediction, columns=columns)
    results = pd.concat([Id, prediction], axis=1)
    return (clf, results)
Example #37
def objective(space):

    clf = XGBClassifier(n_estimators=int(space['n_estimators']),
                        objective='binary:logistic',
                        seed=37,
                        learning_rate=space['learning_rate'],
                        max_depth=space['max_depth'],
                        min_child_weight=space['min_child_weight'],
                        colsample_bytree=space['colsample_bytree'],
                        subsample=space['subsample'])

    clf.fit(xTrain, yTrain, eval_metric="logloss")
    pred = clf.predict_proba(xValid)[:, 1]
    loss = log_loss(yValid, pred)
    return{'loss': loss, 'status': STATUS_OK}
Example #38
def myThreadFunc(ThreadID):
	X_train = Xy[ThreadID][0]
	X_test = Xy[ThreadID][1]
	y_train = Xy[ThreadID][2]
	y_test = Xy[ThreadID][3]
		
	y_train2 = le.transform(y_train)   
	y_test2 = le.transform(y_test)   

	clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha)      
	clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2, early_stopping_rounds=early_stopping_rounds, verbose=False)
	y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
	score = calculate_score(y_predicted, y_test2)
	print(score, clf.booster().best_ntree_limit)
	
	train_and_test_scores[ThreadID] = score
Example #39
def apply_xgb_ens(y_valid, valid_folder='Valid', test_folder='Test'):
    """
    Ensembler based on xgboost Gradient boosting.
    """
    #Loading data
    X, X_test, n_preds, n_class = get_X_X_Test(valid_folder, test_folder)
    y = y_valid
    
    #Defining classifier
    xgb = XGBClassifier(max_depth=4, learning_rate=0.05, n_estimators=200,
                        objective='multi:softprob', gamma=0., 
                        max_delta_step=0., subsample=0.9, colsample_bytree=0.9,
                        seed=0)  
    xgb.fit(X, y)   
    y_pred = xgb.predict_proba(X_test)
    return y_pred      
    
    
    
Example #40
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """ Perform prediction using a combination of XGB and RandomForests. """
    predictions = np.zeros((len(testing), len(set(labels))))
    # Predictions using xgboost.
    for i in range(xgb_votes):
        print('XGB vote %d' % i)
        xgb = XGBClassifier(
            max_depth=DEPTH_XGB, learning_rate=LEARNING_XGB,
            n_estimators=ESTIMATORS_XGB, objective='multi:softprob',
            subsample=SUBSAMPLE_XGB, colsample_bytree=COLSAMPLE_XGB)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)
    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print('RandomForest vote %d' % i)
        rand_forest = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF, criterion=CRITERION_RF, n_jobs=JOBS_RF,
            max_depth=DEPTH_RF, min_samples_leaf=MIN_LEAF_RF, bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)
    return predictions
Example #41
def xgboostinitial_predictor(train_path, test_path, eval_path):
    # Loading the data
    print('Loading the data...')
    train = pd.read_csv(train_path, index_col=0)
    test = pd.read_csv(test_path, index_col=0)
    eval_df = pd.read_csv(eval_path, index_col=0)
    target = train['target'].copy()
    train.drop('target', axis=1, inplace=True)

    # Training model
    print('Model training begins...')
    # xgtrain = xgb.DMatrix(train.values, target.values, missing=np.nan)
    # xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'logloss', 'eta': 0.01,
    #                   'subsample': 0.5, 'colsample_bytree': 0.5, 'max_depth': 10, 'silent': 0}
    #
    # xgb_model = xgb.train(xgboost_params, xgtrain, learning_rates=0.3)

    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25, objective='binary:logistic',
                              subsample=0.5, colsample_bytree=0.5, seed=0)
    xgb_model.fit(train.as_matrix(), target.tolist())

    # Running the model
    print('Making predictions....')
    # xgtest = xgb.DMatrix(test.values)
    # xgeval = xgb.DMatrix(eval_df)

    test_preds = xgb_model.predict_proba(test.as_matrix())
    eval_preds = xgb_model.predict_proba(eval_df.as_matrix())

    print('Cleaning predictions to match expected format....')
    test_output = pd.DataFrame(test_preds, index=test.index)
    print(test_output.columns)
    test_output = test_output[1]
    test_output.columns = ['PredictedProb']

    eval_output = pd.DataFrame(eval_preds, index=eval_df.index)
    eval_output = eval_output[1]
    eval_output.columns = ['PredictedProb']

    return test_output, eval_output
Example #42
def train_classifier(X, y, clf_name='xgb'):
    if clf_name == 'xgb':
        clf = XGBClassifier(
            n_estimators=ESTIMATORS_XG,
            objective=OBJECTIVE_XG,
            max_depth=DEPTH_XG,
            learning_rate=LEARNING_RATE_XG,
            subsample=SUBSAMPLE_XG,
            colsample_bytree=COLSAMPLE_BYTREE_XG,
            seed=0,
        )
    else:
        clf = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF,
            criterion=CRITERION_RF,
            n_jobs=JOBS_RF,
            max_depth=DEPTH_RF,
            min_samples_leaf=MIN_LEAF_RF,
            min_samples_split=MIN_SPLIT_RF,
            max_features=MAX_FEATURES_RF,
            bootstrap=True,
        )
    clf.fit(X, y)
    return clf
Example #43
def get_xgboost_classifier(X_train, y_train, X_val, y_val,params=None, tag=""):
    
    param_grid = {'max_depth':[3,5,7], 'min_child_weight': [1,3,5], 'n_estimators': [50]}
    
    if params is None:
        xgb = XGBClassifier(
                 learning_rate =0.2,
                 objective= 'binary:logistic',
                 seed=27)
                 
        t = start("training xgboost ")
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10,test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(xgb, param_grid, cv=cv, n_jobs=1, scoring='roc_auc')
        clf = clf.fit(X_train,y_train)
        report(t, nitems=10*len(param_grid))
        
        print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
        print("With parameters:")
    
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
    else:
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train, eval_set =  [(X_train,y_train),(X_val,y_val)], eval_metric='auc', verbose=False)
        
        if plot_cv_curves:
            train = clf.evals_result()['validation_0']['auc']
            val = clf.evals_result()['validation_1']['auc']
        
            plot_cv_curve(train, val, tag)
        
        if plot_feature_importance:
            plot_feature_importance(clf, tag)

    return clf
Example #44
    plt.xlabel('Predicted label')


#define X y
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

#ClusterCentroids
cc = ClusterCentroids(random_state=0)
os_X,os_y = cc.fit_sample(X_train,y_train)

#XGboost
clf_XG = XGBClassifier(learning_rate= 0.3, min_child_weight=1,
                       max_depth=6,gamma=0,subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000, scale_pos_weight=1000)  
clf_XG.fit(os_X, os_y,eval_set=[(os_X, os_y), (X_test, y_test)],eval_metric='auc',verbose=False)  
evals_result = clf_XG.evals_result()  
y_true, y_pred = y_test, clf_XG.predict(X_test)  

#F1_score, precision, recall, specifity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))
 
#Compute confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
print("Specificity: ", float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]))
specifity = float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]) 
print("G score: ", math.sqrt(recall * specifity))
Example #45
#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]


# In[ ]:


#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  


# In[ ]:

ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub0.csv',index=False)
Example #46
#  reg_alpha=0.1,
#  seed=27)
# modelfit(xgb1, df_train, predictors, targetname, early_stopping_rounds=50)


xgb1 = XGBClassifier(
 learning_rate=0.01,
 n_estimators=700,
 max_depth=5,
 min_child_weight=8,
 gamma=0.3,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 scale_pos_weight=1,
 seed=27)



xgb1.fit(df_train[predictors], df_train[targetname])
df_test['target'] = xgb1.predict(df_test[predictors])




df_test['target'] = df_test['target'].apply(lambda x: 'Y' if x==1 else 'N')

submission = pd.DataFrame()
submission['Loan_ID'] = df_test['Loan_ID']
submission['Loan_Status'] = df_test['target']
submission.to_csv('submission_XGB_retunned.csv', index=False)
Example #47
    "signup_app",
    "first_device_type",
    "first_browser",
]
X = split_categorical_variables(train, categorical_variables)
y = X.pop("country_destination")
label_table = LabelEncoder()
y = label_table.fit_transform(y.values)


# # Let's try a gradient boosting classifier

# In[56]:

xgb_model = XGBClassifier(max_depth=3, n_estimators=10, learning_rate=0.1)
xgb_model.fit(X, y)


# ## How did we do?
#
# * To start, let's look at how well we did just predicting the final outcome


pred = xgb_model.predict_proba(X)

# Find the most probable country
best_country = []  # Not used for now
bestId = []
for i in range(len(pred)):
    bestId.append(np.argsort(pred[i])[::-1])
    best_country.append(label_table.inverse_transform(bestId[-1]))
Example #48
	for iter in range(iterations):
#		if iter < 5:
#			continue
		X_train = Xy[iter][0]
		X_test = Xy[iter][1]
		y_train = Xy[iter][2]
		y_test = Xy[iter][3]

		y_train2 = le.transform(y_train)   
		y_test2 = le.transform(y_test)   

		print('fit start', datetime.now())

		clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
		clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2)

submit = 0
if submit == 1:
#	n_estimators = 395
	n_estimators = 349
	#n_estimators = clf.booster().best_ntree_limit 
	print(n_estimators)

	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=nthread)      
	clf2.fit(X, y)
	#clf2.fit(X, y, eval_set=[(X, y2)], eval_metric=calculate_score_dummy, early_stopping_rounds=n_estimators)

	y_predicted = clf2.predict_proba(X_predict)  
Example #49
X_va = data_valid.values
y_va = y_valid

model = XGBClassifier(n_estimators=1,
                      learning_rate=0.1,
                      max_depth=1000,
                      min_child_weight=1000,
                      reg_lambda=0,
                      seed=12)


for cb in [0.1, 1.]:
    print('\ncolsample_bytree: %.1f' % cb)
    model.colsample_bylevel = cb
    model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr), (X_va, y_va)],
              eval_metric='auc', verbose=True)

y_train=y_train.astype(int)

n = data_train.shape[0]
n = 327690
dtrain = xgb.DMatrix(data_train.values[:n], label = y_train[:n])

param2 = {'objective':'binary:logistic','tree_method':'approx', 'sketch_eps':0.00392,
         'eta':.1, 'min_child_weight':10, 'max_depth':10, 'lambda':0,
         'eval_metric':['logloss','auc'],
         'nthread':2, 'seed':123, 'silent':1}

param2 = {'objective':'binary:logistic',
         'eta':.1, 'max_depth':10,# 'lambda':0,
         'eval_metric':['logloss','auc'],
Example #50
num_rounds=206
z=[]
dtrain=xgb.DMatrix(train[features],label=y)
clf=xgb.train(params,dtrain,num_rounds)

importance=clf.get_fscore(fmap='xgb.fmap')
importance=sorted(importance.items(),key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

bst=list(df['feature'][df.fscore>0.001])
#df.to_csv('select.csv',index=False)
X_train,X_valid,y_train,y_valid=train_test_split(train[bst],y,test_size=0.6,random_state=10)
print ('start xgboost learning...')
alg = XGBClassifier(max_depth=6, learning_rate=0.05, n_estimators=1210, objective='multi:softprob', subsample=0.8, colsample_bytree=1,min_child_weight=1)                    
alg.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)],eval_metric='mlogloss',early_stopping_rounds=10,verbose=True)
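# Note (assumption: an xgboost version whose sklearn wrapper exposes best_iteration
# and accepts ntree_limit): after early stopping, predictions can be limited to the
# best round instead of using all 1210 trees.
print('best iteration:', alg.best_iteration, 'best mlogloss:', alg.best_score)
y_pred_best = alg.predict_proba(X_valid, ntree_limit=alg.best_ntree_limit)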


#plt.figure()
#df.plot()
#df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
#plt.title('XGBoost Feature Importance')
#plt.xlabel('relative importance')
#plt.gcf().savefig('feature_importance_xgb.png')
y_pred = alg.predict_proba(test[bst])
result=pd.DataFrame(y_pred,columns=['predict_0','predict_1','predict_2'])
result['id']=test.id.values.copy()
#result.to_csv('xgb10.csv',index=False)


Ejemplo n.º 51
0
def build_model(X, y):
    print("Fitting classifier")
    xgb = XGBClassifier(max_depth = 4, learning_rate = 0.25, n_estimators = 25,
                            objective = 'multi:softprob', subsample = 0.6, colsample_bytree = 0.6)
    xgb.fit(X, y)
    return xgb
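# Usage sketch (assumption: X, y, X_test are prepared as in the surrounding examples).
model = build_model(X, y)
proba = model.predict_proba(X_test)   # shape (n_samples, n_classes)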
Ejemplo n.º 52
0
data.lon.unique().shape

data_x=pd.get_dummies(data.action_type,prefix="action_type")
cols=["combined_shot_type","game_event_id","period","playoffs",
      "shot_type","shot_zone_area","shot_zone_basic","shot_zone_range",
      "matchup","opponent","game_date","shot_distance","minutes_remaining","seconds_remaining",
      "loc_x","loc_y"]
for col in cols:
    data_x=pd.concat([data_x,pd.get_dummies(data[col],prefix=col),],axis=1)
train_x=data_x[~pd.isnull(data.shot_made_flag)]  # ~ rather than unary minus for boolean masks (required by current pandas)
test_x=data_x[pd.isnull(data.shot_made_flag)]
train_y=data.shot_made_flag[~pd.isnull(data.shot_made_flag)]

clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550,
                     subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
y_pred = clf.predict(train_x)
print("Number of mislabeled points out of a total %d points : %d"  % (train_x.shape[0],(train_y != y_pred).sum()))

def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    print(ll)
    return ll
    
logloss(train_y,clf.predict_proba(train_x)[:,1])
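# Cross-check (assumptions: sp refers to scipy imported earlier in the script, and
# sklearn's log_loss should agree with the hand-rolled version above up to the
# clipping epsilon).
from sklearn.metrics import log_loss
print(log_loss(train_y, clf.predict_proba(train_x)[:, 1]))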

test_y=clf.predict_proba(test_x)[:,1]
Ejemplo n.º 53
0
def xgbost(x,y,targetx):
    clf_xgb = XGBClassifier(n_estimators=1000,max_depth=6, learning_rate=0.0075,subsample=0.7,colsample_bytree=0.7,seed=4)
    clf_xgb.fit(x,y)
    return clf_xgb.predict_proba(targetx)[:,1]
#subsample : float    Subsample ratio of the training instance.
#colsample_bytree : float    Subsample ratio of columns when constructing each tree.
#seed : int    Random number seed.

xgb = XGBClassifier(max_depth=6,
                    learning_rate=0.1,
                    n_estimators=30,
                    objective='multi:softprob',
                    subsample=0.8,
                    colsample_bytree=0.8,
                    min_child_weight=1,
                    seed=0)
#fit on the training features and the encoded labels, monitoring mlogloss on the same data
eval_set = [(X, y)]

xgb.fit(X, y, eval_set=eval_set, eval_metric='mlogloss')
#predict_proba gives the probability of each user belonging to each class (country),
#returned as a numpy array of shape (n_samples, n_classes)
Ypred = xgb.predict_proba(X_test)

#Taking the 5 classes with highest probabilities
IDS = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    IDS += [idx] * 5
    #Map the encoded class indices back to the original country labels
    cts += le.inverse_transform(
        np.argsort(Ypred[i])[::-1])[:5].tolist()
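# Submission sketch (assumptions: pandas is imported as pd, the Airbnb-style format
# used elsewhere in these examples with five (id, country) rows per user, and the
# filename is illustrative).
sub = pd.DataFrame({'id': IDS, 'country': cts}, columns=['id', 'country'])
sub.to_csv('sub_xgb_top5.csv', index=False)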
Ejemplo n.º 55
0
print data_train.shape
print data_test.shape


print 'Started Computing train set labels'
label_set = np.sign(label_set['Click'])
label_set[label_set == -1] = 0
print 'Finished computing train set labels'

# fit estimator
print "start XGBClassifier"
n_samples = data_train.shape[0]
est=XGBClassifier(n_estimators=200, learning_rate=0.1, silent= False)

print "start fitting"
est.fit(data_train, label_set)
# predict class labels
probs = est.predict_proba(data_test)

print "cross validation start"
cv = cross_validation.ShuffleSplit(n_samples, n_iter=10, random_state=0)
scores = cross_validation.cross_val_score(est, data_train, label_set, cv=cv)
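# The CV scores computed above are otherwise unused; printing the mean makes the
# check visible (sketch; with no scoring argument this falls back to the
# estimator's default .score, accuracy for a classifier).
print "cross-validation mean score:", scores.mean()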
mean = np.mean(probs[:, 1])
std = np.std(probs[:, 1])
print "Test predicted Mean:", mean
print "Test predicted STD:", std
df = pd.DataFrame(probs[:, 1])
df.columns = ["Prediction"]
df.index += 1
df.to_csv("output_prediction.csv", index_label="Id")
Ejemplo n.º 56
0
def xgboost_algorithm(XTrain,YTrain,XTest):
    xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0)                  
    xgb.fit(XTrain, YTrain)
    y_pred_xgboost = xgb.predict_proba(XTest) 
    return y_pred_xgboost
Ejemplo n.º 57
0
def model1(df_train, df_test):
	print('model1')

	print('rows', df_train.shape[0]) 

	#remove rows with no sessions data
	hassessions = df_train['HasSessions']
	df_train = df_train.drop(hassessions[hassessions == 0].index)

	#remove rows older than 1/1/2014
	#dac2 = df_train.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	#print('removing rows', len(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index))
	#df_train = df_train.drop(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index)

	print('rows', df_train.shape[0]) 

	labels = df_train['country_destination'].values
	df_train = df_train.drop(['country_destination'], axis=1)
	piv_train = df_train.shape[0]

	#Creating a DataFrame with train+test data
	df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
	#Removing id and date_first_booking
	df_all = df_all.drop(['id', 'date_first_booking', 'sessions_count', 'HasSessions'], axis=1)

	#Filling nan
	df_all = df_all.fillna(-1)

	#####Feature engineering#######
	print('features in the csv', df_all.shape[1])

	#date_account_created
	print('dac', datetime.now())
	dac = np.vstack(df_all.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
	df_all['dac_year'] = dac[:,0]
	df_all['dac_month'] = dac[:,1]
	df_all['dac_day'] = dac[:,2]

	#day of week, season
	print('dac2', datetime.now())
	dac2 = df_all.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
	df_all['dac_weekday'] = dac2.apply(lambda x: x.weekday())
	df_all['dac_season'] = dac2.apply(calculate_season)

	df_all = df_all.drop(['date_account_created'], axis=1)

	#timestamp_first_active
	print('tfa', datetime.now())
	tfa = np.vstack(df_all.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
	df_all['tfa_year'] = tfa[:,0]
	df_all['tfa_month'] = tfa[:,1]
	df_all['tfa_day'] = tfa[:,2]
	df_all = df_all.drop(['timestamp_first_active'], axis=1)

	#Age
	print('age', datetime.now())
	av = df_all.age.values
	df_all['age'] = np.where(np.logical_or(av<14, av>100), -1, av)

	#remove features
	print('remove features', datetime.now())
	df_all = df_all.drop(['Sessions' + str(i) for i in [0]], axis=1)
	df_all = df_all.drop(['SessionsD' + str(i) for i in range(456)], axis=1)

	print('features in the model', df_all.shape[1])

	#One-hot-encoding features
	print('one-hot', datetime.now())
	ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser', 'dac_season', 'sessions_preferred_device'] 

	for f in ohe_feats:
		df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
		df_all = df_all.drop([f], axis=1)
		df_all = pd.concat((df_all, df_all_dummy), axis=1)
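	# Equivalent single call (sketch): pd.get_dummies with columns= performs the same
	# per-feature expansion and drop as the loop above, kept commented out here.
	#df_all = pd.get_dummies(df_all, columns=ohe_feats)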

	#Splitting train and test
	vals = df_all.values
	X = vals[:piv_train]
	y = labels
	X_predict = vals[piv_train:]

	#learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 6, 0.5, 2, 2, 2, 1
	learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha =  0.03, 8, 0.5, 2, 1, 2, 0

	early_stopping_rounds = 25
	if learning_rate <= 0.03:
		early_stopping_rounds = 50

	print(learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha)

	#n_estimators = 455
	n_estimators = 350
	#n_estimators = 1
	print(n_estimators)

	print('fit start', datetime.now())
	clf2 = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma, min_child_weight=min_child_weight, seed=0, silent=True, reg_lambda=reg_lambda, reg_alpha=reg_alpha, nthread=-1)      
	clf2.fit(X, y)
	y_predicted2 = clf2.predict_proba(X_predict)  

	return y_predicted2
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID','TARGET'], axis=1).values

y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=600,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.6815,
 colsample_bytree=0.701,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain, num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics=['auc'], early_stopping_rounds=50, show_progress=False)
xgb1.set_params(n_estimators=cvresult.shape[0])
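# Optional check (assumption: this xgboost version names the CV result columns
# 'test-auc-mean' / 'test-auc-std'): the last cvresult row holds the cross-validated
# AUC at the chosen number of rounds.
print("CV rounds: %d  test AUC: %.5f" % (cvresult.shape[0], cvresult['test-auc-mean'].iloc[-1]))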
xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:,1]

submission = pd.DataFrame({"ID":y_test, "TARGET":output})
submission.to_csv("submission.csv", index=False)
Ejemplo n.º 59
0
del device_freq
del action_freq

#Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)   
X_test = vals[piv_train:]

#Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)                  

print('scores:', NDCG.cross_validation_score(X, labels,xgb,5))
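# NOTE: the triple-quoted string below disables the fit/predict/submission block
# (apparently an intentional toggle), so only the NDCG cross-validation above runs.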
'''
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)  

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv',index=False)
'''
Ejemplo n.º 60
0
File: test1.py Project: mircean/ML
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    #train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts), left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0]/tmp

    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]
        df_train = df_train.loc[mask.values]

    else:
        n_places_th_local = 2

        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True]/df_train.shape[0]

        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True]/df_train.shape[0]

        df_train = df_train.loc[mask.values]


    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)
        
    #test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:    
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=5000, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)

            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
   
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric=calculate_score, early_stopping_rounds=early_stopping_rounds, verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
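                # NOTE: 'abc' is undefined here, so this line raises NameError; it
                # looks like a deliberate guard (or leftover debug) for a path the
                # author did not intend to run.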
                abc += 1
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                #some classes have fewer than n_folds samples, so stratified KFold cannot be used
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators, early_stopping_rounds=early_stopping_rounds, verbose_eval=10 if one_cell == 1 else False, show_stdv=False, folds=folds, feval=calculate_score)

                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, objective='multi:softprob', subsample=ss, colsample_bytree=cs, gamma=gamma, min_child_weight=min_child_weight, reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators = 300, n_jobs = -1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
    
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)

                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)

                score = np.array(scores_local).mean()
            else:
                #some classes have fewer than n_folds samples, so stratified KFold cannot be used
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
    
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)

                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)

                    score = np.array(scores_local).mean()
                    print('  ', x_start, y_start, score)
                    scores_cv.append(score)

                score = np.array(scores_cv).mean()
    
    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)
    ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:,::-1][:,:n_topx])    

    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)

    return [score, row_ids, labels_predict]
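# Usage sketch (assumptions: df_train_cell / df_test_cell are hypothetical names for
# the rows of one grid cell and (0.0, 0.0) is its origin; the surrounding script
# builds these per cell and also defines the globals do_cell relies on).
score, row_ids, labels_predict = do_cell((df_train_cell, df_test_cell, 0.0, 0.0))
print(score)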