Ejemplo n.º 1
0
 def eval_fn(params):
     """Cross-validate an XGBClassifier configured with ``params``.

     For every (train, validation) index pair in the module-level ``skf`` the
     model is fitted on the training part with early stopping (50 rounds,
     'logloss') on the validation part.

     Parameters
     ----------
     params : dict
         Keyword arguments forwarded to ``model.set_params``.

     Returns
     -------
     The sum of ``model.best_score`` over the folds divided by ``n_folds``.

     Side effects
     ------------
     - Appends the averaged ``best_iteration`` to ``n_estimators_lst``.
     - When ``X_valid`` is not None, refits on the full ``X_train`` with that
       averaged tree count and appends the hold-out log-loss to
       ``score_valid``.
     - Prints a one-line summary when ``verbose`` is truthy.
     """
     model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate, seed=seed)
     # The candidate parameters are identical for every fold: apply them once.
     model.set_params(**params)
     score = 0
     n_estimators = 0
     for tr, va in skf:
         X_tr, y_tr = X_train[tr], y_train[tr]
         X_va, y_va = X_train[va], y_train[va]
         model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                   early_stopping_rounds=50, verbose=False)
         score += model.best_score
         n_estimators += model.best_iteration
     # NOTE(review): assumes n_folds equals the number of splits in skf - confirm.
     score /= n_folds
     # Floor division keeps the tree count an integer on Python 3 as well
     # (on Python 2 this is what `/` already did for ints); the value is
     # assigned to model.n_estimators below.
     # NOTE(review): xgboost documents best_iteration as 0-based, so this
     # average may be one round short of the early-stopped optimum - confirm.
     n_estimators //= n_folds
     n_estimators_lst.append(n_estimators)
     result_str = "train:%.4f ntree:%5d  " % (score, n_estimators)
     if X_valid is not None:
         # Refit on all training data with the averaged number of trees and
         # score the result on the hold-out set.
         model.n_estimators = n_estimators
         model.fit(X_train, y_train)
         pr = model.predict_proba(X_valid)[:,1]
         sc_valid = log_loss(y_valid, pr)
         score_valid.append(sc_valid)
         result_str += "valid:%.4f" % sc_valid
     if verbose:
         # Single-argument call form prints identically on Python 2 and 3.
         print(result_str)
     return score
Ejemplo n.º 2
0
    # Gradient boosting: the tree budget is deliberately huge (10000) because
    # the number of rounds actually used is chosen by early stopping below.
    xgb = XGBClassifier(max_depth=5,
                        learning_rate=0.1,
                        n_estimators=10000,
                        objective='multi:softprob',
                        seed=random_state)
    # Compute the best number of iterations on an internal validation set:
    # 15% of (XV, y_valid) is held out and used only for early stopping.
    XV_train, XV_valid, yv_train, yv_valid = train_test_split(
        XV, y_valid, test_size=0.15, random_state=random_state)
    xgb.fit(XV_train,
            yv_train,
            eval_set=[(XV_valid, yv_valid)],
            eval_metric='mlogloss',
            early_stopping_rounds=15,
            verbose=False)
    # Fix the tree count found by early stopping for the final refit.
    # NOTE(review): xgboost documents best_iteration as 0-based, so this may
    # train one round fewer than the best one (and zero rounds if the first
    # round was best) - confirm against the xgboost version in use.
    xgb.n_estimators = xgb.best_iteration

    # Refit on the whole (XV, y_valid) set, without early stopping, then
    # score the predicted probabilities for XT against y_test.
    xgb.fit(XV, y_valid)
    y_gb = xgb.predict_proba(XT)
    ll_gb.append(log_loss(y_test, y_gb))  # Save the log-loss score

# Turn the accumulated log-loss lists into arrays. ll_sc is regrouped into
# rows of len(clfs) consecutive scores and transposed, giving one row per
# entry of clfs.
ll_sc = np.transpose(np.array(ll_sc).reshape(-1, len(clfs)))
ll_eA, ll_eB, ll_e3, ll_lr, ll_gb = map(
    np.array, (ll_eA, ll_eB, ll_e3, ll_lr, ll_gb))

# ## Plotting the results
# Notice that sklearn LogisticRegression and XGBoost produce better results for problems with few classes, but as the number of classes increases
# the proposed ensembling methods outperform LogisticRegression and XGBoost. Again the question here is whether it is possible to fine-tune
Ejemplo n.º 3
0
            # For every fold, grow a random forest 10 trees at a time and keep
            # the size that gives the lowest validation log-loss.
            for train_index, test_index in folds:
                # Must be re-created for every fold: with warm_start=True,
                # fit() adds trees to the existing forest instead of
                # starting over.
                clf = RandomForestClassifier(n_estimators=10, warm_start=True, n_jobs=-1)

                # NOTE(review): .loc is label-based; this presumably relies on
                # X_train having a default 0..n-1 index - confirm.
                X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
                y_train2, y_test2 = y_train[train_index], y_train[test_index]

                X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)

                X_train2 = csr_matrix(X_train2.values)
                X_test2 = csr_matrix(X_test2.values)

                score = 100  # sentinel: any log-loss below 100 replaces it
                iteration = 0  # step index at which the best score was seen
                for i in range(1000):
                    clf.n_estimators = 10*(i+1)
                    clf.fit(X_train2, y_train2)
                    y_pred = clf.predict_proba(X_test2)
                    score_tmp = log_loss(y_test2, y_pred)
                    if score_tmp < score:
                        score = score_tmp
                        iteration = i
                    # Patience: stop once more than 100 steps (1000 trees)
                    # have passed without an improvement.
                    if i > iteration + 100:
                        break

                # Report the forest size that produced `score`; after the
                # early break clf.n_estimators is the last size tried, which
                # lies beyond the best one by the whole patience window.
                best_n_estimators = 10 * (iteration + 1)
                print(score, best_n_estimators)
                scores.append(round(score, 6))
                iterations.append(best_n_estimators)

            scores = np.array(scores)
            iterations = np.array(iterations)
Ejemplo n.º 4
0
def clf_xgboost(data,
                cl_weight=None,
                random_state=0,
                ext_name="",
                verbose=True):
    """
    XGBoost classifier
    The function applies the classifier twice:
    - First: Fit the classifier to (X_train, y_train), using early stopping
             on the 'mlogloss' of (X_valid, y_valid), and predict on (X_valid).
             The prediction is stored in 'save/valid' folder.
    - Second: Fit the classifier to (X, y) = (X_train + X_valid, y_train + y_valid)
             and predict on (X_test). The prediction is stored in 'save/test'
             folder.

    Features are standardized with a StandardScaler fitted on the data used
    for training in each of the two stages.

    Parameters:
    ----------
    data: list
         [X_train, y_train, X_valid, y_valid, X_test]
    cl_weight: None or Dictionary
         Class weights, e.g. {0:1, 1:1.5, 2:1.6...} => weight for class 0 is 1,
         for class 1 is 1.5, for class 2 is 1.6, and so on.
    random_state: int or numpy RandomState
         Passed as ``seed`` to XGBClassifier and used to draw the random tag
         that is embedded in the names of the stored files. An int (such as
         the default 0) seeds a fresh RandomState for that draw.
    ext_name: string
         Extra string to be used in the name of the stored prediction, e.g. it
         can be used to identify specific parameter values that were used.
    verbose: bool
         Forwarded to XGBClassifier.fit for the early-stopping fit.

    Result:
    ------
    y_valid_pred: numpy ndarray shape=(n_samples_validation, n_classes)
              Predicted class probabilities for the validation set.
    y_test_pred: numpy ndarray shape=(n_samples_test, n_classes)
              Predicted class probabilities for the test set.

    Save:
    ----
    y_valid_pred: it is stored in save/valid folder
    y_test_pred: it is stored in save/test folder
    """

    xgb = XGBClassifier(max_depth=6,
                        learning_rate=0.01,
                        n_estimators=10000,
                        objective='multi:softprob',
                        gamma=1.,
                        min_child_weight=1.,
                        max_delta_step=5.,
                        subsample=0.7,
                        colsample_bytree=0.7,
                        reg_alpha=0.,
                        reg_lambda=1.,
                        seed=random_state)

    # The default random_state is a plain int, which has no randint();
    # wrap anything that is not already RandomState-like.
    if hasattr(random_state, 'randint'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    X_train, y_train, X_valid, y_valid, X_test = data

    ###Working on (X_Train => X_Valid)###
    ss = StandardScaler()
    XX_train = ss.fit_transform(X_train)
    XX_valid = ss.transform(X_valid)

    # One-hot encoding of the validation labels, needed by ndcg_score below.
    lb = LabelBinarizer()
    lb.fit(y_train)
    yb_valid = lb.transform(y_valid)

    # Sample weights are only passed when class weights were requested.
    fit_kwargs = {}
    if cl_weight is not None:
        fit_kwargs['sample_weight'] = compute_sample_weight(
            class_weight=cl_weight, y=y_train)
    xgb.fit(XX_train,
            y_train,
            eval_set=[(XX_valid, y_valid)],
            eval_metric='mlogloss',
            early_stopping_rounds=25,
            verbose=verbose,
            **fit_kwargs)

    best_iter = xgb.best_iteration
    # NOTE(review): xgboost documents best_iteration as 0-based, so
    # ntree_limit=best_iter may use one tree fewer than the best round (and 0
    # would mean "no limit") - confirm against the xgboost version in use.
    y_valid_pred = xgb.predict_proba(XX_valid, ntree_limit=best_iter)

    ndcg_xg = np.mean([
        ndcg_score(tr, pr, k=5)
        for tr, pr in zip(yb_valid.tolist(), y_valid_pred.tolist())
    ])
    print('NDCG: %s' % ndcg_xg)
    logloss_xg = log_loss(y_valid, y_valid_pred)
    print('Log-loss: %s' % logloss_xg)

    # Random tag shared by the validation and test files of this run.
    rnd = rng.randint(1000, 9999)
    valid_path = ('save/valid/v_XGB_%s_%s_%s_%s' %
                  (ext_name, rnd, round(ndcg_xg, 4), round(logloss_xg, 4)))
    # Binary mode is what pickle requires on Python 3; `with` closes the file.
    with open(valid_path, 'wb') as valid_file:
        pickle.dump(y_valid_pred, valid_file)

    ###Working on X => X_test###
    X = np.vstack((X_train, X_valid))
    y = np.hstack((y_train, y_valid))

    # The scaler is re-fitted on the enlarged training set.
    XX = ss.fit_transform(X)
    XX_test = ss.transform(X_test)

    # No early stopping in this stage: train a fixed number of rounds, the
    # early-stopped count plus a margin of 20.
    xgb.n_estimators = best_iter + 20

    final_kwargs = {}
    if cl_weight is not None:
        final_kwargs['sample_weight'] = compute_sample_weight(
            class_weight=cl_weight, y=y)
    xgb.fit(XX, y, **final_kwargs)

    y_test_pred = xgb.predict_proba(XX_test)

    with open('save/test/t_XGB_%s_%s' % (ext_name, rnd), 'wb') as test_file:
        pickle.dump(y_test_pred, test_file)

    return y_valid_pred, y_test_pred