def eval_fn(params):
    """Cross-validated objective: returns the mean early-stopped log-loss for `params`."""
    model = XGBClassifier(n_estimators=n_estimators_max, learning_rate=learning_rate,
                          seed=seed)
    score = 0
    n_estimators = 0
    for tr, va in skf:
        X_tr, y_tr = X_train[tr], y_train[tr]
        X_va, y_va = X_train[va], y_train[va]
        model.set_params(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss',
                  early_stopping_rounds=50, verbose=False)
        score += model.best_score
        n_estimators += model.best_iteration
    # Average the early-stopped score and tree count over the folds
    score /= n_folds
    n_estimators /= n_folds
    n_estimators_lst.append(n_estimators)
    result_str = "train:%.4f ntree:%5d " % (score, n_estimators)
    if X_valid is not None:
        # Refit on the full training set with the averaged tree count and
        # score it on the external validation set
        model.n_estimators = int(round(n_estimators))
        model.fit(X_train, y_train)
        pr = model.predict_proba(X_valid)[:, 1]
        sc_valid = log_loss(y_valid, pr)
        score_valid.append(sc_valid)
        result_str += "valid:%.4f" % sc_valid
    if verbose:
        print(result_str)
    return score
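# eval_fn closes over several module-level names (skf, X_train, y_train, n_folds,
# n_estimators_max, learning_rate, seed, n_estimators_lst, score_valid, X_valid,
# y_valid, verbose). The sketch below is a hypothetical reconstruction of that setup
# plus a hyperopt search; the optimizer, the toy data and the search space are
# assumptions for illustration, not necessarily what the original used.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, tpe, hp

X_train, y_train = make_classification(n_samples=2000, n_features=20, random_state=0)
X_valid = y_valid = None                     # or a held-out set for the extra check
n_folds, seed, verbose = 5, 0, True
n_estimators_max, learning_rate = 10000, 0.05
skf = list(StratifiedKFold(n_splits=n_folds, shuffle=True,
                           random_state=seed).split(X_train, y_train))
n_estimators_lst, score_valid = [], []

# Float-valued parameters only, so they can be passed straight to set_params
space = {'subsample': hp.uniform('subsample', 0.5, 1.0),
         'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
         'min_child_weight': hp.uniform('min_child_weight', 1.0, 10.0)}
best_params = fmin(fn=eval_fn, space=space, algo=tpe.suggest, max_evals=50)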
# Gradient boosting
xgb = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=10000,
                    objective='multi:softprob', seed=random_state)
# Compute the best number of iterations on an internal validation split
XV_train, XV_valid, yv_train, yv_valid = train_test_split(
    XV, y_valid, test_size=0.15, random_state=random_state)
xgb.fit(XV_train, yv_train, eval_set=[(XV_valid, yv_valid)],
        eval_metric='mlogloss', early_stopping_rounds=15, verbose=False)
# Refit on the full set using the early-stopped number of trees
xgb.n_estimators = xgb.best_iteration
xgb.fit(XV, y_valid)
y_gb = xgb.predict_proba(XT)
ll_gb.append(log_loss(y_test, y_gb))

# Saving the log-loss scores
ll_sc = np.array(ll_sc).reshape(-1, len(clfs)).T
ll_eA = np.array(ll_eA)
ll_eB = np.array(ll_eB)
ll_e3 = np.array(ll_e3)
ll_lr = np.array(ll_lr)
ll_gb = np.array(ll_gb)

# ## Plotting the results
# Notice that scikit-learn's LogisticRegression and XGBoost produce better results for
# problems with few classes, but as the number of classes increases the proposed
# ensembling methods outperform both. Again, the question here is whether it is
# possible to fine-tune
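# A minimal plotting sketch for the arrays collected above (an assumption about the
# intended figure): one log-loss value per method per experiment, plotted against a
# hypothetical 'n_classes_list' holding the class counts iterated over; the legend
# labels are guessed from the variable names.
import matplotlib.pyplot as plt

for ll, label in [(ll_eA, 'ensemble A'), (ll_eB, 'ensemble B'), (ll_e3, 'ensemble 3'),
                  (ll_lr, 'LogisticRegression'), (ll_gb, 'XGBoost')]:
    plt.plot(n_classes_list, ll, marker='o', label=label)
plt.xlabel('number of classes')
plt.ylabel('log-loss')
plt.legend()
plt.show()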
for train_index, test_index in folds:
    # The classifier has to be re-created inside the loop because warm_start
    # keeps accumulating trees across calls to fit()
    clf = RandomForestClassifier(n_estimators=10, warm_start=True, n_jobs=-1)
    X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
    y_train2, y_test2 = y_train[train_index], y_train[test_index]
    X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)
    X_train2 = csr_matrix(X_train2.values)
    X_test2 = csr_matrix(X_test2.values)
    score = 100
    iteration = 0
    for i in range(1000):
        # Add 10 more trees per step and track the best validation log-loss
        clf.n_estimators = 10 * (i + 1)
        clf.fit(X_train2, y_train2)
        y_pred = clf.predict_proba(X_test2)
        score_tmp = log_loss(y_test2, y_pred)
        if score_tmp < score:
            score = score_tmp
            iteration = i
        # Stop if the score has not improved for 100 steps
        if i > iteration + 100:
            break
    print(score, clf.n_estimators)
    scores.append(round(score, 6))
    iterations.append(clf.n_estimators)

scores = np.array(scores)
iterations = np.array(iterations)
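# Hypothetical reconstruction of the setup the loop above assumes (it would precede
# the loop in the actual script and is not part of the original excerpt): X_train is
# a pandas DataFrame with a 0..n-1 index, y_train a numpy array, and
# feature_engineering_extra is the project's own helper defined elsewhere.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

scores, iterations = [], []
folds = KFold(n_splits=5, shuffle=True, random_state=0).split(X_train)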
import pickle

import numpy as np
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.utils.class_weight import compute_sample_weight
from xgboost import XGBClassifier


def clf_xgboost(data, cl_weight=None, random_state=0, ext_name="", verbose=True):
    """XGBoost classifier.

    The function applies the classifier twice:
    - First: fit the classifier to (X_train, y_train) and predict on X_valid.
      The prediction is stored in the 'save/valid' folder.
    - Second: fit the classifier to (X, y) = (X_train + X_valid, y_train + y_valid)
      and predict on X_test. The prediction is stored in the 'save/test' folder.

    Parameters
    ----------
    data: list
        [X_train, y_train, X_valid, y_valid, X_test]
    cl_weight: None or dict
        Class weights, e.g. {0: 1, 1: 1.5, 2: 1.6, ...} => weight for class 0 is 1,
        for class 1 is 1.5, for class 2 is 1.6, and so on.
    random_state: numpy RandomState
        RandomState used for reproducibility.
    ext_name: string
        Extra string to be used in the name of the stored prediction, e.g. it can be
        used to identify specific parameter values that were used.
    verbose: bool
        Whether to print the evaluation results during early stopping.

    Returns
    -------
    y_valid_pred: numpy ndarray, shape=(n_samples_validation, n_classes)
        Predicted class probabilities for the validation set.
    y_test_pred: numpy ndarray, shape=(n_samples_test, n_classes)
        Predicted class probabilities for the test set.

    Save
    ----
    y_valid_pred: stored in the save/valid folder.
    y_test_pred: stored in the save/test folder.
    """
    xgb = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=10000,
                        objective='multi:softprob', gamma=1., min_child_weight=1.,
                        max_delta_step=5., subsample=0.7, colsample_bytree=0.7,
                        reg_alpha=0., reg_lambda=1., seed=random_state)
    X_train, y_train, X_valid, y_valid, X_test = data

    ### Working on (X_train => X_valid) ###
    ss = StandardScaler()
    XX_train = ss.fit_transform(X_train)
    XX_valid = ss.transform(X_valid)
    lb = LabelBinarizer()
    lb.fit(y_train)
    yb_valid = lb.transform(y_valid)
    if cl_weight is None:
        xgb.fit(XX_train, y_train, eval_set=[(XX_valid, y_valid)],
                eval_metric='mlogloss', early_stopping_rounds=25, verbose=verbose)
    else:
        # Computing sample weights from class weights
        sw_train = compute_sample_weight(class_weight=cl_weight, y=y_train)
        xgb.fit(XX_train, y_train, sample_weight=sw_train,
                eval_set=[(XX_valid, y_valid)], eval_metric='mlogloss',
                early_stopping_rounds=25, verbose=verbose)
    best_iter = xgb.best_iteration
    y_valid_pred = xgb.predict_proba(XX_valid, ntree_limit=best_iter)
    # ndcg_score is the project's NDCG@k helper, defined elsewhere
    ndcg_xg = np.mean([ndcg_score(tr, pr, k=5)
                       for tr, pr in zip(yb_valid.tolist(), y_valid_pred.tolist())])
    print('NDCG: %s' % ndcg_xg)
    logloss_xg = log_loss(y_valid, y_valid_pred)
    print('Log-loss: %s' % logloss_xg)
    rnd = random_state.randint(1000, 9999)
    pickle.dump(y_valid_pred,
                open('save/valid/v_XGB_%s_%s_%s_%s' % (ext_name, rnd, round(ndcg_xg, 4),
                                                        round(logloss_xg, 4)), 'wb'))

    ### Working on X => X_test ###
    X = np.vstack((X_train, X_valid))
    y = np.hstack((y_train, y_valid))
    XX = ss.fit_transform(X)
    XX_test = ss.transform(X_test)
    # Refit on the combined data with a slightly larger tree budget
    xgb.n_estimators = best_iter + 20
    if cl_weight is None:
        xgb.fit(XX, y)
    else:
        sw = compute_sample_weight(class_weight=cl_weight, y=y)
        xgb.fit(XX, y, sample_weight=sw)
    y_test_pred = xgb.predict_proba(XX_test)
    pickle.dump(y_test_pred, open('save/test/t_XGB_%s_%s' % (ext_name, rnd), 'wb'))
    return y_valid_pred, y_test_pred
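# Hypothetical usage sketch: the data splits, the class-weight dictionary and the
# ext_name tag are placeholders, and the save/valid and save/test folders are
# assumed to already exist.
rng = np.random.RandomState(42)
data = [X_train, y_train, X_valid, y_valid, X_test]
y_valid_pred, y_test_pred = clf_xgboost(data, cl_weight={0: 1.0, 1: 1.5, 2: 1.6},
                                        random_state=rng, ext_name="d6_lr001",
                                        verbose=False)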