Example #1
import numpy as np

# LABELS and score_submission come from the surrounding project (the FNC-1 scorer).


def eval_metric(yhat, dtrain):
    """Custom XGBoost eval metric: the FNC score relative to a perfect score."""
    y = dtrain.get_label()
    yhat = np.argmax(yhat, axis=1)  # highest-probability class index per row
    predicted = [LABELS[int(a)] for a in yhat]
    actual = [LABELS[int(a)] for a in y]
    s, _ = score_submission(actual, predicted)
    s_perf, _ = score_submission(actual, actual)  # perfect-score upper bound
    score = float(s) / s_perf
    return 'score', score
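
This metric plugs into xgboost's native training loop as a custom feval. Below is a minimal sketch of that wiring, assuming the FNC-1 baseline's utils.score module for LABELS and score_submission; the stand-in data and parameter values are assumptions, not the original project's code.

import numpy as np
import xgboost as xgb
from utils.score import LABELS, score_submission  # FNC-1 baseline scorer

# Stand-in data so the sketch runs end to end
X = np.random.rand(400, 20)
y = np.random.randint(0, len(LABELS), 400)
dtrain = xgb.DMatrix(X[:300], label=y[:300])
dvalid = xgb.DMatrix(X[300:], label=y[300:])

params = {'objective': 'multi:softprob', 'num_class': len(LABELS)}
bst = xgb.train(params, dtrain,
                num_boost_round=500,
                evals=[(dtrain, 'train'), (dvalid, 'eval')],
                feval=eval_metric,         # the metric defined above
                maximize=True,             # a higher relative score is better
                early_stopping_rounds=80)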
Example #2
        # Stack every fold except the held-out one into one training set
        X_train = np.vstack([Xs[i] for i in ids])
        y_train = np.hstack([ys[i] for i in ids])

        X_test = Xs[fold]
        y_test = ys[fold]

        clf = GradientBoostingClassifier(n_estimators=200,
                                         random_state=14128,
                                         verbose=True)
        clf.fit(X_train, y_train)

        predicted = [LABELS[int(a)] for a in clf.predict(X_test)]
        actual = [LABELS[int(a)] for a in y_test]

        # Relative FNC score: achieved score over the perfect (all-correct) score
        fold_score, _ = score_submission(actual, predicted)
        max_fold_score, _ = score_submission(actual, actual)

        score = fold_score / max_fold_score

        print("Score for fold " + str(fold) + " was - " + str(score))
        if score > best_score:
            best_score = score
            best_fold = clf

    # Run the best fold's classifier on the holdout set and report the final score
    predicted = [LABELS[int(a)] for a in best_fold.predict(X_holdout)]
    actual = [LABELS[int(a)] for a in y_holdout]

    print("Scores on the dev set")
    report_score(actual, predicted)
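
For context, here is a minimal sketch of the scaffolding this loop body assumes (per-fold feature/label blocks, a hold-out split, and the running best score); the variable setup is an assumption, not the original code.

import numpy as np

n_folds = 10
X = np.random.rand(1100, 44)               # stand-in features
y = np.random.randint(0, 4, 1100)          # stand-in label indices
X_holdout, y_holdout = X[1000:], y[1000:]  # held-out evaluation split
Xs = np.array_split(X[:1000], n_folds)     # per-fold feature blocks
ys = np.array_split(y[:1000], n_folds)     # per-fold label vectors

best_score, best_fold = 0, None
for fold in range(n_folds):
    ids = [i for i in range(n_folds) if i != fold]  # train on all other folds
    # ... loop body as in the example above ...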
Example #3
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold

# LABELS, score_submission, report_score, load_features, load_targets,
# params_xgb and num_round come from the surrounding project (the FNC-1
# scorer and feature-loading utilities).


def cv():

    K_FOLD = 10
    data = load_features().astype(float)
    targets = load_targets()
    # Assumes load_targets() returns a table with a 'target' column
    y = targets['target']

    # K-fold cross-validation and score tracking
    scores = []   # raw FNC scores per fold
    wscores = []  # weighted (relative) scores per fold
    pscores = []  # perfect scores per fold
    best_iters = [0] * K_FOLD
    # kf = GroupKFold(n_splits=K_FOLD)
    kf = StratifiedKFold(n_splits=K_FOLD)
    print('Training Model...')
    print('Training Model...')
    for fold, (train_idx, test_idx) in enumerate(kf.split(data, y)):
        print('\n[K = ' + str(fold + 1) + ']')

        # Train Model
        dtrain = xgb.DMatrix(data[train_idx], label=y[train_idx])
        dtest = xgb.DMatrix(data[test_idx])
        watchlist = [(dtrain, 'train'), (dtest, 'eval')]
        bst = xgb.train(
            params_xgb,
            dtrain,
            num_round,
            watchlist,
            verbose_eval=10,
            # feval=eval_metric,
            # maximize=True,
            early_stopping_rounds=80)

        # Probabilities from all trees vs. from the best early-stopping iteration
        pred_prob_y = bst.predict(dtest).reshape(y[test_idx].shape[0], 2)
        pred_y = bst.predict(dtest, ntree_limit=bst.best_ntree_limit).reshape(
            y[test_idx].shape[0], 2)
        print('predicted probabilities: ')
        print(pred_y)
        pred_y = np.argmax(pred_y, axis=1)  # highest-probability class index
        print('predicted label indices: ')
        print(pred_y)

        print('best iteration:', bst.best_ntree_limit)
        best_iters[fold] = bst.best_ntree_limit

        print('pred_y.shape:', pred_y.shape)
        print('y_valid.shape:', y[test_idx].shape)

        predicted = [LABELS[int(a)] for a in pred_y]
        actual = [LABELS[int(a)] for a in y[test_idx]]
        s, _ = score_submission(actual, predicted)    # achieved FNC score
        s_perf, _ = score_submission(actual, actual)  # perfect-score upper bound
        r_score = report_score(actual, predicted)
        wscore = float(s) / s_perf                    # weighted (relative) score
        print('report_score', r_score)
        print('fold %s, score = %f, perfect_score %f, weighted percentage %f' %
              (fold, s, s_perf, wscore))
        scores.append(s)
        pscores.append(s_perf)
        wscores.append(wscore)

    # Cross-validation summary
    print('scores:', scores)
    print('mean score:', np.mean(scores))
    print('perfect scores:', pscores)
    print('mean perfect score:', np.mean(pscores))
    print('weighted scores:', wscores)
    print('mean weighted score:', np.mean(wscores))
    print('best iters:', best_iters)
    print('mean best_iter:', np.mean(best_iters))
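
The function relies on module-level params_xgb and num_round. The reshape(..., 2) calls imply a two-class multi:softprob setup, so a plausible configuration might look like the following; the concrete values are assumptions, not the original ones.

# Hypothetical configuration (assumed values, not from the original)
params_xgb = {
    'objective': 'multi:softprob',  # per-class probabilities, matching the argmax above
    'num_class': 2,
    'eta': 0.1,
    'max_depth': 6,
    'seed': 14128,
}
num_round = 1000  # upper bound on boosting rounds; early stopping trims it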