Esempio n. 1
0
from gini_normalized import normalized_gini
import numpy as np
from pylab import *

hold = pd.read_csv('../data/hold_new.csv')
preds = pd.read_csv('preds_on_hold/xgbt.csv')


def binar(x, a):
    """Snap ``x`` to 53 or 62 according to the cut point ``a``.

    Returns 53 when ``53 < x < a``; otherwise returns 62 when
    ``a <= x < 62``; in every other case ``x`` is returned unchanged.
    """
    below_cut = 53 < x < a
    if below_cut:
        return 53
    # Only reached when the first test failed, same as the original elif.
    above_cut = a <= x < 62
    return 62 if above_cut else x


x_list = range(54, 62)

y_list = []

for a in x_list:
    y_list += [
        normalized_gini(hold['Hazard'],
                        map(lambda x: binar(x, a), preds['Hazard']))
    ]

print x_list
print y_list
plot(x_list, y_list)
savefig('cuts.png')
Esempio n. 2
0
                                    labels = y_train[::-1]

                                    xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
                                    xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

                                    watchlist = [(xgtrain, "train"), (xgval, "val")]

                                    model = xgb.train(
                                        params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120
                                    )

                                    preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

                                    preds = 0.5 * preds1 + 0.5 * preds2

                                    tp = normalized_gini(y_test, preds)
                                    score += [tp]
                                    print tp

                                sc = math.ceil(100000 * np.mean(score)) / 100000
                                sc_std = math.ceil(100000 * np.std(score)) / 100000
                                result += [
                                    (
                                        sc,
                                        sc_std,
                                        min_child_weight,
                                        eta,
                                        colsample_bytree,
                                        max_depth,
                                        subsample,
                                        gamma,
Esempio n. 3
0
 For example, the range 53-62 does not occur in the training data, so it looks like a good idea
 to move predictions in this region to 53 or to 62, depending on the threshold.
'''

import pandas as pd
from gini_normalized import normalized_gini
import numpy as np
from pylab import *

hold = pd.read_csv('../data/hold_new.csv')
preds = pd.read_csv('preds_on_hold/xgbt.csv')

def binar(x, a):
  """Move x to 53 or 62 depending on which side of the cut a it falls.

  53 < x < a gives 53; failing that, a <= x < 62 gives 62; any other x is
  handed back untouched.
  """
  if 53 < x < a:
    return 53
  return 62 if a <= x < 62 else x

x_list = range(54, 62)

y_list = []

for a in x_list:
  y_list += [normalized_gini(hold['Hazard'], map(lambda x: binar(x, a), preds['Hazard']))]

print x_list
print y_list
plot(x_list, y_list)
savefig('cuts.png')
Esempio n. 4
0
                                labels = y_train[::-1]

                                xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
                                xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

                                watchlist = [(xgtrain, "train"), (xgval, "val")]

                                model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)

                                preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

                                # preds = model.predict(xgval, ntree_limit=model.best_iteration)

                                preds = 0.5 * preds1 + 0.5 * preds2

                                tp = normalized_gini(y_test, preds)
                                # tp_up = normalized_gini(y_test, map(lambda x: min(69, x), preds))
                                # tp_down = normalized_gini(y_test, map(lambda x: max(1, x), preds))
                                tp_both = normalized_gini(y_test, map(lambda x: min(69, max(1, x)), preds))
                                # tp_both_round = normalized_gini(y_test, map(lambda x: round(min(69, max(1, x))), preds))
                                # tp_both_int = normalized_gini(y_test, map(lambda x: int(min(69, max(1, x))), preds))

                                # tp = normalized_gini(y_train[:offset], preds)
                                score += [tp]
                                # score_truncated_up += [tp_up]
                                # score_truncated_down += [tp_down]
                                score_truncated_both += [tp_both]
                                # score_truncated_both_int += [tp_both_int]
                                # score_truncated_both_round += [tp_both_round]
                                print tp
Esempio n. 5
0
                labels = y_train[::-1]

                xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
                xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

                watchlist = [(xgtrain, 'train'), (xgval, 'val')]

                model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)

                preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

                # preds = model.predict(xgval, ntree_limit=model.best_iteration)

                preds = 0.5 * preds1 + 0.5 * preds2

                tp = normalized_gini(y_test, preds)
                # tp_up = normalized_gini(y_test, map(lambda x: min(69, x), preds))
                # tp_down = normalized_gini(y_test, map(lambda x: max(1, x), preds))
                tp_both = normalized_gini(y_test, map(lambda x: min(69, max(1, x)), preds))
                # tp_both_round = normalized_gini(y_test, map(lambda x: round(min(69, max(1, x))), preds))
                # tp_both_int = normalized_gini(y_test, map(lambda x: int(min(69, max(1, x))), preds))

                # tp = normalized_gini(y_train[:offset], preds)
                score += [tp]
                # score_truncated_up += [tp_up]
                # score_truncated_down += [tp_down]
                score_truncated_both += [tp_both]
                # score_truncated_both_int += [tp_both_int]
                # score_truncated_both_round += [tp_both_round]
                print tp
Esempio n. 6
0
                            clf = RandomForestRegressor(
                                n_estimators=n_estimators,
                                min_samples_split=min_samples_split,
                                max_features=max_features,
                                max_depth=max_depth,
                                min_samples_leaf=min_samples_leaf,
                                n_jobs=-1,
                                random_state=random_state,
                            )

                            clf.fit(a_train, b_train)

                            preds = clf.predict(a_test)

                            score += [normalized_gini(b_test, preds)]

                        result += [
                            (
                                np.mean(score),
                                np.std(score),
                                n_estimators,
                                min_samples_split,
                                min_samples_leaf,
                                max_depth,
                                max_features,
                            )
                        ]

    result.sort()
    print result
Esempio n. 7
0
              b_train = y.values[train_index]
              b_test = y.values[test_index]

              clf = RandomForestRegressor(n_estimators=n_estimators,
                                          min_samples_split=min_samples_split,
                                          max_features=max_features,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          n_jobs=-1,
                                          random_state=random_state)

              clf.fit(a_train, b_train)

              preds = clf.predict(a_test)

              score += [normalized_gini(b_test, preds)]

            result += [(np.mean(score), np.std(score), n_estimators, min_samples_split, min_samples_leaf, max_depth, max_features)]

  result.sort()
  print result

elif ind == 3:
  clf = RandomForestRegressor(n_estimators=100,
                              min_samples_split=2,
                              max_features=0.4,
                              max_depth=7,
                              min_samples_leaf=1,
                              n_jobs=-1,
                              random_state=random_state)
  clf.fit(X, y)
Esempio n. 8
0
  xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
  xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

  watchlist = [(xgtrain, 'train'), (xgval, 'val')]

  model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)

  preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

  preds_xgbt = 0.5 * preds1 + 0.5 * np.exp(preds2)


  alpha = 0
  prediction = preds_xgbt
  tp = normalized_gini(y_test, prediction)
  score_00 += [normalized_gini(y_test, prediction)]
  alpha = 0.1
  prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
  score_01 += [normalized_gini(y_test, prediction)]

  alpha = 0.2
  prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
  score_02 += [normalized_gini(y_test, prediction)]

  alpha = 0.3
  prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
  score_03 += [normalized_gini(y_test, prediction)]

  alpha = 0.4
  prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
Esempio n. 9
0
              b_train = y.values[train_index]
              b_test = y.values[test_index]

              clf = RandomForestRegressor(n_estimators=n_estimators,
                                          min_samples_split=min_samples_split,
                                          max_features=max_features,
                                          max_depth=max_depth,
                                          min_samples_leaf=min_samples_leaf,
                                          n_jobs=-1,
                                          random_state=random_state)

              clf.fit(a_train, b_train)

              preds = clf.predict(a_test)

              # Apply exp(x) - 1 to both the targets and the predictions
              # before scoring. The previous call closed the second map()
              # too early: map() received no iterable and preds was passed
              # to normalized_gini as a third argument.
              score += [normalized_gini(map(lambda x: math.exp(x) - 1, b_test),
                                        map(lambda x: math.exp(x) - 1, preds))]

            result += [(np.mean(score), np.std(score), n_estimators, min_samples_split, min_samples_leaf, max_depth, max_features)]

  result.sort()
  print result


elif ind == 2:
  clf = RandomForestRegressor(n_estimators=100,
                              min_samples_split=2,
                              max_features=0.4,
                              max_depth=7,
                              min_samples_leaf=1,
                              n_jobs=-1,
                              random_state=random_state)
Esempio n. 10
0
                  X_train = X_train[::-1, :]
                  labels = y_train[::-1]

                  xgtrain = xgb.DMatrix(X_train[offset:, :], label=labels[offset:])
                  xgval = xgb.DMatrix(X_train[:offset, :], label=labels[:offset])

                  watchlist = [(xgtrain, 'train'), (xgval, 'val')]

                  model = xgb.train(params_new, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)

                  preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

                  preds = 0.5 * preds1 + 0.5 * preds2

                  tp = normalized_gini(y_test, preds)
                  score += [tp]
                  print tp

                sc = math.ceil(100000 * np.mean(score)) / 100000
                sc_std = math.ceil(100000 * np.std(score)) / 100000
                result += [(sc,
                            sc_std,
                            min_child_weight,
                            eta,
                            colsample_bytree,
                            max_depth,
                            subsample,
                            gamma,
                            n_iter,
                            params['objective'],
Esempio n. 11
0
  a_train = X_train.values[train_index]
  a_test = X_train.values[test_index]
  b_train = y_train.values[train_index]
  b_test = y_train.values[test_index]

  X = scaler.fit_transform(a_train).astype(np.float32)
  X_reshaped = X.reshape(-1, 1, 10, 10)

  test = scaler.transform(a_test).astype(np.float32)
  test_reshaped = test.reshape(-1, 1, 10, 10)

  y = b_train[:]
  y.shape = (y.shape[0], 1)

  y_mean = y.mean()
  y_std = y.std()
  target = (y - y_mean) / y_std

  net1.fit(X_reshaped, target.astype(np.float32))

  def helper(x):
    """Invert the target standardization: scale x by y_std and add y_mean.

    y_std and y_mean are read from the enclosing scope, where they are
    computed from the current fold's training targets.
    """
    return (x * y_std) + y_mean

  result = net1.predict(test_reshaped)

  result = np.reshape(result, len(b_test))
  result = map(helper, result)
  score += [normalized_gini(b_test, result)]


print np.mean(score), np.std(score)
Esempio n. 12
0
# Cross-validated evaluation of net1: one normalized_gini score per split in rs.
score = []

for train_index, test_index in rs:
  # Feature rows (a_*) and target values (b_*) for this split.
  a_train = X_train.values[train_index]
  a_test = X_train.values[test_index]
  b_train = y_train.values[train_index]
  b_test = y_train.values[test_index]

  # The scaler is fit on the training rows only and then reused on the test rows.
  X = scaler.fit_transform(a_train).astype(np.float32)
  test = scaler.transform(a_test).astype(np.float32)

  # Reshape the training targets into a single column.
  # NOTE(review): b_train[:] is presumably a NumPy view rather than a copy — confirm
  # that changing its shape in place is intended.
  y = b_train[:]
  y.shape = (y.shape[0], 1)

  # Standardize the targets with this split's own mean and standard deviation.
  y_mean = y.mean()
  y_std = y.std()
  target = (y - y_mean) / y_std

  net1.fit(X, target.astype(np.float32))

  def helper(x):
    """Invert the target standardization: scale x by y_std and add y_mean."""
    return (x * y_std) + y_mean

  result = net1.predict(test)

  # Flatten to one prediction per test row, then map back to the original scale.
  result = np.reshape(result, len(b_test))
  result = map(helper, result)
  score += [normalized_gini(b_test, result)]


# Mean and standard deviation of the per-split scores.
print np.mean(score), np.std(score)
Esempio n. 13
0
                                watchlist = [(xgtrain, 'train'),
                                             (xgval, 'val')]

                                model = xgb.train(params_new,
                                                  xgtrain,
                                                  num_rounds,
                                                  watchlist,
                                                  early_stopping_rounds=120)

                                preds = model.predict(
                                    xgtest, ntree_limit=model.best_iteration)
                                # preds = model.predict(xgval, ntree_limit=model.best_iteration)

                                tp = normalized_gini(
                                    map(lambda x: math.exp(x) - 1, y_test),
                                    map(lambda x: math.exp(x) - 1, preds))

                                # tp = normalized_gini(y_train[:offset], preds)
                                score += [tp]
                                print tp

                            result += [
                                (np.mean(score), np.std(score),
                                 min_child_weight, eta, colsample_bytree,
                                 max_depth, subsample, gamma, n_iter)
                            ]

    result.sort()
    print result
Esempio n. 14
0
    # 'max_depth': 9
}

score = []
for train_index, test_index in rs:
    Xc_train = X_cat.values[train_index]
    Xc_test = X_cat.values[test_index]
    y_train = y.values[train_index]
    y_test = y.values[test_index]
    clf_cat = Ridge(normalize=True, alpha=0.1)
    clf_cat.fit(Xc_train, y_train)
    prediction_cat_test = clf_cat.predict(Xc_test)
    prediction_cat_train = clf_cat.predict(Xc_train)
    Xn_train = X_num.values[train_index]
    Xn_test = X_num.values[test_index]
    Xn_train = pd.DataFrame(Xn_train)
    Xn_test = pd.DataFrame(Xn_test)
    Xn_train['cat'] = prediction_cat_train
    Xn_test['cat'] = prediction_cat_test
    xgtrain = xgb.DMatrix(Xn_train, label=y_train)
    xgval = xgb.DMatrix(Xn_test, label=y_test)
    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    model = xgb.train(params,
                      xgtrain,
                      num_rounds,
                      watchlist,
                      early_stopping_rounds=200)
    preds = model.predict(xgval, ntree_limit=model.best_iteration)
    score += [normalized_gini(y_test, preds)]

print np.mean(score), np.std(score)
Esempio n. 15
0
                            b_test = y.values[test_index]

                            clf = RandomForestRegressor(
                                n_estimators=n_estimators,
                                min_samples_split=min_samples_split,
                                max_features=max_features,
                                max_depth=max_depth,
                                min_samples_leaf=min_samples_leaf,
                                n_jobs=-1,
                                random_state=random_state)

                            clf.fit(a_train, b_train)

                            preds = clf.predict(a_test)

                            score += [normalized_gini(b_test, preds)]

                        result += [(np.mean(score), np.std(score),
                                    n_estimators, min_samples_split,
                                    min_samples_leaf, max_depth, max_features)]

    result.sort()
    print result

elif ind == 3:
    clf = RandomForestRegressor(n_estimators=100,
                                min_samples_split=2,
                                max_features=0.4,
                                max_depth=7,
                                min_samples_leaf=1,
                                n_jobs=-1,
Esempio n. 16
0
prediction_cat_3 = clf_cat.predict(X3_cat)

X2_num = X2[features_num]
X2_num['cat'] = prediction_cat_2

X3_num = X3[features_num]
X3_num['cat'] = prediction_cat_3


xgtrain = xgb.DMatrix(X2_num, label=y2)
xgval = xgb.DMatrix(X3_num, label=y3)
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=200)
preds = model.predict(xgval, ntree_limit=model.best_iteration)

print normalized_gini(y3, preds)
# rs = cross_validation.StratifiedKFold(y, n_folds=n_iter, shuffle=True, random_state=random_state)
#
# num_rounds = 10000
# params = {
#   'objective': 'reg:linear',
#   # 'eta': 0.005,
#   # 'min_child_weight': 6,
#   # 'subsample': 0.7,
#   # 'colsabsample_bytree': 0.7,
#   # 'scal_pos_weight': 1,
#   'silent': 1,
#   # 'max_depth': 9
# }
#
# score = []
Esempio n. 17
0
    watchlist = [(xgtrain, 'train'), (xgval, 'val')]

    model = xgb.train(params_new,
                      xgtrain,
                      num_rounds,
                      watchlist,
                      early_stopping_rounds=120)

    preds2 = model.predict(xgtest, ntree_limit=model.best_iteration)

    preds_xgbt = 0.5 * preds1 + 0.5 * np.exp(preds2)

    alpha = 0
    prediction = preds_xgbt
    tp = normalized_gini(y_test, prediction)
    score_00 += [normalized_gini(y_test, prediction)]
    alpha = 0.1
    prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
    score_01 += [normalized_gini(y_test, prediction)]

    alpha = 0.2
    prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
    score_02 += [normalized_gini(y_test, prediction)]

    alpha = 0.3
    prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
    score_03 += [normalized_gini(y_test, prediction)]

    alpha = 0.4
    prediction = alpha * preds_RF + (1 - alpha) * preds_xgbt
Esempio n. 18
0
  # 'subsample': 0.7,
  # 'colsabsample_bytree': 0.7,
  # 'scal_pos_weight': 1,
  'silent': 1,
  # 'max_depth': 9
}

score = []
for train_index, test_index in rs:
  Xc_train = X_cat.values[train_index]
  Xc_test = X_cat.values[test_index]
  y_train = y.values[train_index]
  y_test = y.values[test_index]
  clf_cat = Ridge(normalize=True, alpha=0.1)
  clf_cat.fit(Xc_train, y_train)
  prediction_cat_test = clf_cat.predict(Xc_test)
  prediction_cat_train = clf_cat.predict(Xc_train)
  Xn_train = X_num.values[train_index]
  Xn_test = X_num.values[test_index]
  Xn_train = pd.DataFrame(Xn_train)
  Xn_test = pd.DataFrame(Xn_test)
  Xn_train['cat'] = prediction_cat_train
  Xn_test['cat'] = prediction_cat_test
  xgtrain = xgb.DMatrix(Xn_train, label=y_train)
  xgval = xgb.DMatrix(Xn_test, label=y_test)
  watchlist = [(xgtrain, 'train'), (xgval, 'val')]
  model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=200)
  preds = model.predict(xgval, ntree_limit=model.best_iteration)
  score += [normalized_gini(y_test, preds)]

print np.mean(score), np.std(score)
Esempio n. 19
0
X2_num['cat'] = prediction_cat_2

X3_num = X3[features_num]
X3_num['cat'] = prediction_cat_3

xgtrain = xgb.DMatrix(X2_num, label=y2)
xgval = xgb.DMatrix(X3_num, label=y3)
watchlist = [(xgtrain, 'train'), (xgval, 'val')]
model = xgb.train(params,
                  xgtrain,
                  num_rounds,
                  watchlist,
                  early_stopping_rounds=200)
preds = model.predict(xgval, ntree_limit=model.best_iteration)

print normalized_gini(y3, preds)
# rs = cross_validation.StratifiedKFold(y, n_folds=n_iter, shuffle=True, random_state=random_state)
#
# num_rounds = 10000
# params = {
#   'objective': 'reg:linear',
#   # 'eta': 0.005,
#   # 'min_child_weight': 6,
#   # 'subsample': 0.7,
#   # 'colsabsample_bytree': 0.7,
#   # 'scal_pos_weight': 1,
#   'silent': 1,
#   # 'max_depth': 9
# }
#
# score = []