Exemple #1
0
def lgb_train(lgb_model_output, init_model=None):
    """Train a LightGBM regressor on the embedding features and save it.

    The embedding features and sales targets come from
    ``get_embed_features()``.  When ``DO_EVAL`` is set, the final 4.8% of
    the rows (no shuffling) are held out and monitored as an ``'eval'``
    set next to the ``'train'`` set.  After training, the booster is
    written to ``lgb_model_output`` and the gain-based feature importances
    are written to ``importance.csv``.  When ``DO_PREDICT`` is set, the
    saved model is handed to ``lgb_predict`` together with the test
    features and the resulting ``lgb_pred.csv`` is submitted.

    Args:
        lgb_model_output: Path the trained booster is saved to.
        init_model: Optional model forwarded to ``lgb.train`` as
            ``init_model``.
    """
    embed_train, sales_train, embed_test = get_embed_features()
    sales_train = sales_train.reshape(-1)
    fea_names = get_embed_fea_names()
    assert len(fea_names) == embed_train.shape[1]

    if not DO_EVAL:
        X_train, y_train = embed_train, sales_train
    else:
        X_train, X_val, y_train, y_val = train_test_split(
            embed_train, sales_train, test_size=0.048, shuffle=False)
    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    # (name, dataset) pairs reported during training; 'train' always first.
    train_data = lgb.Dataset(X_train, y_train, feature_name=fea_names)
    monitored = [('train', train_data)]
    if DO_EVAL:
        monitored.append(
            ('eval', lgb.Dataset(X_val, y_val, feature_name=fea_names)))

    booster_params = dict(
        boosting_type='gbdt',
        objective='regression_l1',
        num_leaves=31,
        min_data_in_leaf=100,
        max_depth=-1,
        learning_rate=0.1,
        feature_fraction=0.95,
        bagging_fraction=0.8,
        bagging_freq=5,
        max_bin=255,
        verbose=0,
        tree_learner='data',
    )
    # Early stopping is deliberately not enabled: all 5000 rounds are run.
    booster = lgb.train(
        booster_params,
        train_data,
        num_boost_round=5000,
        valid_sets=[data for _, data in monitored],
        valid_names=[label for label, _ in monitored],
        feval=rmspe_lgb,
        init_model=init_model,
        verbose_eval=True,
    )
    booster.save_model(lgb_model_output)

    gain = booster.feature_importance(importance_type='gain')
    pd.DataFrame({'name': fea_names, 'importance': gain}).to_csv(
        'importance.csv', index=False)

    if not DO_PREDICT:
        return
    submission_file = 'lgb_pred.csv'
    lgb_predict(lgb_model_output, embed_test, submission_file)
    submit_to_kaggle(submission_file)
Exemple #2
0
def xgb_predict(model_file, submission_file):
    """Predict test-set sales with a saved XGBoost model and submit them.

    Loads the booster stored in ``model_file``, builds the test features
    with ``extract_X`` from the test part of ``load_data()``, maps the raw
    predictions back through ``inverse_transform_y`` and writes an
    ``Id``/``Sales`` CSV (ids start at 1) to ``submission_file``, which is
    then passed to ``submit_to_kaggle``.

    Args:
        model_file: Path of the saved XGBoost model.
        submission_file: Path of the CSV file to write and submit.
    """
    booster = xgb.Booster(model_file=model_file)
    _, test_frame = load_data()  # the training part is not needed here
    test_matrix = xgb.DMatrix(extract_X(test_frame))
    sales = inverse_transform_y(booster.predict(test_matrix))

    submission = pd.DataFrame(
        {'Id': range(1, len(sales) + 1), 'Sales': sales})
    submission.to_csv(submission_file, index=False)
    submit_to_kaggle(submission_file)
Exemple #3
0
def xgb_train():
    """Train an XGBoost regressor on the embedding features.

    The embedding features and sales targets come from
    ``get_embed_features()``.  When ``DO_EVAL`` is set, the final 4.8% of
    the rows (no shuffling) are held out and added to the watchlist as
    ``'eval'``.  The trained booster is always saved to ``xgb_1.model``.
    When ``DO_EVAL`` is not set, the test features are predicted, mapped
    back through ``inverse_transform_y``, written to
    ``./output/xgb_pred1.csv`` and submitted.
    """
    train_embed, train_sales, test_embed = get_embed_features()
    train_sales = train_sales.reshape(-1)
    feature_names = get_embed_fea_names()
    assert len(feature_names) == train_embed.shape[1]

    if DO_EVAL:
        X_train, X_val, y_train, y_val = train_test_split(
            train_embed, train_sales, test_size=0.048, shuffle=False)
    else:
        X_train = train_embed
        y_train = train_sales
    X_test = test_embed
    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    dtrain = xgb.DMatrix(X_train, y_train, feature_names=feature_names)
    watchlist = [(dtrain, 'train')]
    if DO_EVAL:
        dval = xgb.DMatrix(X_val, y_val, feature_names=feature_names)
        watchlist.append((dval, 'eval'))

    # NOTE(review): 'reg:linear' is the older spelling of
    # 'reg:squarederror' -- confirm against the installed XGBoost version
    # before changing it.
    param = {
        'objective': 'reg:linear',
        'eval_metric': 'mae',
        'max_depth': 10,
        'gamma': 0,
        'min_child_weight': 1,
        'eta': 0.02,
        'subsample': 0.8,
        'colsample_bytree': 0.95,
        'tree_method': 'hist'
    }
    nrounds = 5000
    # Early stopping watches the last watchlist entry: 'eval' when DO_EVAL
    # is set, otherwise the training set itself.
    bst = xgb.train(param, dtrain, nrounds, watchlist,
                    early_stopping_rounds=100,
                    feval=rmspe_xgb,
                    )
    bst.save_model('xgb_1.model')

    if not DO_EVAL:
        # The booster was trained with explicit feature names, so the test
        # matrix must carry the same names; otherwise predict() rejects it
        # during feature-name validation.
        dtest = xgb.DMatrix(X_test, feature_names=feature_names)
        # NOTE(review): whether predict() uses the best early-stopped
        # iteration or all trained rounds depends on the XGBoost version
        # -- TODO confirm.
        y_pred = bst.predict(dtest)
        y_pred = inverse_transform_y(y_pred)
        df = pd.DataFrame({'Id': range(1, len(y_pred) + 1), 'Sales': y_pred})
        df.to_csv('./output/xgb_pred1.csv', index=False)
        submit_to_kaggle('./output/xgb_pred1.csv')
Exemple #4
0
def main():
    """Train ``N_NETWORKS`` embedding networks, then evaluate and submit.

    Each network is a fresh ``MODEL`` fitted on the datasets returned by
    ``get_dataset``.  If ``INIT_EPOCH`` is positive, weights are first
    loaded from ``saved_model_file``.  With ``SAVE_MODEL`` set, every
    network is saved as ``model_<n>.hdf5`` under ``model_dir``.

    With ``DO_EVAL`` set, each model's error and the mean are printed and
    a loss plot is written per model.  With ``SUBMIT`` set, a submission
    from all models is written and submitted (to ``./output/<argv[1]>``
    when a command-line argument is given, else to ``submission_file``);
    when more than one network was trained, one additional submission per
    individual model is written and submitted as well.
    """
    train_set, valid_set, X_test = get_dataset(
        VALIDATION_WEEKS, FILT_STORES_FOR_TRAIN, FILT_STORES_FOR_VALID)

    models = []
    for number in range(1, N_NETWORKS + 1):
        print('\nNN_Embedding Model', number)
        print('-'*50)
        net = MODEL(print_model_summary=PRINT_SUMMARY,
                    save_checkpt=SAVE_CHECKPT)

        # Resume from previously saved weights when continuing a run.
        if INIT_EPOCH > 0:
            net.model.load_weights(saved_model_file)
        net.fit(train_set, valid_set,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS + INIT_EPOCH,
                init_epoch=INIT_EPOCH)

        if SAVE_MODEL:
            print('saving model', number)
            net.model.save(model_dir + 'model_{}.hdf5'.format(number))

        models.append(net)

    if DO_EVAL:
        # Evaluate every model first, then report.
        errors = [net.eval() for net in models]
        for number, err in enumerate(errors, start=1):
            print('Model {}: RMSPE = {}'.format(number, err))
        print('Mean:', np.mean(errors))
        for number, net in enumerate(models, start=1):
            net.plot_loss('./output/loss_model_{}.png'.format(number))

    if not SUBMIT:
        return

    # Submission from all trained models together.
    filename = ('./output/' + sys.argv[1]) if len(sys.argv) > 1 else submission_file
    write_submission(models, X_test, filename)
    submit_to_kaggle(filename)

    # Additionally submit each model on its own when there are several.
    if N_NETWORKS > 1:
        stem = submission_file[:-4]
        for number, net in enumerate(models, start=1):
            single_file = '{}_model_{}.csv'.format(stem, number)
            write_submission([net], X_test, single_file)
            submit_to_kaggle(single_file)
Exemple #5
0
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

from ross_util import write_submission, submit_to_kaggle
from ross_main import load_models_from_hdf
from ross_data import load_data

# Script: build one submission from previously saved models and submit it.
# Only the test part of the loaded data is used as model input below.
train, test = load_data()
X_test = test
# Alternative model selections kept for reference (checkpoint weights):
#model_files = ['./output/checkpt/weights.12.hdf5']
#model_files = ['./output/checkpt/weights.{:02d}.hdf5'.format(i) for i in range(20, 25)]
# Active selection: ./models/model_1.hdf5 through ./models/model_10.hdf5.
model_files = ['./models/model_{}.hdf5'.format(i + 1) for i in range(10)]
submission_file = './output/submission.10models.csv'

#model_files = ['./models/model_{}.hdf5'.format(i+1)  for i in range(10)]

# Load the listed model files, write a single submission file from all of
# them, and submit it with an empty message.
models = load_models_from_hdf(model_files)
write_submission(models, X_test, submission_file)
submit_to_kaggle(submission_file, message='')
# for i, model in enumerate(models):
#     filename = './output/pred_model_{}.csv'.format(i+1)
#     write_submission([model], X_test, filename)
#     submit_to_kaggle(filename, message='')

# from keras.utils import plot_model
# plot_model(models[0].model, to_file='score294_model.png', show_shapes=True)