def lgb_train(lgb_model_output, init_model=None):
    """Train a LightGBM regressor on the embedding features and save it.

    Args:
        lgb_model_output: Path the trained booster is saved to. The same path
            is handed to ``lgb_predict`` when ``DO_PREDICT`` is set.
        init_model: Forwarded unchanged to ``lgb.train`` as ``init_model``
            (defaults to ``None``).

    Side effects:
        Saves the booster to ``lgb_model_output``, writes gain-based feature
        importances to ``importance.csv`` and, when ``DO_PREDICT`` is set,
        writes ``lgb_pred.csv`` through ``lgb_predict`` and submits it with
        ``submit_to_kaggle``.
    """
    train_embed, train_sales, test_embed = get_embed_features()
    train_sales = train_sales.reshape(-1)  # flatten targets to 1-D
    feature_names = get_embed_fea_names()
    assert len(feature_names) == train_embed.shape[1]

    if DO_EVAL:
        # shuffle=False keeps row order, so the hold-out set is the trailing
        # 4.8% of the rows rather than a random sample.
        X_train, X_val, y_train, y_val = train_test_split(
            train_embed, train_sales, test_size=0.048, shuffle=False)
    else:
        X_train = train_embed
        y_train = train_sales

    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    # Named ``train_set`` (not ``lgb_train``) so the local does not shadow
    # this function's own name.
    train_set = lgb.Dataset(X_train, y_train, feature_name=feature_names)
    # The training set is also listed as a validation set so the metric is
    # reported for it on every round.
    valid_sets = [train_set]
    valid_names = ['train']
    if DO_EVAL:
        eval_set = lgb.Dataset(X_val, y_val, feature_name=feature_names)
        valid_sets.append(eval_set)
        valid_names.append('eval')

    params = {
        'boosting_type': 'gbdt',
        'objective': 'regression_l1',
        'num_leaves': 31,
        'min_data_in_leaf': 100,
        'max_depth': -1,
        'learning_rate': 0.1,
        'feature_fraction': 0.95,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'max_bin': 255,
        'verbose': 0,
        'tree_learner': 'data',
        #'nthread': 4
    }

    bst = lgb.train(params,
                    train_set,
                    num_boost_round=5000,
                    valid_sets=valid_sets,
                    valid_names=valid_names,
                    feval=rmspe_lgb,
                    #early_stopping_rounds=100,
                    init_model=init_model,
                    verbose_eval=True,
                    )
    bst.save_model(lgb_model_output)

    importance = bst.feature_importance(importance_type='gain')
    importance_df = pd.DataFrame({'name': feature_names, 'importance': importance})
    importance_df.to_csv('importance.csv', index=False)

    if DO_PREDICT:
        submission_file = 'lgb_pred.csv'
        # Pass test_embed directly so prediction does not depend on which
        # DO_EVAL branch ran above.
        lgb_predict(lgb_model_output, test_embed, submission_file)
        submit_to_kaggle(submission_file)
def xgb_predict(model_file, submission_file):
    """Predict test-set sales with a saved XGBoost booster and submit them.

    Args:
        model_file: Path of the saved booster, passed to ``xgb.Booster``.
        submission_file: CSV path the ``Id``/``Sales`` table is written to;
            the same path is then passed to ``submit_to_kaggle``.
    """
    booster = xgb.Booster(model_file=model_file)

    # Only the test part of the loaded data is needed here.
    _, test_data = load_data()
    test_matrix = xgb.DMatrix(extract_X(test_data))

    # Raw predictions are mapped back through inverse_transform_y before
    # being written out.
    sales = inverse_transform_y(booster.predict(test_matrix))

    # Ids are 1-based row numbers in prediction order.
    row_ids = range(1, len(sales) + 1)
    submission = pd.DataFrame({'Id': row_ids, 'Sales': sales})
    submission.to_csv(submission_file, index=False)
    submit_to_kaggle(submission_file)
def xgb_train():
    """Train an XGBoost regressor on the embedding features.

    The booster is always saved to ``xgb_1.model``. When ``DO_EVAL`` is
    false the model is trained on all rows, test predictions are written to
    ``./output/xgb_pred1.csv`` and that file is submitted with
    ``submit_to_kaggle``. When ``DO_EVAL`` is true the trailing 4.8% of the
    rows are held out as an evaluation set and no prediction is made.
    """
    train_embed, train_sales, test_embed = get_embed_features()
    train_sales = train_sales.reshape(-1)  # flatten targets to 1-D
    feature_names = get_embed_fea_names()
    assert len(feature_names) == train_embed.shape[1]

    if DO_EVAL:
        # shuffle=False keeps row order, so the hold-out set is the tail of
        # the data rather than a random sample.
        X_train, X_val, y_train, y_val = train_test_split(
            train_embed, train_sales, test_size=0.048, shuffle=False)
    else:
        X_train = train_embed
        y_train = train_sales

    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    dtrain = xgb.DMatrix(X_train, y_train, feature_names=feature_names)
    watchlist = [(dtrain, 'train')]
    if DO_EVAL:
        dval = xgb.DMatrix(X_val, y_val, feature_names=feature_names)
        watchlist.append((dval, 'eval'))

    param = {
        'objective': 'reg:linear',
        'eval_metric': 'mae',
        'max_depth': 10,
        'gamma': 0,
        'min_child_weight': 1,
        'eta': 0.02,
        'subsample': 0.8,
        'colsample_bytree': 0.95,
        'tree_method': 'hist'
    }
    nrounds = 5000

    # Per the XGBoost docs, early stopping monitors the last watchlist entry:
    # 'eval' when DO_EVAL is set, otherwise the training set itself.
    bst = xgb.train(param, dtrain, nrounds, watchlist,
                    early_stopping_rounds=100,
                    feval=rmspe_xgb,
                    #model_file='xgb_1.model'
                    )
    #bst = xgb.Booster(model_file='xgb_1.model')
    bst.save_model('xgb_1.model')

    if not DO_EVAL:
        # The booster was trained with explicit feature names, so the test
        # matrix must carry the same names; otherwise predict()'s feature
        # validation rejects it.
        dtest = xgb.DMatrix(test_embed, feature_names=feature_names)
        y_pred = bst.predict(dtest)
        y_pred = inverse_transform_y(y_pred)
        df = pd.DataFrame({'Id': range(1, len(y_pred) + 1), 'Sales': y_pred})
        df.to_csv('./output/xgb_pred1.csv', index=False)
        submit_to_kaggle('./output/xgb_pred1.csv')
def main():
    """Train ``N_NETWORKS`` embedding networks, then evaluate and submit.

    Behaviour is driven by module-level switches: ``INIT_EPOCH`` (resume
    from ``saved_model_file``), ``SAVE_MODEL``, ``DO_EVAL`` and ``SUBMIT``.
    An optional first command-line argument names the ensemble submission
    file inside ``./output/``.
    """
    train_set, valid_set, X_test = get_dataset(
        VALIDATION_WEEKS, FILT_STORES_FOR_TRAIN, FILT_STORES_FOR_VALID)

    # --- training -----------------------------------------------------
    models = []
    for net_no in range(1, N_NETWORKS + 1):
        print('\nNN_Embedding Model', net_no)
        print('-'*50)
        net = MODEL(print_model_summary=PRINT_SUMMARY, save_checkpt=SAVE_CHECKPT)
        if INIT_EPOCH > 0:
            # Resume: start from previously saved weights.
            net.model.load_weights(saved_model_file)
        net.fit(train_set, valid_set,
                batch_size=BATCH_SIZE,
                epochs=EPOCHS + INIT_EPOCH,
                init_epoch=INIT_EPOCH)
        if SAVE_MODEL:
            print('saving model', net_no)
            net.model.save(model_dir + 'model_'+str(net_no)+'.hdf5')
        models.append(net)

    # models.extend(load_models_from_hdf(['./output/checkpt/weights.{:02d}.hdf5'.format(i)
    #                                     for i in range(EPOCHS-2, EPOCHS)]))

    # --- evaluation ---------------------------------------------------
    if DO_EVAL:
        # Evaluate every model first, then report, so the printed summary
        # is not interleaved with evaluation output.
        errors = []
        for net in models:
            errors.append(net.eval())
        for net_no, err in enumerate(errors, start=1):
            print('Model {}: RMSPE = {}'.format(net_no, err))
        print('Mean:', np.mean(errors))

        # NOTE(review): the source layout was whitespace-collapsed; loss
        # plotting is assumed to belong to the evaluation branch - confirm.
        for net_no, net in enumerate(models, start=1):
            net.plot_loss('./output/loss_model_{}.png'.format(net_no))

    # --- submission ---------------------------------------------------
    if not SUBMIT:
        return

    # A command-line argument overrides the default ensemble file name.
    filename = './output/'+sys.argv[1] if len(sys.argv) > 1 else submission_file
    write_submission(models, X_test, filename)
    submit_to_kaggle(filename)

    if N_NETWORKS > 1:
        # Additionally submit each network on its own; the per-model names
        # are derived from submission_file with its last 4 characters cut.
        base = submission_file[:-4]
        for net_no, net in enumerate(models, start=1):
            single_file = '{}_model_{}.csv'.format(base, net_no)
            write_submission([net], X_test, single_file)
            submit_to_kaggle(single_file)
# Prediction script: load saved networks from HDF5 files, write one
# submission from all of them and upload it.

# Optional CUDA device settings (disabled):
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
# os.environ["CUDA_VISIBLE_DEVICES"] = ""

from ross_util import write_submission, submit_to_kaggle
from ross_main import load_models_from_hdf
from ross_data import load_data

# Only the test part is used below; it is passed to write_submission as-is.
train, test = load_data()
X_test = test

submission_file = './output/submission.10models.csv'

# Model files model_1 .. model_10 under ./models/.
# Alternative sources that were used before (checkpoint files):
#model_files = ['./output/checkpt/weights.12.hdf5']
#model_files = ['./output/checkpt/weights.{:02d}.hdf5'.format(i) for i in range(20, 25)]
model_files = ['./models/model_{}.hdf5'.format(model_no) for model_no in range(1, 11)]

models = load_models_from_hdf(model_files)
write_submission(models, X_test, submission_file)
submit_to_kaggle(submission_file, message='')

# Per-model submissions (disabled):
# for i, model in enumerate(models):
#     filename = './output/pred_model_{}.csv'.format(i+1)
#     write_submission([model], X_test, filename)
#     submit_to_kaggle(filename, message='')

# Model diagram (disabled):
# from keras.utils import plot_model
# plot_model(models[0].model, to_file='score294_model.png', show_shapes=True)