def main(debug=False, use_pkl=False):
    """Assemble the feature table and run the k-fold XGBoost pipeline.

    Args:
        debug: When true, every loader is called with ``10000`` instead of
            ``None``; the flag is also forwarded to ``kfold_xgboost``.
        use_pkl: When true, the feature table is read from
            ``../output/df.pkl`` instead of being rebuilt; when false it is
            rebuilt from the individual sources and written to that path.
    """
    cache_path = '../output/df.pkl'
    row_limit = 10000 if debug else None

    if not use_pkl:
        # (timer label, loader, merge key(s)) in the order they are joined.
        merge_plan = (
            ("nightley", nightley, ['datetime', 'park']),
            ("hotlink", hotlink, 'datetime'),
            ("colopl", colopl, ['park', 'year', 'month']),
            ("weather", weather, ['datetime', 'park']),
            ("nied_oyama", nied_oyama, ['datetime', 'park']),
            ("agoop", agoop, ['park', 'year', 'month']),
            ("jorudan", jorudan, ['datetime', 'park']),
        )

        with timer("train & test"):
            df = train_test(row_limit)

        # Each source is outer-joined onto the running table under its own
        # timer, so the loader call itself is included in the timed section.
        for label, loader, keys in merge_plan:
            with timer(label):
                df = pd.merge(df, loader(row_limit), on=keys, how='outer')

        with timer("save pkl"):
            save2pkl(cache_path, df)
    else:
        df = loadpkl(cache_path)

    with timer("Run XGBoost with kfold"):
        print("df shape:", df.shape)
        feat_importance = kfold_xgboost(
            df,
            num_folds=NUM_FOLDS,
            stratified=True,
            debug=debug,
        )
        display_importances(
            feat_importance,
            '../output/xgb_importances.png',
            '../output/feature_importance_xgb.csv',
        )
# When True, the feature table is read from the pickle file instead of being
# rebuilt from the individual sources.
USE_PKL = True

if USE_PKL:
    DF = loadpkl('../output/df.pkl')
else:
    DF = train_test(NUM_ROWS)
    DF = pd.merge(DF, nightley(NUM_ROWS), on=['datetime', 'park'], how='outer')
    DF = pd.merge(DF, hotlink(NUM_ROWS), on='datetime', how='outer')
    # NOTE(review): main() merges colopl on ['park', 'year', 'month'], whereas
    # 'park' is omitted here -- confirm which key set is intended.
    DF = pd.merge(DF, colopl(NUM_ROWS), on=['year', 'month'], how='outer')
    DF = pd.merge(DF, weather(NUM_ROWS), on=['datetime', 'park'], how='outer')
    DF = pd.merge(DF, nied_oyama(NUM_ROWS), on=['datetime', 'park'], how='outer')
    # These two calls previously used lowercase `num_rows`, which does not
    # exist at module scope and raised NameError on this branch.
    DF = pd.merge(DF, agoop(NUM_ROWS), on=['park', 'year', 'month'], how='outer')
    DF = pd.merge(DF, jorudan(NUM_ROWS), on=['datetime', 'park'], how='outer')

# split test & train
# Only rows with a non-null 'visitors' value are kept for training.
TRAIN_DF = DF[DF['visitors'].notnull()]
FEATS = [f for f in TRAIN_DF.columns if f not in FEATS_EXCLUDED]


def objective(trial):
    # The target passed to LightGBM is log1p(visitors).
    lgbm_train = lightgbm.Dataset(TRAIN_DF[FEATS],
                                  np.log1p(TRAIN_DF['visitors']),
                                  free_raw_data=False)

    # num_round = trial.suggest_int('num_round', 1, 500)