def get_submission(file_prefix, reserve_col):
    """Fit an all-store and a good-store model and build one submission frame.

    Reads ``<file_prefix>cwrrsr_train.csv`` and
    ``<file_prefix>cwrrsr_predict.csv``, log1p-transforms the training
    ``visitors`` column, and fits two models through ``pocket_split_train``:
    one on every training row and one restricted to rows whose
    ``first_appear`` compares below ``"2016-04-01"``. Both models predict the
    same prediction rows; predictions are mapped back with ``expm1`` and
    floored at 1.

    Args:
        file_prefix: Prefix concatenated in front of the two CSV file names.
        reserve_col: Iterable of extra column names appended after the
            built-in feature list.

    Returns:
        DataFrame with columns ``id``, ``v_good``, ``v_all``,
        ``air_store_id``, ``is_good_store``, ``visit_date`` and ``visitors``.
        ``visitors`` is ``v_good`` where the store is in the good-store set
        and ``v_all`` everywhere else.
    """
    train = pd.read_csv(file_prefix + 'cwrrsr_train.csv')
    predict = pd.read_csv(file_prefix + 'cwrrsr_predict.csv')

    # The models are trained on the log1p scale of the target.
    train['visitors'] = np.log1p(train['visitors'])

    # Column order is kept exactly as listed; 'visitors' is the target and is
    # dropped again before predicting.
    features = [
        # store identity and calendar
        'air_store_num', 'visitors', 'air_genre_num', "air_area_num",
        'year', 'month',
        # moving statistics
        "moving_mean_0", "moving_median_0", "moving_max_0", "moving_std_0",
        "moving_mean_1", "moving_median_1", "moving_max_1", "moving_std_1",
        "moving_mean_3", "moving_median_3", "moving_max_3", "moving_std_3",
        "moving_mean_13", "moving_median_13", "moving_max_13",
        "moving_std_13",
        # day-of-week moving statistics (there is no dow_moving_std_0 entry)
        "dow_moving_mean_0", "dow_moving_median_0", "dow_moving_max_0",
        "dow_moving_mean_1", "dow_moving_median_1", "dow_moving_max_1",
        "dow_moving_std_1",
        "dow_moving_mean_3", "dow_moving_median_3", "dow_moving_max_3",
        "dow_moving_std_3",
        "dow_moving_mean_13", "dow_moving_median_13", "dow_moving_max_13",
        "dow_moving_std_13",
        # changes between moving means (annotated "small" by the author)
        "change_mean_0_1", "change_mean_0_3", "change_mean_0_13",
        "change_mean_1_3", "change_mean_1_13", "change_mean_3_13",
        # skewness
        "moving_skew_0", "moving_skew_1", "moving_skew_3", "moving_skew_13",
        "dow_moving_skew_1", "dow_moving_skew_3", "dow_moving_skew_13",
        # day-of-week and holiday flags
        'dow', 'dowh', 'dows', 'holiday_flg',
        'week_hols', 'next_week_hols', 'prev_week_hols', "next_is_hol",
        # regressions and smoothing
        'quarter_regress', 'year_regress', "ewm",
        'quarter_regress_no_dow', 'year_regress_no_dow',
        # weather
        "precipitation", "avg_temperature",
        # disabled in the built-in list:
        # "air_r_sum0_shifted", "hpg_r_sum0_shifted",
        # "air_dow_r_sum0_shifted", "hpg_dow_r_sum0_shifted",
        # "air_r_sum7", "hpg_r_sum7",
    ]
    features += reserve_col

    period_list = [["2016-01-16", "2017-04-15", "2017-04-16", "2017-04-22"]]

    def _fit_first_model(frame):
        # Build the splits for `frame`, train on them, keep the first model.
        frame_splits = pocket_split_train.split_set(
            frame, period_list, features)
        return pocket_split_train.split_train(frame_splits)[0]

    model = _fit_first_model(train)

    # "Good" stores: rows whose first_appear sorts before 2016-04-01.
    # NOTE(review): presumably first_appear holds ISO date strings, which
    # makes this a chronological comparison -- confirm against the CSV.
    good_stores = train[train["first_appear"] < "2016-04-01"]
    good_store_list = good_stores["air_store_id"].unique()
    good_store_model = _fit_first_model(good_stores)
    print("-" * 40)

    predict.drop("visit_date", axis=1, inplace=True)
    x_pred = predict[features].drop('visitors', axis=1)

    def _predict_visitors(booster):
        # Undo the log1p target transform and floor predictions at 1.
        raw = booster.predict(x_pred, num_iteration=booster.best_iteration)
        visitors = np.expm1(raw)
        visitors[visitors < 1] = 1
        return visitors

    y_pred = _predict_visitors(model)
    y_pred_good = _predict_visitors(good_store_model)

    def _store_id_of(sub_id):
        # The first two '_'-separated parts of the id form the store id.
        return '_'.join(sub_id.split('_')[:2])

    def _visit_date_of(sub_id):
        # The third '_'-separated part of the id is the visit date.
        return str(sub_id).split('_')[2]

    submission = pd.DataFrame({
        "id": predict['id'],
        "v_good": y_pred_good,
        "v_all": y_pred,
    })
    submission["air_store_id"] = submission['id'].map(_store_id_of)
    submission["is_good_store"] = submission["air_store_id"].isin(
        good_store_list)
    submission["visit_date"] = submission['id'].map(_visit_date_of)

    # Prefer the good-store model wherever it applies.
    submission["visitors"] = np.where(
        submission["is_good_store"],
        submission["v_good"],
        submission["v_all"],
    )
    print("Done one sub.")
    print("-" * 30)
    return submission
# print('-' * 30) # fit and predict # model = custom_lgb.fit(train_input, test_input) period_list = [ ["2016-01-16", "2017-04-08", "2017-04-16", "2017-04-22"], ["2016-01-16", "2017-04-01", "2017-04-09", "2017-04-15"], ["2016-01-16", "2017-03-26", "2017-04-02", "2017-04-08"], #["2016-01-16", "2016-04-17", "2016-04-18", "2016-04-24"], ["2016-01-16", "2017-03-04", "2017-03-12", "2017-03-19"], ] #hmmm = train #hmmm = hmmm[(hmmm["visit_date"] <= "2016-04-23") | (hmmm["visit_date"] >= "2016-05-15")] #splits = pocket_split_train.split_set(hmmm, period_list, col) splits = pocket_split_train.split_set(train, period_list, col) models = pocket_split_train.split_train(splits) model = models[0] print("-" * 40) # train.drop("visit_date", axis=1, inplace=True) predict.drop("visit_date", axis=1, inplace=True) x_pred = predict[col].drop('visitors', axis=1) y_pred = model.predict(x_pred, num_iteration=model.best_iteration) y_pred = np.expm1(y_pred) y_pred[y_pred < 1] = 1 def save_models(): pocket_full_of_validator.validate( train,
import numpy as np
import pandas as pd

import custom_metrics
import custom_lgb
import pocket_split_train

# Load the cleaned training and prediction tables.
train = pd.read_csv('../output/cleaned_train.csv')
predict = pd.read_csv('../output/cleaned_predict.csv')

# Train on the log1p scale of the target.
train['visitors'] = np.log1p(train['visitors'])

# One entry of four date strings per split, passed unchanged to
# pocket_split_train.split_train.
# NOTE(review): presumably [train_start, train_end, valid_start, valid_end]
# -- confirm against pocket_split_train.
period_split = [
    ['2016-01-10', '2016-04-23', '2016-04-24', '2016-05-31'],
    ['2016-01-10', '2016-09-03', '2016-09-04', '2016-10-08'],
    ['2016-01-10', '2016-11-26', '2017-01-16', '2017-03-05'],
]
models, score = pocket_split_train.split_train(train, period_split)

# Stop here with a success status. SystemExit is raised directly because the
# `exit` builtin is injected by the `site` module for interactive use and is
# not available when Python runs with -S.
raise SystemExit(0)