Esempio n. 1
0
# Feature columns fed to every model. 'visitors' is the training target; it
# is part of this list for the split helper and is dropped again before
# predicting.
# NOTE(review): unlike windows 1/3/13 there is no "dow_moving_std_0" or
# "dow_moving_skew_0" entry -- confirm this omission is intentional.
_BASE_COLUMNS = (
    'air_store_num',
    'visitors',
    'air_genre_num',
    "air_area_num",
    'year',
    'month',
    "moving_mean_0",
    "moving_median_0",
    "moving_max_0",
    "moving_std_0",
    "moving_mean_1",
    "moving_median_1",
    "moving_max_1",
    "moving_std_1",
    "moving_mean_3",
    "moving_median_3",
    "moving_max_3",
    "moving_std_3",
    "moving_mean_13",
    "moving_median_13",
    "moving_max_13",
    "moving_std_13",
    "dow_moving_mean_0",
    "dow_moving_median_0",
    "dow_moving_max_0",
    "dow_moving_mean_1",
    "dow_moving_median_1",
    "dow_moving_max_1",
    "dow_moving_std_1",
    "dow_moving_mean_3",
    "dow_moving_median_3",
    "dow_moving_max_3",
    "dow_moving_std_3",
    "dow_moving_mean_13",
    "dow_moving_median_13",
    "dow_moving_max_13",
    "dow_moving_std_13",
    "change_mean_0_1",
    "change_mean_0_3",
    "change_mean_0_13",  # small
    "change_mean_1_3",
    "change_mean_1_13",
    "change_mean_3_13",  # small
    "moving_skew_0",
    "moving_skew_1",
    "moving_skew_3",
    "moving_skew_13",
    "dow_moving_skew_1",
    "dow_moving_skew_3",
    "dow_moving_skew_13",
    'dow',
    'dowh',
    'dows',
    'holiday_flg',
    'week_hols',
    'next_week_hols',
    'prev_week_hols',
    "next_is_hol",
    'quarter_regress',
    'year_regress',
    "ewm",
    'quarter_regress_no_dow',
    'year_regress_no_dow',
    "precipitation",
    "avg_temperature",
    # Reservation features are currently disabled here:
    # "air_r_sum0_shifted", "hpg_r_sum0_shifted",
    # "air_dow_r_sum0_shifted", "hpg_dow_r_sum0_shifted",
    # "air_r_sum7", "hpg_r_sum7",
)


def _fit_first_model(frame, period_list, col):
    """Split *frame* by *period_list* and return the first trained model."""
    splits = pocket_split_train.split_set(frame, period_list, col)
    return pocket_split_train.split_train(splits)[0]


def _predict_visitors(model, x_pred):
    """Predict with *model* and convert the output back to visitor counts.

    The models are fitted on ``log1p(visitors)``, so the raw prediction is
    passed through ``expm1``; anything below 1 is then raised to 1.
    """
    y_pred = model.predict(x_pred, num_iteration=model.best_iteration)
    y_pred = np.expm1(y_pred)
    y_pred[y_pred < 1] = 1
    return y_pred


def get_submission(file_prefix, reserve_col):
    """Train an all-store and a "good store" model and build a submission.

    Two models are fitted on the same feature columns: one on every training
    row and one restricted to rows whose ``first_appear`` is before
    ``"2016-04-01"``. Both predict every row of the prediction file; the
    final ``visitors`` value uses the restricted model for stores that belong
    to that subset and the all-store model otherwise.

    Args:
        file_prefix: String prepended to ``'cwrrsr_train.csv'`` and
            ``'cwrrsr_predict.csv'`` to locate the two input files.
        reserve_col: Iterable of additional column names appended to the
            built-in feature list.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id``, ``v_good``,
        ``v_all``, ``air_store_id``, ``is_good_store``, ``visit_date`` and
        ``visitors``.
    """
    train = pd.read_csv(file_prefix + 'cwrrsr_train.csv')
    predict = pd.read_csv(file_prefix + 'cwrrsr_predict.csv')

    # Fit on log1p(visitors); _predict_visitors undoes this with expm1.
    train['visitors'] = np.log1p(train['visitors'])

    col = list(_BASE_COLUMNS)
    col.extend(reserve_col)

    # A single period entry, so only one model comes back from each fit.
    # Presumably [train_start, train_end, valid_start, valid_end] -- confirm
    # against pocket_split_train.split_set.
    period_list = [["2016-01-16", "2017-04-15", "2017-04-16", "2017-04-22"]]
    model = _fit_first_model(train, period_list, col)

    # "Good" stores: rows whose first_appear compares below 2016-04-01.
    # NOTE(review): this is a plain string comparison, which assumes
    # first_appear holds ISO-formatted date strings -- confirm.
    good_stores = train[train["first_appear"] < "2016-04-01"]
    good_store_list = good_stores["air_store_id"].unique()
    good_store_model = _fit_first_model(good_stores, period_list, col)
    print("-" * 40)

    predict.drop("visit_date", axis=1, inplace=True)

    # The target column is part of `col`, so remove it from the model input.
    x_pred = predict[col].drop('visitors', axis=1)
    y_pred = _predict_visitors(model, x_pred)
    y_pred_good = _predict_visitors(good_store_model, x_pred)

    submission = pd.DataFrame({
        "id": predict['id'],
        "v_good": y_pred_good,
        "v_all": y_pred,
    })

    # Ids are split on '_' once: the first two pieces form the store id and
    # the third piece is the visit date.
    id_parts = submission['id'].map(lambda x: str(x).split('_'))
    submission["air_store_id"] = id_parts.map(lambda parts: '_'.join(parts[:2]))
    submission["is_good_store"] = submission["air_store_id"].isin(
        good_store_list)
    submission["visit_date"] = id_parts.map(lambda parts: parts[2])
    submission["visitors"] = np.where(submission["is_good_store"],
                                      submission["v_good"],
                                      submission["v_all"])
    print("Done one sub.")
    print("-" * 30)

    return submission
Esempio n. 2
0
# print('-' * 30)

# fit and predict
# model = custom_lgb.fit(train_input, test_input)
# Period windows handed to the split helper, one entry per split. Only the
# model trained on the first entry is used for the prediction below.
period_list = [
    ["2016-01-16", "2017-04-08", "2017-04-16", "2017-04-22"],
    ["2016-01-16", "2017-04-01", "2017-04-09", "2017-04-15"],
    ["2016-01-16", "2017-03-26", "2017-04-02", "2017-04-08"],
    # disabled: ["2016-01-16", "2016-04-17", "2016-04-18", "2016-04-24"],
    ["2016-01-16", "2017-03-04", "2017-03-12", "2017-03-19"],
]

# Disabled experiment: fit on `train` with the rows between 2016-04-23 and
# 2016-05-15 (exclusive) filtered out.
#hmmm = train
#hmmm = hmmm[(hmmm["visit_date"] <= "2016-04-23") | (hmmm["visit_date"] >= "2016-05-15")]
#splits = pocket_split_train.split_set(hmmm, period_list, col)
splits = pocket_split_train.split_set(train, period_list, col)
models = pocket_split_train.split_train(splits)
model = models[0]
print("-" * 40)

# The prediction frame is modified in place.
predict.drop(columns="visit_date", inplace=True)

# `col` contains the target column, so strip it from the model input.
x_pred = predict[col].drop(columns='visitors')

# Undo the log1p transform of the target and floor the forecasts at 1.
raw_pred = model.predict(x_pred, num_iteration=model.best_iteration)
y_pred = np.maximum(np.expm1(raw_pred), 1)


def save_models():
    pocket_full_of_validator.validate(
        train,
Esempio n. 3
0
import numpy as np
import pandas as pd

import custom_metrics
import custom_lgb
import pocket_split_train

# Load the cleaned training and prediction tables.
train = pd.read_csv('../output/cleaned_train.csv')
predict = pd.read_csv('../output/cleaned_predict.csv')

# Train on log1p(visitors) rather than on the raw counts.
train['visitors'] = np.log1p(train['visitors'])

# One entry per split, each holding four date strings.
# Presumably [train_start, train_end, valid_start, valid_end] -- confirm
# against pocket_split_train.split_train.
period_split = [
    ['2016-01-10', '2016-04-23', '2016-04-24', '2016-05-31'],
    ['2016-01-10', '2016-09-03', '2016-09-04', '2016-10-08'],
    ['2016-01-10', '2016-11-26', '2017-01-16', '2017-03-05'],
]
models, score = pocket_split_train.split_train(train, period_split)

# print(train_input.head())
# print(test_input.head())
# print(x_pred.head())
# print('-' * 30)

# Stop the script here with a success status. SystemExit is raised directly
# because the bare `exit` helper is injected by the `site` module for
# interactive use and is not guaranteed to exist (e.g. `python -S`).
raise SystemExit(0)