def test_running(self):
        """Check that a default-constructed Thunderbolt matches an uncached one.

        ``target`` is built with ``use_cache=False``. A throwaway instance and
        then ``output`` are built with default arguments. Every entry of
        ``target.tasks`` except ``'last_modified'`` must equal the entry with
        the same key in ``output.tasks``.
        """
        target = Thunderbolt(self._get_test_case_path(), use_cache=False)
        # Throwaway instance created before ``output``; presumably it writes
        # the cache that ``output`` then reads -- TODO confirm.
        _ = Thunderbolt(self._get_test_case_path())
        output = Thunderbolt(self._get_test_case_path())

        try:
            for k, v in target.tasks.items():
                if k == 'last_modified':  # cache file
                    continue
                self.assertEqual(v, output.tasks[k])
        finally:
            # Clear the local cache even when an assertion above fails, so a
            # failing run does not leave cached state behind.
            output.client.local_cache.clear()
Esempio n. 2
0
# %%
from thunderbolt import Thunderbolt
import pandas as pd
import sys
import os
import matplotlib.pyplot as plt

# Add the directory two levels above the current working directory to the
# import search path.
sys.path.append(f"{os.getcwd()}/../..")

from kaggle_m5_forecasting.data.load_data import RawData
from kaggle_m5_forecasting.utils import decode_ids

# Open the task store under ./../../resource and fetch the two task outputs
# used below; the "MakeData" output is passed through decode_ids first.
tb = Thunderbolt("./../../resource")
data: pd.DataFrame = decode_ids(tb.get_data("MakeData"))
raw: RawData = tb.get_data("LoadRawData")

# %%
# Mean sell_price and sales for each id, with "id" turned back into a column.
df = (
    data.groupby("id")[["sell_price", "sales"]]
    .mean()
    .reset_index()
)

# %%

# Mean sell_price for each item_id.
sell_price = data.groupby("item_id")["sell_price"].mean()

# Column labels "d_1" .. "d_1941", and the first 1941 calendar dates parsed
# as datetimes.
d_cols = ["d_" + str(day) for day in range(1, 1942)]
date_idx = pd.to_datetime(raw.calendar["date"][:1941])
for i, s in enumerate(
        raw.sales_train_validation[raw.sales_train_validation.cat_id ==
                                   "FOODS"][["item_id"] + d_cols].groupby(
                                       "item_id").sum().reset_index().values):
    if i >= 150 and i <= 180:
import luigi
import numpy as np
import pandas as pd

import kaggle_disaster_tweets_gokart

# Register ./conf/param.ini as a luigi configuration file.
luigi.configuration.LuigiConfigParser.add_config_path("./conf/param.ini")
# Seed NumPy's global random generator with a fixed value.
np.random.seed(42)

# %%
# gokart.run(["tweet.MakeEnsembleModel", "--rerun"])

# %%
from thunderbolt import Thunderbolt

# Open the Thunderbolt task store under ./resource.
tb = Thunderbolt("./resource")
# Bare expression: the return value is not stored, so it is only visible as
# cell output when run interactively.
tb.get_task_df()

# %%
import numpy as np

# %%
import re
import regex
import swifter

# Fetch the stored output of the "MakeTrainSelectedFeatureData" task
# (annotated as a DataFrame).
df: pd.DataFrame = tb.get_data("MakeTrainSelectedFeatureData")

# %%

Esempio n. 4
0
# %%
import numpy as np

import sys
import os

# Add the directory two levels above the current working directory to the
# import search path.
sys.path.append(f"{os.getcwd()}/../..")
from kaggle_m5_forecasting.cv_result import CVResults
from kaggle_m5_forecasting.data.load_data import RawData
from kaggle_m5_forecasting.cv_dashboard import create_dashboard
from thunderbolt import Thunderbolt

# Timestamp string handed to CVResults.from_timestamp and reused in the
# output directory path further down.
TIMESTAMP = "2020-06-18_23:14:13"

# Presumably loads the cross-validation results saved under that timestamp
# -- confirm against CVResults.from_timestamp.
cv = CVResults().from_timestamp(TIMESTAMP)
# Open the task store and fetch the "LoadRawData" output (annotated RawData).
tb = Thunderbolt("./../../resource")
raw: RawData = tb.get_data("LoadRawData")

# %%
# Index into cv.results selecting which result to evaluate.
CV_NUM = 0
# Only referenced by the commented-out create_dashboard call below.
dir_name = f"../../output/cv/{TIMESTAMP}/{CV_NUM}"
evaluator = cv.results[CV_NUM].get_evaluator(raw)
# create_dashboard(
#     evaluator, raw, dir_name,
# )
# Print the mean of the evaluator's all_scores.
print(np.mean(evaluator.all_scores))
# %%
import pandas as pd
from thunderbolt import Thunderbolt
import sys
import os

# Add the directory two levels above the current working directory to the
# import search path.
sys.path.append(f"{os.getcwd()}/../..")

from kaggle_m5_forecasting.data.load_data import RawData
from kaggle_m5_forecasting.data.fe_weather import read_weather_data
from kaggle_m5_forecasting.utils import decode_ids

tb = Thunderbolt("./../../resource")
# Place the "MakeData" and "FEWeather" outputs side by side (column-wise).
data: pd.DataFrame = pd.concat(
    [tb.get_data(task_name) for task_name in ("MakeData", "FEWeather")],
    axis=1,
)

weather = read_weather_data("./../../external_data")
# Parse date_time once; it supplies both the "YYYY-MM-DD" string column and
# the new index.
weather_timestamps = pd.to_datetime(weather["date_time"])
weather["date"] = weather_timestamps.dt.strftime("%Y-%m-%d")
weather.index = weather_timestamps
# Clear the index name that was just set from the assigned series.
weather.index.name = None
weather.drop("date_time", axis=1, inplace=True)

raw: RawData = tb.get_data("LoadRawData")

# %%
# Strip the "d_" prefix from each calendar day label and store it as an int.
raw.calendar["d"] = raw.calendar["d"].apply(
    lambda label: int(label.replace("d_", ""))
)
cat_id = 0
# Mean sales per (d, state_id) for the selected cat_id, left-joined first to
# the calendar date and then to the weather table on (date, state_id).
df: pd.DataFrame = (
    data[data["cat_id"] == cat_id]
    .groupby(["d", "state_id"])["sales"]
    .mean()
    .reset_index()
    .merge(raw.calendar[["d", "date"]], on="d", how="left")
    .merge(weather, on=["date", "state_id"], how="left")
)
import matplotlib.pyplot as plt
import sklearn.metrics
import catch22
import seaborn as sns
from thunderbolt import Thunderbolt
import scipy
import sklearn.preprocessing
import sklearn.cluster

import sys
import os

# Add the directory two levels above the current working directory to the
# import search path.
sys.path.append(f"{os.getcwd()}/../..")
from kaggle_m5_forecasting.utils import timer

tb = Thunderbolt("./../../resource")
data: pd.DataFrame = tb.get_data("MakeData")
# Keep only the rows whose "d" value is below 1942.
data = data.loc[data["d"] < 1942]

# %%

with timer("calc grouped aggregates"):
    grouped = data.groupby(["id"])["sales"].agg({
        "mean":
        lambda x: x.dropna().values.mean(),
        "percentile25":
        lambda x: x.dropna().sort_values()[:int(len(x) * 0.25)].mean(),
        "percentile50":
        lambda x: x.dropna().sort_values()[int(len(x) * 0.25):int(
            len(x) * 0.5)].mean(),
        "percentile75":
Esempio n. 7
0
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from thunderbolt import Thunderbolt
import sys
import os

# Add the directory two levels above the current working directory to the
# import search path.
sys.path.append(f"{os.getcwd()}/../..")
from kaggle_m5_forecasting.utils import timer

# Open the task store under ./../../resource and fetch the "MakeData" output
# (annotated as a DataFrame).
tb = Thunderbolt("./../../resource")
data: pd.DataFrame = tb.get_data("MakeData")

# %%

with timer("calc rolling_store_id_cat_id_mean"):
    lag = 28
    w_size = 30
    # Within each (store_id, cat_id) group: shift sales by `lag` rows, then
    # average over a rolling window of `w_size` rows.
    data["fe_rolling_store_id_cat_id_mean"] = (
        data.groupby(["store_id", "cat_id"])["sales"]
        .transform(lambda sales: sales.shift(lag).rolling(w_size).mean())
    )

# %%
# Re-open the task store and fetch the "FERollingGroupMean" output.
tb = Thunderbolt("./../../resource")
# Bare expression: the result is not stored, so it is only visible as cell
# output when run interactively.
tb.get_data("FERollingGroupMean")