def test_running(self):
    """Check that a default-constructed client reports the same tasks as an uncached one.

    Three clients are built on the same test-case path: a reference created
    with ``use_cache=False``, a throwaway instance created with default
    arguments, and a second default instance whose ``tasks`` entries are
    compared against the reference.
    """
    target = Thunderbolt(self._get_test_case_path(), use_cache=False)
    # NOTE(review): presumably this first default-constructed client writes the
    # local cache that the next one then reads -- confirm against Thunderbolt.
    _ = Thunderbolt(self._get_test_case_path())
    output = Thunderbolt(self._get_test_case_path())

    try:
        for k, v in target.tasks.items():
            if k == 'last_modified':  # cache file
                continue
            self.assertEqual(v, output.tasks[k])
    finally:
        # Clear the local cache even when an assertion above fails, so a
        # failing run does not leave cached state behind.
        output.client.local_cache.clear()
# %% from thunderbolt import Thunderbolt import pandas as pd import sys import os import matplotlib.pyplot as plt sys.path.append(os.getcwd() + "/../..") from kaggle_m5_forecasting.data.load_data import RawData from kaggle_m5_forecasting.utils import decode_ids tb = Thunderbolt("./../../resource") data: pd.DataFrame = tb.get_data("MakeData") data = decode_ids(data) raw: RawData = tb.get_data("LoadRawData") # %% df = data.groupby("id")[["sell_price", "sales"]].mean().reset_index() # %% sell_price = data.groupby("item_id")["sell_price"].mean() d_cols = [f"d_{d}" for d in range(1, 1942)] date_idx = pd.to_datetime(raw.calendar["date"][:1941]) for i, s in enumerate( raw.sales_train_validation[raw.sales_train_validation.cat_id == "FOODS"][["item_id"] + d_cols].groupby( "item_id").sum().reset_index().values): if i >= 150 and i <= 180:
import luigi
import numpy as np
import pandas as pd
import kaggle_disaster_tweets_gokart

# Register the project's parameter file with luigi and fix the NumPy seed.
luigi.configuration.LuigiConfigParser.add_config_path("./conf/param.ini")
np.random.seed(42)

# %%
# gokart.run(["tweet.MakeEnsembleModel", "--rerun"])

# %%
from thunderbolt import Thunderbolt

# Open the task outputs stored under ./resource and request the task table.
tb = Thunderbolt("./resource")
tb.get_task_df()

# %%
import numpy as np

# %%
import re
import regex
import swifter

# Load the output of the "MakeTrainSelectedFeatureData" task.
df: pd.DataFrame = tb.get_data("MakeTrainSelectedFeatureData")

# %%
# %%
import numpy as np
import sys
import os

# Extend the import path with the directory two levels above the working
# directory before importing the project package.
sys.path.append(os.getcwd() + "/../..")
from kaggle_m5_forecasting.cv_result import CVResults
from kaggle_m5_forecasting.data.load_data import RawData
from kaggle_m5_forecasting.cv_dashboard import create_dashboard
from thunderbolt import Thunderbolt

# Timestamp identifying which stored CV results to load.
TIMESTAMP = "2020-06-18_23:14:13"

cv = CVResults().from_timestamp(TIMESTAMP)
tb = Thunderbolt("./../../resource")
raw: RawData = tb.get_data("LoadRawData")

# %%
# Index into cv.results selecting which entry to evaluate.
CV_NUM = 0

dir_name = f"../../output/cv/{TIMESTAMP}/{CV_NUM}"
evaluator = cv.results[CV_NUM].get_evaluator(raw)
# create_dashboard(
#     evaluator, raw, dir_name,
# )

# Report the mean of all scores held by the evaluator.
print(np.mean(evaluator.all_scores))
# %%
import pandas as pd
from thunderbolt import Thunderbolt
import sys
import os

# Extend the import path with the directory two levels above the working
# directory before importing the project package.
sys.path.append(os.getcwd() + "/../..")
from kaggle_m5_forecasting.data.load_data import RawData
from kaggle_m5_forecasting.data.fe_weather import read_weather_data
from kaggle_m5_forecasting.utils import decode_ids

tb = Thunderbolt("./../../resource")

# Place the "MakeData" and "FEWeather" task outputs side by side (column-wise).
data: pd.DataFrame = pd.concat(
    [
        tb.get_data("MakeData"),
        tb.get_data("FEWeather"),
    ],
    axis=1,
)

# Weather table: add a "YYYY-MM-DD" string column named "date", move the
# parsed "date_time" values into an unnamed index, then drop the source column.
weather = read_weather_data("./../../external_data")
weather["date"] = pd.to_datetime(weather["date_time"]).dt.strftime("%Y-%m-%d")
weather.index = pd.to_datetime(weather["date_time"])
weather.index.name = None
weather.drop("date_time", axis=1, inplace=True)

raw: RawData = tb.get_data("LoadRawData")

# %%
# Turn calendar day labels such as "d_12" into the integer 12.
raw.calendar["d"] = raw.calendar["d"].apply(lambda d: int(d.replace("d_", "")))

cat_id = 0

# Mean sales per (d, state_id) for the chosen category, joined to the calendar
# on "d" and then to the weather table on "date" and "state_id".
df: pd.DataFrame = (
    data[data["cat_id"] == cat_id]
    .groupby(["d", "state_id"])["sales"]
    .mean()
    .reset_index()
    .merge(raw.calendar[["d", "date"]], on="d", how="left")
    .merge(weather, on=["date", "state_id"], how="left")
)
import matplotlib.pyplot as plt import sklearn.metrics import catch22 import seaborn as sns from thunderbolt import Thunderbolt import scipy import sklearn.preprocessing import sklearn.cluster import sys import os sys.path.append(os.getcwd() + "/../..") from kaggle_m5_forecasting.utils import timer tb = Thunderbolt("./../../resource") data: pd.DataFrame = tb.get_data("MakeData") data = data[data.d < 1942] # %% with timer("calc grouped aggregates"): grouped = data.groupby(["id"])["sales"].agg({ "mean": lambda x: x.dropna().values.mean(), "percentile25": lambda x: x.dropna().sort_values()[:int(len(x) * 0.25)].mean(), "percentile50": lambda x: x.dropna().sort_values()[int(len(x) * 0.25):int( len(x) * 0.5)].mean(), "percentile75":
# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from thunderbolt import Thunderbolt
import sys
import os

# Extend the import path with the directory two levels above the working
# directory before importing the project package.
sys.path.append(os.getcwd() + "/../..")
from kaggle_m5_forecasting.utils import timer

tb = Thunderbolt("./../../resource")
data: pd.DataFrame = tb.get_data("MakeData")

# %%
with timer("calc rolling_store_id_cat_id_mean"):
    lag = 28
    w_size = 30
    # Within each (store_id, cat_id) group: shift sales by `lag` rows, then
    # take the mean over a rolling window of `w_size` rows.
    data["fe_rolling_store_id_cat_id_mean"] = (
        data.groupby(["store_id", "cat_id"])["sales"]
        .transform(lambda series: series.shift(lag).rolling(w_size).mean())
    )

# %%
# Re-open the resource directory and fetch the "FERollingGroupMean" task output.
tb = Thunderbolt("./../../resource")
tb.get_data("FERollingGroupMean")