def run(self):
    """Train the sales_is_zero classifier for every CV fold and save artifacts.

    For each fold: build LightGBM datasets, fit the classifier, predict the
    28-day validation window, and pickle both the model and the prediction
    frame under ./output/cv_cls/<start_time>/<cv_num>/.

    Fix: pickle files are now written via context managers — the original
    `pickle.dump(obj, open(path, "wb"))` never closed the file handles.
    """
    config = Config()
    run_name = get_run_name()
    splits: List[Split] = self.load("splits")
    splits = delete_unused_features(splits)
    if config.DROP_OUTLIERS:
        splits = drop_outliers(splits)
    experiment_id = start_mlflow("cv_cls")
    mlflow.start_run(experiment_id=experiment_id, run_name=run_name)
    # MLflow reports start_time in milliseconds; convert to a path-safe stamp.
    timestamp = mlflow.active_run().info.start_time / 1000
    start_time = datetime.datetime.fromtimestamp(timestamp).strftime(
        "%Y-%m-%d_%H:%M:%S")
    log_params()
    for cv_num, sp in enumerate(splits):
        file_dir = f"./output/cv_cls/{start_time}/{cv_num}"
        Path(file_dir).mkdir(parents=True, exist_ok=True)
        train_set, val_set = convert_to_lgb_dataset(sp, cv_num)
        model = train_cls(
            cv_num, config.lgbm_cls_params, train_set, [val_set], 10, 20
        )
        df_val = predict_cls(sp, cv_num, model, val_set)
        with open(f"{file_dir}/model.pkl", "wb") as f:
            pickle.dump(model, f)
        with open(f"{file_dir}/df_val.pkl", "wb") as f:
            pickle.dump(df_val, f)
    mlflow.end_run()
def log_metrics(
    cv_num: int,
    start_time: str,
    raw: RawData,
    test_pred: pd.DataFrame,
    test_true: pd.DataFrame,
) -> Tuple[float, float, float]:
    """Compute WRMSSE/RMSE/MAE for one CV fold, print them, and log to MLflow.

    Returns the (wrmsse, rmse, mae) triple for the 28-day evaluation window
    starting at CV_START_DAYS[cv_num].
    """
    config = Config()
    d_start = config.CV_START_DAYS[cv_num]
    d_end = config.CV_START_DAYS[cv_num] + 28
    # Restrict predictions and ground truth to the 28-day evaluation window.
    pred_window = test_pred[(test_pred.d >= d_start) & (test_pred.d < d_end)]
    true_window = test_true[(test_true.d >= d_start) & (test_true.d < d_end)]
    cv_result = CVResult(
        cv_num=cv_num,
        config=config,
        test_pred=pred_window,
    )
    evaluator = cv_result.get_evaluator(raw)
    cv_result.create_dashboard(raw, f"./output/cv/{start_time}/{cv_num}")
    y_pred = pred_window[config.TARGET]
    y_true = true_window[config.TARGET]
    wrmsse = np.mean(evaluator.all_scores)
    rmse = np.sqrt(sklearn.metrics.mean_squared_error(y_true, y_pred))
    mae = sklearn.metrics.mean_absolute_error(y_true, y_pred)
    print(f"==========CV No: {cv_num}=================")
    print("WRMSSE", wrmsse)
    print("RMSE", rmse)
    print("MAE", mae)
    print("=================================")
    mlflow.log_metric(f"WRMSSE_{cv_num}", wrmsse)
    mlflow.log_metric(f"RMSE_{cv_num}", rmse)
    mlflow.log_metric(f"MAE_{cv_num}", mae)
    return wrmsse, rmse, mae
def run(self):
    """Build catch22 target-encoding features on the full training span.

    Uses rows with START_DAY < d <= 1913 (the public training horizon) and
    dumps the list of (group_key, grouped_df) pairs.
    """
    config = Config()
    data: pd.DataFrame = self.load()
    in_train = (data.d > config.START_DAY) & (data.d <= 1913)
    train_df: pd.DataFrame = data[in_train]
    result: List[Tuple[List[str], pd.DataFrame]] = target_encoding_catch22(train_df)
    self.dump(result)
def run(self):
    """Build catch22 target-encoding features once per CV fold.

    For each CV start day, the encoding is fit only on days strictly before
    that fold's window to avoid leaking future information.
    """
    config = Config()
    data: pd.DataFrame = self.load("data")
    results: List[List[Tuple[List[str], pd.DataFrame]]] = []
    for end_day in config.CV_START_DAYS:
        fold_mask = (data.d > config.START_DAY) & (data.d < end_day)
        train_df: pd.DataFrame = data[fold_mask]
        results.append(target_encoding_catch22(train_df))
    self.dump(results)
def predict_cls(
    sp: Split, cv_num: int, model: LGBMClassifier, val_set: lgb.Dataset
) -> pd.DataFrame:
    """Predict P(sales == 0) over the 28-day validation window of one fold.

    Returns a frame with id/d/sales_is_zero plus the predicted probability
    column `sales_is_zero_pred`.
    """
    config = Config()
    d_start = config.CV_START_DAYS[cv_num]
    in_window = (sp.test.d >= d_start) & (sp.test.d < d_start + 28)
    df_val = sp.test[in_window][["id", "d", "sales_is_zero"]]
    # Column 1 of predict_proba is the positive class (sales_is_zero == 1).
    df_val["sales_is_zero_pred"] = model.predict_proba(val_set.data)[:, 1]
    return df_val
def delete_unused_features(splits: List[Split]) -> List[Split]:
    """Keep only id/d/target plus configured features; trim train to START_DAY.

    Mutates the Split objects in place and returns the same list. When
    DROP_NA is set, rows with any NaN are also removed from train.
    """
    config = Config()
    keep_cols = ["id", "d", config.TARGET] + config.features
    for cv_num, sp in enumerate(splits):
        sp.train = sp.train[keep_cols]
        sp.test = sp.test[keep_cols]
        sp.train = sp.train[sp.train["d"] >= config.START_DAY]
        print(f"CV{cv_num} train shape:", sp.train.shape)
        if config.DROP_NA:
            sp.train = sp.train.dropna()
            print(f"CV{cv_num} NA dropped train shape:", sp.train.shape)
    return splits
def cls_postprocessing(cv_num: int, test_pred: pd.DataFrame) -> pd.DataFrame:
    """Force predicted sales to 0 where the classifier is confident sales == 0.

    Loads the per-fold classifier validation frame (df_val) saved by the
    cv_cls run, selects rows with sales_is_zero_pred >= CLS_THRESHOLD, and
    zeroes the matching (id, d) rows of `test_pred` in place.

    Fixes: annotation typo `pd.dataframe` -> `pd.DataFrame`; load the df_val
    for THIS fold — the original hard-coded fold 0 and ignored `cv_num`
    (the cv_cls pipeline writes one df_val.pkl per fold); close the pickle
    file via a context manager.
    """
    with timer("cls_postprocessing"):
        config = Config()
        path = f"./output/cv_cls/{config.CLS_TIMESTAMP}/{cv_num}/df_val.pkl"
        with open(path, "rb") as f:
            df_val: pd.DataFrame = pickle.load(f)
        # Composite key joining prediction rows to classifier rows.
        test_pred["tmp_id"] = (
            test_pred["id"].astype(str) + "_" + test_pred["d"].astype(str)
        )
        df_val = df_val[df_val["sales_is_zero_pred"] >= config.CLS_THRESHOLD]
        tmp_ids = df_val["id"].astype(str) + "_" + df_val["d"].astype(str)
        test_pred.loc[test_pred["tmp_id"].isin(tmp_ids), "sales"] = 0
        test_pred.drop(["tmp_id"], axis=1, inplace=True)
    return test_pred
def convert_to_lgb_dataset(sp: Split, cv_num: int) -> Tuple[lgb.Dataset, lgb.Dataset]:
    """Wrap a Split into LightGBM train/validation Datasets for one CV fold.

    The validation set is the 28-day window starting at CV_START_DAYS[cv_num].
    """
    config = Config()
    train_set = lgb.Dataset(sp.train[config.features], sp.train[config.TARGET])
    d_start = config.CV_START_DAYS[cv_num]
    # Slice the validation window once instead of repeating the mask.
    val_rows = sp.test[(sp.test.d >= d_start) & (sp.test.d < d_start + 28)]
    val_set = lgb.Dataset(val_rows[config.features], val_rows[config.TARGET])
    return train_set, val_set
def run(self):
    """Build the single holdout SplitIndex: train up to d=1913, test after.

    Test rows start MAX_LAGS days before day 1914 so that lag/rolling
    features can be computed for the prediction horizon.
    """
    data: pd.DataFrame = self.load()
    config = Config()
    sp_idx: SplitIndex = SplitIndex()
    train_mask = (data.d >= config.START_DAY) & (data.d <= 1913)
    sp_idx.train = list(data[train_mask].index)
    sp_idx.test = list(data[data.d > 1913 - config.MAX_LAGS].index)
    print("train shape:", data.iloc[sp_idx.train, :].shape)
    print("test shape:", data.iloc[sp_idx.test, :].shape)
    self.dump(sp_idx)
def log_params():
    """Log run configuration and the feature list to MLflow."""
    config = Config()
    mlflow.lightgbm.autolog()
    # (mlflow key, value) pairs; note "start_day" is deliberately lower-case.
    param_pairs = [
        ("MIN_SUM", config.MIN_SUM),
        ("MAX_LAGS", config.MAX_LAGS),
        ("start_day", config.START_DAY),
        ("SEED", config.SEED),
        ("DROP_NA", config.DROP_NA),
        ("DROP_OUTLIERS", config.DROP_OUTLIERS),
        ("CV_SAMPLE_RATE", config.CV_SAMPLE_RATE),
        ("MODEL", config.MODEL),
        ("CLS_POSTPROCESSING", config.CLS_POSTPROCESSING),
        ("CLS_TIMESTAMP", config.CLS_TIMESTAMP),
        ("CLS_THRESHOLD", config.CLS_THRESHOLD),
    ]
    for key, value in param_pairs:
        mlflow.log_param(key, value)
    mlflow.log_param("features", ",\n".join([f"'{f}'" for f in config.features]))
def run(self):
    """Target-encode the full training span and dump only the fe_te_* columns.

    Fits the encoding on START_DAY < d <= 1913, left-joins every grouped
    frame back onto the data, then keeps just the generated fe_te_ columns
    (memory-reduced) as the output.
    """
    config = Config()
    data: pd.DataFrame = pd.concat(
        [self.load("data"), self.load("fe_event")], axis=1)
    train_df: pd.DataFrame = data[(data.d > config.START_DAY) & (data.d <= 1913)]
    with timer("create grouped df"):
        grouped: List[Tuple[List[str], pd.DataFrame]] = target_encoding(train_df)
    with timer("merge into data"):
        for group_key, grouped_df in tqdm(grouped):
            data = data.merge(grouped_df, on=group_key, how="left")
    df = reduce_mem_usage(data.filter(like="fe_te_"))
    print(df.info())
    self.dump(df)
def run(self):
    """Build one SplitIndex per CV fold.

    Train indexes cover all days before the fold's start day; test indexes
    cover the fold window plus MAX_LAGS days of history for lag features.
    Seeds numpy and random up front for reproducibility.
    """
    data: pd.DataFrame = self.load()
    sp_idxs: List[SplitIndex] = []
    config = Config()
    np.random.seed(config.SEED)
    random.seed(config.SEED)
    for cv_start_day in tqdm(config.CV_START_DAYS):
        sp_idx: SplitIndex = SplitIndex()
        sp_idx.train = list(data[data.d < cv_start_day].index)
        test_mask = (data.d >= cv_start_day - config.MAX_LAGS) & (
            data.d < cv_start_day + 28
        )
        sp_idx.test = list(data[test_mask].index)
        sp_idxs.append(sp_idx)
    self.dump(sp_idxs)
def run(self):
    """Assemble per-fold Splits by concatenating all feature frames.

    Column-concatenates the base data with every engineered feature frame,
    then, fold by fold, attaches that fold's target-encoding columns and
    slices train/test rows via the precomputed SplitIndex lists.
    """
    with timer("combine val features"):
        with timer("concat features"):
            # All frames are row-aligned with `data`, so axis=1 concat is a
            # pure column join.
            data: pd.DataFrame = pd.concat(
                [
                    self.load("data"),
                    self.load("fe_price_rolling"),
                    self.load("fe_price_change"),
                    self.load("fe_price_basic"),
                    self.load("fe_shift"),
                    self.load("fe_rolling_mean"),
                    self.load("fe_rolling_dw_mean"),
                    self.load("fe_rolling_group_mean"),
                    self.load("fe_rolling_group_std"),
                    self.load("fe_rolling_std"),
                    self.load("fe_rolling_skew"),
                    self.load("fe_rolling_kurt"),
                    self.load("fe_weather"),
                    self.load("fe_unemployment"),
                    self.load("fe_stock"),
                    self.load("fe_event"),
                    self.load("fe_event_strength"),
                    self.load("fe_catch22_pca"),
                ],
                axis=1,
            )
        with timer("merge target features"):
            config = Config()
            # One target-encoding frame per CV fold (leak-free per fold).
            te_val_data: List[pd.DataFrame] = self.load("te_val_data")
            splits: List[Split] = []
            sp_idxs: List[SplitIndex] = self.load("sp_idxs")
            for i in tqdm(range(len(sp_idxs))):
                sp: Split = Split()
                # Attach this fold's fe_te_* columns before slicing.
                data = pd.concat([data, te_val_data[i]], axis=1)
                sp.train = data.iloc[sp_idxs[i].train, :]
                sp.test = data.iloc[sp_idxs[i].test, :]
                if config.CV_SAMPLE_RATE != 1:
                    # Optional row subsampling of train for faster CV runs.
                    sp.train = sp.train.sample(
                        int(len(sp.train) * config.CV_SAMPLE_RATE))
                splits.append(sp)
                print(sp.train.info())
                # Remove this fold's fe_te_* columns so the next iteration's
                # concat does not create duplicate column labels.
                # NOTE(review): placement reconstructed from flattened source —
                # if this drop ran only once after the loop, later folds would
                # carry duplicated te columns; confirm against history.
                data = data.drop(list(data.filter(like="fe_te_").columns), axis=1)
    self.dump(splits)
def predict(
    cv_num: int, sp: Split, model: lgb.Booster, model_number: Optional[int] = None
) -> pd.DataFrame:
    """Recursively predict the 28-day window for one CV fold.

    Days are predicted in order because lag/rolling features of day d depend
    on the (predicted) sales of earlier days. The true target is preserved
    in a `<TARGET>_true` column before the window is blanked out.

    Note: `model_number` is accepted but not used in this body.
    """
    config = Config()
    d_start: int = config.CV_START_DAYS[cv_num]
    d_end: int = config.CV_START_DAYS[cv_num] + 28
    test_pred = sp.test.copy()
    # Keep ground truth aside, then hide the target inside the window so the
    # rolling features cannot see it.
    test_pred[config.TARGET + "_true"] = test_pred[config.TARGET]
    test_pred.loc[test_pred.d >= d_start, config.TARGET] = np.nan
    for d in tqdm(range(d_start, d_end)):
        # Refresh lag/rolling features using predictions made so far.
        test_pred = make_rolling_for_test(test_pred, d, config.features)
        test_pred.loc[test_pred.d == d, config.TARGET] = model.predict(
            test_pred.loc[test_pred.d == d, config.features]
        )
        # Derived flag consumed by downstream classifier features.
        test_pred.loc[test_pred.d == d, "sales_is_zero"] = (
            test_pred.loc[test_pred.d == d, "sales"] == 0
        ).astype(np.int8)
    return test_pred
def run(self):
    """Per-fold target encoding: fit on days before each CV start day.

    For every CV fold, fits the encoding only on past days, merges the
    grouped frames onto a copy of the data, and keeps just the fe_te_*
    columns (memory-reduced). Dumps one frame per fold.
    """
    config = Config()
    data: pd.DataFrame = pd.concat(
        [self.load("data"), self.load("fe_event")], axis=1)
    dfs: List[pd.DataFrame] = []
    for end_day in config.CV_START_DAYS:
        with timer("create grouped df"):
            # Leak-free: use only days strictly before the fold window.
            train_df: pd.DataFrame = data[data.d < end_day]
            grouped: List[Tuple[List[str], pd.DataFrame]] = target_encoding(train_df)
        with timer("merge into data"):
            df = data.copy()
            for group_key, grouped_df in tqdm(grouped):
                df = df.merge(grouped_df, on=group_key, how="left")
            df = reduce_mem_usage(df.filter(like="fe_te_"))
        print(df.info())
        dfs.append(df)
    self.dump(dfs)
def train(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> lgb.Booster:
    """Train one LightGBM booster for a CV fold inside a named timer.

    `early_stopping_rounds` is accepted for interface compatibility but is
    currently not forwarded to lgb.train (deliberately disabled here).
    """
    config = Config()
    label = f"train CV_{cv_num}"
    if model_number:
        label += f"_{model_number}"
    with timer(label, mlflow_on=True):
        booster = lgb.train(
            params,
            train_set,
            num_boost_round=config.num_boost_round,
            verbose_eval=verbose_eval,
            valid_sets=valid_sets,
        )
    return booster
def log_result(cv_num: int, start_time: str, test_pred: pd.DataFrame):
    """Persist the fold's window predictions and the Config that produced them.

    Saves the 28-day evaluation window (id/hierarchy columns, target and
    target_true) plus a pickle of the config under
    ./output/cv/<start_time>/<cv_num>/.

    Fixes: removed the trailing bare `test_pred` expression (a no-op dead
    statement) and switched to context managers so the pickle file handles
    are closed.
    """
    config = Config()
    d_start = config.CV_START_DAYS[cv_num]
    d_end = config.CV_START_DAYS[cv_num] + 28
    save_cols: List[str] = [
        "id",
        "item_id",
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        "d",
        config.TARGET,
        config.TARGET + "_true",
    ]
    with open(f"./output/cv/{start_time}/{cv_num}/test_pred.pkl", "wb") as f:
        pickle.dump(
            test_pred.loc[(test_pred.d >= d_start) & (test_pred.d < d_end), save_cols],
            f,
        )
    with open(f"./output/cv/{start_time}/{cv_num}/config.pkl", "wb") as f:
        pickle.dump(config, f)
def train_cls(
    cv_num: int,
    params: Dict[str, Any],
    train_set: lgb.Dataset,
    valid_sets: List[lgb.Dataset],
    verbose_eval: int,
    early_stopping_rounds: Optional[int] = None,
    model_number: Optional[int] = None,
) -> LGBMClassifier:
    """Train the sales_is_zero LGBMClassifier for one CV fold.

    Fixes: the `params` and `verbose_eval` arguments were silently ignored —
    the original always used config.lgbm_cls_params and a hard-coded
    verbose=10. The sole existing caller passes exactly those values, so
    behavior is unchanged for current call sites. `early_stopping_rounds`
    remains accepted-but-unused for interface compatibility.
    """
    config = Config()
    timer_name: str = f"train CV_{cv_num}"
    if model_number:
        timer_name += f"_{model_number}"
    with timer(timer_name, mlflow_on=True):
        model = LGBMClassifier(**params)
        model.fit(
            train_set.data,
            train_set.label,
            categorical_feature=config.lgbm_cat_features,
            eval_set=[(dataset.data, dataset.label) for dataset in valid_sets],
            eval_metric="logloss,auc,cross_entropy",
            verbose=verbose_eval,
        )
    return model
def partial_train_and_predict(
    sp: Split,
    ids: pd.Series,
    cv_num: int,
    model_number: int,
    objective: Optional[str] = None,
    SEED: Optional[int] = None,
) -> pd.DataFrame:
    """Train on the subset of rows whose id is in `ids` and predict its window.

    Optionally overrides the objective (dropping the tweedie-specific
    parameter, which is invalid for other objectives) and the seed.

    Fix: the original assigned `params = config.lgbm_params` and then mutated
    it (objective overwrite, pop, seed), leaking those changes into the
    shared config for every later call. We now mutate a shallow copy.
    """
    config = Config()
    sp_part: Split = Split()
    sp_part.train = sp.train[sp.train["id"].isin(ids)]
    sp_part.test = sp.test[sp.test["id"].isin(ids)]
    train_set, val_set = convert_to_lgb_dataset(sp_part, cv_num)
    params = dict(config.lgbm_params)  # copy: never mutate the shared config
    if objective:
        params["objective"] = objective
        params.pop("tweedie_variance_power", None)
    if SEED:
        params["seed"] = SEED
    model = train(
        cv_num,
        params,
        train_set,
        [train_set],
        10,
        20,
        model_number=model_number,
    )
    test_pred = predict(cv_num, sp_part, model)
    return test_pred
def run(self):
    """Seed-sweep CV driver: train per MODEL strategy and log to MLflow.

    For each seed in range(1, 1000, 10), opens an MLflow run, trains each CV
    fold with the configured model strategy, optionally applies classifier
    postprocessing, and persists results.

    Fix: removed an unreachable duplicate `elif config.MODEL == "store"`
    branch (the first "store" branch always matched, so the second — which
    called train_by_store without SEED — was dead code).
    """
    config = Config()
    run_name = get_run_name()
    splits: List[Split] = self.load("splits")
    raw: RawData = self.load("raw")
    splits = delete_unused_features(splits)
    if config.DROP_OUTLIERS:
        splits = drop_outliers(splits)
    print_nan_ratio(splits)
    for SEED in range(1, 1000, 10):
        run_name = "seed = {}".format(SEED)
        experiment_id = start_mlflow()
        mlflow.start_run(experiment_id=experiment_id, run_name=run_name)
        # MLflow start_time is in milliseconds; build a path-safe timestamp.
        timestamp = mlflow.active_run().info.start_time / 1000
        start_time = datetime.datetime.fromtimestamp(timestamp).strftime(
            "%Y-%m-%d_%H:%M:%S")
        log_params()
        for cv_num, sp in enumerate(splits):
            Path(f"./output/cv/{start_time}/{cv_num}").mkdir(parents=True,
                                                             exist_ok=True)
            test_pred: pd.DataFrame = pd.DataFrame()
            if config.MODEL == "zero":
                test_pred = train_by_zero(raw, sp, cv_num)
            elif config.MODEL == "store":
                test_pred = train_by_store(raw, sp, cv_num, SEED)
            elif config.MODEL == "cat":
                test_pred = train_by_cat(raw, sp, cv_num)
            elif config.MODEL == "dept":
                test_pred = train_by_dept(raw, sp, cv_num)
            elif config.MODEL == "normal":
                train_set, val_set = convert_to_lgb_dataset(sp, cv_num)
                model = train(
                    cv_num,
                    config.lgbm_params,
                    train_set,
                    [train_set],
                    verbose_eval=10,
                    early_stopping_rounds=20,
                )
                test_pred = predict(cv_num, sp, model)
            if config.CLS_POSTPROCESSING:
                # Metrics are logged on the raw predictions, before the
                # classifier forces confident-zero rows to 0.
                wrmsse, rmse, mae = log_metrics(cv_num, start_time, raw,
                                                test_pred, sp.test)
                test_pred = cls_postprocessing(cv_num, test_pred)
            log_result(cv_num, start_time, test_pred)
        mlflow.end_run()
        # Give MLflow a moment between runs before starting the next seed.
        time.sleep(10)