import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from mlflow_extend import mlflow


def main():
    with mlflow.start_run():
        # mlflow native APIs
        mlflow.log_param("param", 0)
        mlflow.log_metric("metric", 1.0)

        # flatten dict
        mlflow.log_params_flatten({"a": {"b": 0}})
        mlflow.log_metrics_flatten({"a": {"b": 0.0}})

        # dict
        mlflow.log_dict({"a": 0}, "dict.json")

        # numpy array
        mlflow.log_numpy(np.array([0]), "array.npy")

        # pandas dataframe
        mlflow.log_df(pd.DataFrame({"a": [0]}), "df.csv")

        # matplotlib figure
        fig, ax = plt.subplots()
        ax.plot([0, 1], [0, 1])
        mlflow.log_figure(fig, "figure.png")

        # confusion matrix
        mlflow.log_confusion_matrix([[1, 2], [3, 4]])
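
# `log_params_flatten` / `log_metrics_flatten` record a nested dict under flat
# key names. For reference, a minimal flatten helper that is assumed to match
# that behaviour (dot-joined keys); this is an illustrative sketch, not
# mlflow_extend's actual implementation.
def flatten_dict(d, parent_key="", sep="."):
    """Flatten a nested dict, e.g. {"a": {"b": 0}} -> {"a.b": 0} (sketch)."""
    items = {}
    for key, value in d.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            items.update(flatten_dict(value, new_key, sep))
        else:
            items[new_key] = value
    return items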
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from mlflow_extend import mlflow


def main():
    config = {
        "split": {"test_size": 0.2, "random_state": 42},
        "model": {"objective": "binary", "metric": "auc", "seed": 42},
        "fit": {"num_boost_round": 10, "early_stopping_rounds": 3},
    }

    # Prepare training data.
    X, y = breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(X, y, **config["split"])
    train_set = lgb.Dataset(X_train, label=y_train)

    # Set experiment.
    expr_name = "lightgbm"
    mlflow.get_or_create_experiment(expr_name)  # EX
    mlflow.set_experiment(expr_name)

    with mlflow.start_run():
        # Log training configuration.
        mlflow.log_params_flatten(config)  # EX
        mlflow.log_dict(config, "config.json")  # EX

        # Train model.
        model = lgb.train(
            config["model"],
            train_set,
            valid_sets=[train_set],
            valid_names=["train"],
            **config["fit"],
        )

        # Log feature importance.
        importance_type = "gain"
        features = model.feature_name()
        importances = model.feature_importance(importance_type)
        mlflow.log_feature_importance(features, importances, importance_type)  # EX

        # Log the best scores of the training metrics.
        mlflow.log_metrics_flatten(model.best_score)

        # Log confusion matrix.
        y_proba = model.predict(X_test)
        cm = confusion_matrix(y_test, y_proba > 0.5)
        mlflow.log_confusion_matrix(cm)  # EX
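
# `breast_cancer` is a helper defined outside this snippet. A plausible minimal
# version (an assumption, not the original) simply wraps scikit-learn's bundled
# dataset and returns the feature matrix and binary labels:
from sklearn.datasets import load_breast_cancer


def breast_cancer():
    """Return (X, y) for the breast cancer dataset as numpy arrays (sketch)."""
    data = load_breast_cancer()
    return data.data, data.target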
def main():
    calendar, prices, sales, submission = read_data()

    num_items = sales.shape[0]
    pred_days = submission.shape[1] - 1  # 28

    # Encode the categorical columns of calendar, sales and prices as integer codes.
    calendar = encoder_category(
        calendar,
        ["event_name_1", "event_type_1", "event_name_2", "event_type_2"],
    ).pipe(reduce_mem_usage)
    sales = encoder_category(
        sales,
        ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
    ).pipe(reduce_mem_usage)
    prices = encoder_category(
        prices,
        ["item_id", "store_id"],
    ).pipe(reduce_mem_usage)

    # d_thresh keeps only roughly the last two years of history to limit memory usage.
    data = reshape_sales(
        sales,
        submission,
        pred_days,
        d_thresh=1941 - int(365 * 2),
        verbose=False,
    )
    del sales

    calendar["d"] = extract_num(calendar["d"])
    data = merge_calendar(data, calendar)
    data = merge_prices(data, prices)
    del calendar, prices
    gc.collect()

    data = reduce_mem_usage(data)

    data = add_demand_features(data, pred_days).pipe(reduce_mem_usage)
    data = add_price_features(data).pipe(reduce_mem_usage)
    dt_col = "date"
    data = add_time_features(data, dt_col).pipe(reduce_mem_usage)

    data = data.sort_values("date")

    print("start date:", data[dt_col].min())
    print("end date:", data[dt_col].max())
    print("data shape:", data.shape)

    # Stage 2: cross-validation setup, features, and train/test split.
    day_col = "d"
    cv_params = {
        "n_splits": 3,
        "train_days": int(365 * 1.5),
        "test_days": pred_days,
        "day_col": day_col,
        "pred_days": pred_days,
    }
    cv = CustomTimeSeriesSpliter(**cv_params)

    sample = data.iloc[::1000][[day_col, dt_col]].reset_index(drop=True)
    show_cv_days(cv, sample, dt_col, day_col)
    plot_cv_indices(cv, sample, dt_col)
    del sample
    gc.collect()

    features = [
        "item_id", "dept_id", "cat_id", "store_id", "state_id",
        "event_name_1", "event_type_1", "event_name_2", "event_type_2",
        "snap_CA", "snap_TX", "snap_WI",
        "sell_price",
        # demand features
        "shift_t28", "shift_t29", "shift_t30",
        # std
        "rolling_std_t7", "rolling_std_t30", "rolling_std_t60",
        "rolling_std_t90", "rolling_std_t180",
        # mean
        "rolling_mean_t7", "rolling_mean_t30", "rolling_mean_t60",
        "rolling_mean_t90", "rolling_mean_t180",
        # min
        "rolling_min_t7", "rolling_min_t30", "rolling_min_t60",
        # max
        "rolling_max_t7", "rolling_max_t30", "rolling_max_t60",
        # others
        "rolling_skew_t30", "rolling_kurt_t30",
        # price features
        "price_change_t1", "price_change_t365",
        "rolling_price_std_t7", "rolling_price_std_t30",
        # time features
        "year", "quarter", "month", "week", "day", "dayofweek", "is_weekend",
    ]

    is_train = data["d"] < 1914

    # Attach "d" to x_train for cross validation.
    x_train = data[is_train][[day_col] + features].reset_index(drop=True)
    y_train = data[is_train]["demand"].reset_index(drop=True)
    x_test = data[~is_train][features].reset_index(drop=True)

    # Keep "id" and "date" to build the submission later.
    id_date = data[~is_train][["id", "date"]].reset_index(drop=True)

    del data
    gc.collect()

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)

    bst_params = {
        "boosting_type": "gbdt",
        "metric": "rmse",
        "objective": "regression",
        "n_jobs": -1,
        "seed": 42,
        "learning_rate": 0.1,
        "bagging_fraction": 0.75,
        "bagging_freq": 10,
        "colsample_bytree": 0.75,
    }

    fit_params = {
        "num_boost_round": 100_000,
        "early_stopping_rounds": 50,
        "verbose_eval": 100,
    }

    # Train one model per CV fold ("d" is only used for splitting, not as a feature).
    models = train_lgb(bst_params, fit_params, x_train, y_train, cv,
                       drop_when_train=[day_col])

    del x_train, y_train
    gc.collect()

    # Average predictions and feature importances over the folds.
    imp_type = "gain"
    importances = np.zeros(x_test.shape[1])
    preds = np.zeros(x_test.shape[0])

    for model in models:
        preds += model.predict(x_test)
        importances += model.feature_importance(imp_type)

    preds = preds / cv.get_n_splits()
    importances = importances / cv.get_n_splits()

    with mlflow.start_run():
        mlflow.log_params_flatten({"bst": bst_params, "fit": fit_params, "cv": cv_params})
        features = models[0].feature_name()
        fig = mplt.feature_importance(features, importances, imp_type, limit=30)
        plt.show()

    make_submission(id_date.assign(demand=preds), submission, pred_days)
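
# `train_lgb` is defined elsewhere in this project. The sketch below is an
# assumption about its interface and behaviour (one LightGBM model per CV fold,
# dropping the columns listed in `drop_when_train` before fitting); it is an
# illustration, not the original implementation.
import lightgbm as lgb


def train_lgb(bst_params, fit_params, x_train, y_train, cv, drop_when_train=None):
    """Train one model per fold of `cv` and return the fitted models (sketch)."""
    drop_when_train = drop_when_train or []
    models = []

    for idx_train, idx_valid in cv.split(x_train):
        X_trn = x_train.iloc[idx_train].drop(drop_when_train, axis=1)
        X_val = x_train.iloc[idx_valid].drop(drop_when_train, axis=1)
        train_set = lgb.Dataset(X_trn, label=y_train.iloc[idx_train])
        valid_set = lgb.Dataset(X_val, label=y_train.iloc[idx_valid])

        model = lgb.train(
            bst_params,
            train_set,
            valid_sets=[train_set, valid_set],
            valid_names=["train", "valid"],
            **fit_params,
        )
        models.append(model)

    return models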
preds = np.zeros(X_test.shape[0])
importances = np.zeros(X_test.shape[1])

for model in models:
    preds += model.predict(X_test)
    importances += model.feature_importance(imp_type)

preds = preds / cv.get_n_splits()
importances = importances / cv.get_n_splits()

#%%
from mlflow_extend import mlflow, plotting as mplt

with mlflow.start_run():
    mlflow.log_params_flatten({"bst": bst_params, "fit": fit_params, "cv": cv_params})
    features = models[0].feature_name()
    _ = mplt.feature_importance(features, importances, imp_type, limit=30)

#%%
def make_submission(test, submission):
    preds = test[["id", "date", "demand"]]
    preds = preds.pivot(index="id", columns="date", values="demand").reset_index()
    preds.columns = ["id"] + ["F" + str(d + 1) for d in range(DAYS_PRED)]
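    # The original snippet is cut off above. The remaining lines are an assumed
    # completion (not the original code): merge the pivoted predictions back into
    # the sample submission and write the result to disk. `pd` is assumed to be
    # pandas imported in an earlier cell.
    vals = submission[["id"]].merge(preds, how="inner", on="id")
    evals = submission[submission["id"].str.endswith("evaluation")]
    final = pd.concat([vals, evals])
    final.to_csv("submission.csv", index=False)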