Beispiel #1
0
def main():
    """Exercise the logging helpers exposed on ``mlflow`` inside one run.

    Every call below happens within a single ``mlflow.start_run()`` context:
    the two native calls (``log_param`` / ``log_metric``) first, followed by
    the helpers for nested dicts, a dict artifact, a numpy array, a pandas
    dataframe, a matplotlib figure and a confusion matrix.
    """
    with mlflow.start_run():
        # Native mlflow calls: one scalar parameter and one scalar metric.
        mlflow.log_param("param", 0)
        mlflow.log_metric("metric", 1.0)

        # Nested dictionaries handed to the *_flatten helpers.
        nested_params = {"a": {"b": 0}}
        nested_metrics = {"a": {"b": 0.0}}
        mlflow.log_params_flatten(nested_params)
        mlflow.log_metrics_flatten(nested_metrics)

        # A plain dictionary logged under the name "dict.json".
        payload = {"a": 0}
        mlflow.log_dict(payload, "dict.json")

        # A one-element numpy array logged under the name "array.npy".
        array = np.array([0])
        mlflow.log_numpy(array, "array.npy")

        # A one-row pandas dataframe logged under the name "df.csv".
        frame = pd.DataFrame({"a": [0]})
        mlflow.log_df(frame, "df.csv")

        # A matplotlib figure holding a single line from (0, 0) to (1, 1).
        figure, axes = plt.subplots()
        axes.plot([0, 1], [0, 1])
        mlflow.log_figure(figure, "figure.png")

        # A 2x2 confusion matrix given as nested lists.
        matrix = [[1, 2], [3, 4]]
        mlflow.log_confusion_matrix(matrix)
def main():
    """Train a binary model with ``lgb.train`` and track the run in mlflow.

    The data returned by ``breast_cancer()`` is split into train and test
    parts, a booster is trained on the training part, and the configuration,
    feature importance, best scores and a test-set confusion matrix are
    logged to the "lightgbm" experiment.
    """
    # One sub-dictionary per stage; each is unpacked into the matching call.
    split_cfg = {"test_size": 0.2, "random_state": 42}
    model_cfg = {"objective": "binary", "metric": "auc", "seed": 42}
    fit_cfg = {"num_boost_round": 10, "early_stopping_rounds": 3}
    config = {"split": split_cfg, "model": model_cfg, "fit": fit_cfg}

    # Load the data and hold out a test portion.
    X, y = breast_cancer()
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_cfg)
    training_data = lgb.Dataset(X_train, label=y_train)

    # Make sure the experiment exists, then select it.
    experiment = "lightgbm"
    mlflow.get_or_create_experiment(experiment)  # EX
    mlflow.set_experiment(experiment)

    with mlflow.start_run():
        # Record the configuration both as flattened params and as an artifact.
        mlflow.log_params_flatten(config)  # EX
        mlflow.log_dict(config, "config.json")  # EX

        # Fit the booster; the training set is also the only validation set.
        booster = lgb.train(
            model_cfg,
            training_data,
            valid_sets=[training_data],
            valid_names=["train"],
            **fit_cfg,
        )

        # Log the "gain" feature importance.
        importance_kind = "gain"
        mlflow.log_feature_importance(
            booster.feature_name(),
            booster.feature_importance(importance_kind),
            importance_kind,
        )  # EX

        # Log the booster's best scores as flattened metrics.
        mlflow.log_metrics_flatten(booster.best_score)

        # Threshold the test-set predictions at 0.5 and log the confusion matrix.
        probabilities = booster.predict(X_test)
        predicted_positive = probabilities > 0.5
        mlflow.log_confusion_matrix(
            confusion_matrix(y_test, predicted_positive))  # EX
Beispiel #3
0
def main():
    """Run the forecasting pipeline from raw tables to a submission.

    Steps, in order:
      1. Load calendar, prices, sales and submission tables via ``read_data``.
      2. Encode categorical columns, reshape the sales table, merge calendar
         and prices into it, and add demand, price and time features.
      3. Build a ``CustomTimeSeriesSpliter`` and display its folds on a
         thinned sample of the data.
      4. Split rows into train (``d < 1914``) and test, train models with
         ``train_lgb`` and average their test predictions and importances.
      5. Log the parameters to mlflow, plot feature importance and pass the
         predictions to ``make_submission``.

    Intermediate frames are deleted and ``gc.collect()`` is called between
    stages to release memory before the next stage allocates.
    """
    calendar, prices, sales, submission = read_data()
    # NOTE(review): num_items is assigned but never read in this function.
    num_items = sales.shape[0]
    # Number of days to predict: all submission columns except one.
    pred_days = submission.shape[1] - 1  # 28 per the original author's note

    # Encode the listed categorical columns of calendar, sales and prices
    # (presumably into numeric codes — confirm against encoder_category),
    # then pass each frame through reduce_mem_usage.
    # NOTE(review): `verbose` is not defined inside this function; it is
    # presumably a module-level global — confirm it exists at call time.
    calendar = encoder_category(
        calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"
                   ]).pipe(reduce_mem_usage, verbose)
    sales = encoder_category(
        sales,
        ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
    ).pipe(reduce_mem_usage, verbose)
    prices = encoder_category(prices,
                              ["item_id", "store_id"]).pipe(reduce_mem_usage, verbose)

    # d_thresh evaluates to 1941 - 730 = 1211.
    # NOTE(review): the original author questioned this threshold; its
    # rationale is not documented here — confirm against reshape_sales.
    data = reshape_sales(sales, submission, pred_days, d_thresh=1941 -
                         int(365 * 2), verbose=False)
    del sales

    # Replace calendar["d"] with the output of extract_num before merging
    # (presumably the numeric day index — confirm against extract_num).
    calendar["d"] = extract_num(calendar["d"])
    data = merge_calendar(data, calendar)
    data = merge_prices(data, prices)
    # The raw tables are merged into `data`; release them.
    del calendar, prices
    gc.collect()
    data = reduce_mem_usage(data)
    # Feature engineering; each step is followed by reduce_mem_usage.
    data = add_demand_features(data, pred_days).pipe(reduce_mem_usage)
    data = add_price_features(data).pipe(reduce_mem_usage)
    dt_col = "date"
    data = add_time_features(data, dt_col).pipe(reduce_mem_usage)
    # Order rows by date before the train/test split below.
    data = data.sort_values("date")

    print("start date:", data[dt_col].min())
    print("end date:", data[dt_col].max())
    print("data shape:", data.shape)

    # Stage 2: cross-validation setup.
    day_col = "d"
    cv_params = {"n_splits": 3, "train_days": int(
        365 * 1.5), "test_days": pred_days, "day_col": day_col, "pred_days": pred_days}
    cv = CustomTimeSeriesSpliter(**cv_params)
    # Every 1000th row, day and date columns only; used just to display folds.
    sample = data.iloc[::1000][[day_col, dt_col]].reset_index(drop=True)
    show_cv_days(cv, sample, dt_col, day_col)
    plot_cv_indices(cv, sample, dt_col)
    del sample
    gc.collect()

    # Columns used as model inputs.
    features = [
        "item_id",
        "dept_id",
        "cat_id",
        "store_id",
        "state_id",
        "event_name_1",
        "event_type_1",
        "event_name_2",
        "event_type_2",
        "snap_CA",
        "snap_TX",
        "snap_WI",
        "sell_price",
        # demand features
        "shift_t28",
        "shift_t29",
        "shift_t30",
        # std
        "rolling_std_t7",
        "rolling_std_t30",
        "rolling_std_t60",
        "rolling_std_t90",
        "rolling_std_t180",
        # mean
        "rolling_mean_t7",
        "rolling_mean_t30",
        "rolling_mean_t60",
        "rolling_mean_t90",
        "rolling_mean_t180",
        # min
        "rolling_min_t7",
        "rolling_min_t30",
        "rolling_min_t60",
        # max
        "rolling_max_t7",
        "rolling_max_t30",
        "rolling_max_t60",
        # others
        "rolling_skew_t30",
        "rolling_kurt_t30",
        # price features
        "price_change_t1",
        "price_change_t365",
        "rolling_price_std_t7",
        "rolling_price_std_t30",
        # time features
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
        "is_weekend",
    ]
    # Rows with d below 1914 are training rows; all other rows are test rows.
    is_train = data["d"] < 1914

    # Keep the day column in x_train so the splitter can use it; it is passed
    # to train_lgb as drop_when_train (presumably dropped before fitting).
    x_train = data[is_train][[day_col] + features].reset_index(drop=True)
    # "demand" is the training target.
    y_train = data[is_train]["demand"].reset_index(drop=True)
    x_test = data[~is_train][features].reset_index(drop=True)
    # Keep id and date of the test rows; they are joined with the predictions
    # when building the submission.
    id_date = data[~is_train][["id", "date"]].reset_index(drop=True)
    del data
    gc.collect()

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    bst_params = {"boosting_type": "gbdt", "metric": "rmse",
                  "objective": "regression", "n_jobs": -1, "seed": 42, "learning_rate": 0.1,
                  "bagging_fraction": 0.75, "bagging_freq": 10, "colsample_bytree": 0.75}
    fit_params = {"num_boost_round": 100_000,
                  "early_stopping_rounds": 50, "verbose_eval": 100}
    models = train_lgb(bst_params, fit_params, x_train,
                       y_train, cv, drop_when_train=[day_col])
    # Training data is no longer needed once the models exist.
    del x_train, y_train
    gc.collect()

    # Accumulate test predictions and "gain" importances over all models.
    imp_type = "gain"
    importances = np.zeros(x_test.shape[1])
    preds = np.zeros(x_test.shape[0])
    for model in models:
        preds += model.predict(x_test)
        importances += model.feature_importance(imp_type)
    # Turn the sums into averages.
    # NOTE(review): this relies on cv.get_splits() returning the number of
    # models; the scikit-learn splitter convention is get_n_splits() —
    # confirm that CustomTimeSeriesSpliter defines get_splits().
    preds = preds / cv.get_splits()
    importances = importances / cv.get_splits()

    # Only the flattened parameter dictionaries are logged inside the run.
    with mlflow.start_run():
        mlflow.log_params_flatten(
            {"bst": bst_params, "fit": fit_params, "cv": cv_params})

    # Rebind `features` to the names reported by the first trained model.
    features = models[0].feature_name()
    # NOTE(review): fig is assigned but never read afterwards.
    fig = mplt.feature_importance(features, importances, imp_type, limit=30)
    plt.show()

    # Attach the averaged predictions as "demand" and build the submission.
    make_submission(id_date.assign(demand=preds), submission, pred_days)
Beispiel #4
0
# Ensemble the trained models: accumulate every model's test predictions and
# "gain" feature importances, then average both over the CV split count.
imp_type = "gain"
preds = np.zeros(X_test.shape[0])
importances = np.zeros(X_test.shape[1])

for model in models:
    importances += model.feature_importance(imp_type)
    preds += model.predict(X_test)

importances = importances / cv.get_n_splits()
preds = preds / cv.get_n_splits()

#%%
from mlflow_extend import mlflow, plotting as mplt

# Record the booster, fit and CV settings as flattened run parameters.
with mlflow.start_run():
    mlflow.log_params_flatten(
        {"bst": bst_params, "fit": fit_params, "cv": cv_params}
    )

# Plot the averaged importances against the first model's feature names;
# the result is bound to `_` because it is not used afterwards.
features = models[0].feature_name()
_ = mplt.feature_importance(features, importances, imp_type, limit=30)

#%%


def make_submission(test, submission):
    preds = test[["id", "date", "demand"]]
    preds = preds.pivot(index="id", columns="date",