Example #1
def main():
    """Train pipeline"""
    model_data = pd.read_csv(FEATURES_DATA)

    print("\tSplitting train and validation data")
    x_train, x_val, y_train, y_val = train_test_split(
        model_data[FEATURE_COLS],
        model_data[TARGET_COL],
        test_size=0.2,
    )

    print("\tTrain model")
    clf = lgb.LGBMClassifier(
        num_leaves=NUM_LEAVES,
        learning_rate=LR,
        n_estimators=N_ESTIMATORS,
    )
    clf.fit(x_train, y_train)
    compute_log_metrics(clf, x_val, y_val)

    print("\tComputing metrics")
    selected = np.random.choice(model_data.shape[0], size=1000, replace=False)
    features = model_data[FEATURE_COLS].iloc[selected]
    inference = clf.predict_proba(features)[:, 1]

    ModelMonitoringService.export_text(
        features=features.items(),
        inference=inference.tolist(),
    )

    print("\tSaving model")
    with open("/artefact/" + OUTPUT_MODEL_NAME, "wb") as model_file:
        pickle.dump(clf, model_file)
Example #2
def trainer(execution_date):
    """Entry point to perform training."""
    print("\nLoad train data")
    data = load_data(TMP_BUCKET + "credit_train/train.csv")
    data = data.fillna(0)
    print("  Train data shape:", data.shape)

    feature_cols = get_feats_to_use()
    train, valid = train_test_split(data, test_size=0.2, random_state=0)
    x_train = train[feature_cols]
    y_train = train[TARGET].values
    x_valid = valid[feature_cols]
    y_valid = valid[TARGET].values

    print("\nTrain model")
    start = time.time()
    clf = get_model()
    clf.fit(x_train,
            y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            eval_metric="auc",
            verbose=200,
            early_stopping_rounds=200)
    print("  Time taken = {:.0f} s".format(time.time() - start))

    print("\nEvaluate")
    compute_log_metrics(clf, x_valid, y_valid)

    print("\nLog model monitoring metrics")
    start = time.time()
    selected = np.random.choice(x_train.shape[0], size=2000, replace=False)
    features = x_train.iloc[selected]
    inference = clf.predict_proba(features)[:, 1]

    ModelMonitoringService.export_text(
        features=features.items(),
        inference=inference.tolist(),
    )
    print("  Time taken = {:.0f} s".format(time.time() - start))

    print("\nSave model")
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        pickle.dump(clf, model_file)

    # Save feature names
    with open(FEATURE_COLS_PATH, "wb") as file:
        pickle.dump(feature_cols, file)

    # To simulate redis, save to artefact
    from shutil import copyfile
    copyfile("data/test.gz.parquet", "/artefact/test.gz.parquet")
Example #3
def main():
    bunch = load_diabetes()
    X_train, X_test, Y_train, Y_test = train_test_split(
        bunch.data, bunch.target)
    model = LinearRegression()
    model.fit(X_train, Y_train)

    print("Score: %.2f" % model.score(X_test, Y_test))
    with open("./model.pkl", "wb") as f:
        pickle.dump(model, f)

    features = zip(bunch.feature_names, X_train.T)
    # features = [("age", [33, 23, 54, ...]), ("sex", [0, 1, 0]), ...]
    ModelMonitoringService.export_text(
        features=features,
        path="./histogram.prom",
    )
Example #4
def main():
    # Extraneous columns (as might be determined through feature selection)
    drop_cols = ['ID']

    # Load into Dataframes
    # x_<name> : features
    # y_<name> : labels
    x_train, y_train = utils.load_dataset(os.path.join(
        'data', 'creditdata', 'creditdata_train_v2.csv'),
                                          drop_columns=drop_cols)
    x_test, y_test = utils.load_dataset(os.path.join('data', 'creditdata',
                                                     'creditdata_test_v2.csv'),
                                        drop_columns=drop_cols)

    # MODEL 1: LOGISTIC REGRESSION
    # Use best parameters from a model selection and threshold tuning process
    best_regularizer = 1e-1
    best_th = 0.43
    model = utils.train_log_reg_model(x_train,
                                      y_train,
                                      seed=0,
                                      C=best_regularizer,
                                      upsample=True,
                                      verbose=True)
    model_name = "logreg_model"
    model_type = ModelTypes.LINEAR

    # TODO - Optional: Uncomment this later
    # MODEL 2: RANDOM FOREST
    # Uses default threshold of 0.5 and model parameters
    # best_th = 0.5
    # model = utils.train_rf_model(x_train, y_train, seed=0, upsample=True, verbose=True)
    # model_name = "randomforest_model"
    # model_type = ModelTypes.TREE

    # If model is in an sklearn pipeline, extract it
    (
        shap_values,
        base_shap_values,
        global_explainability,
        fairness_metrics,
    ) = compute_log_metrics(model=model,
                            x_train=x_train,
                            x_test=x_test,
                            y_test=y_test,
                            best_th=best_th,
                            model_name=model_name,
                            model_type=model_type)

    # TODO - Save the model artefact by filling in the blanks
    # So that the model is viewable on the Bedrock UI
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        pickle.dump(model, model_file)

    # IMPORTANT: LOG TRAINING MODEL ON UI to compare to DEPLOYED MODEL
    train_prob = model.predict_proba(x_train)[:, 1]
    train_pred = np.where(train_prob > best_th, 1, 0)

    # Add the Model Monitoring Service and export the metrics
    ModelMonitoringService.export_text(
        features=x_train.items(),
        inference=train_pred.tolist(),
    )

    print("Done!")
Example #5
def init_background_threads():
    """Global objects with daemon threads will be stopped by gunicorn --preload flag.
    So instantiate them here instead.
    """
    # Initialise the Bedrock Model Monitoring Service
    current_app.monitor = ModelMonitoringService()
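A minimal sketch of one possible wiring for the factory above, assuming a plain Flask app and Flask < 2.3 (where before_first_request is still available); registering it this way defers creation of the ModelMonitoringService until after gunicorn --preload has forked each worker, which is what the docstring is getting at. This wiring is an assumption, not part of the original example.

from flask import Flask

app = Flask(__name__)

# Register the factory (defined above) so it runs once per worker, on the first
# request that worker serves, rather than at import time in the preloaded master.
app.before_first_request(init_background_threads)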
Example #6
import pickle

from bedrock_client.bedrock.metrics.service import ModelMonitoringService
from bedrock_client.bedrock.metrics.collector import BaselineMetricCollector
from flask import Flask, request

with open("./model.pkl", "rb") as f:
    model = pickle.load(f)

monitor = ModelMonitoringService(baseline_collector=BaselineMetricCollector(
    path="./histogram.prom"))

app = Flask(__name__)


@app.route("/", methods=["POST"])
def predict():
    features = request.json
    score = model.predict([features])[0]
    pid = monitor.log_prediction(
        request_body=request.data,
        features=features,
        output=score,
    )
    return {"result": score, "prediction_id": pid}


@app.route("/metrics", methods=["GET"])
def metrics():
    return monitor.export_http()[0]
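A quick smoke test for the service above, as a sketch with assumptions: the app is running on localhost:5000, and the feature vector has to match whatever the pickled model expects, e.g. the 10 diabetes features if it was trained as in Example #3.

import requests

# Hypothetical feature values for a single prediction request.
features = [0.02, 0.05, 0.06, 0.02, -0.04, -0.03, -0.04, -0.002, 0.02, -0.02]

# The "/" route returns the model score and the prediction id logged by the monitor.
resp = requests.post("http://localhost:5000/", json=features)
print(resp.json())

# The "/metrics" route exposes Prometheus-style metrics for scraping.
print(requests.get("http://localhost:5000/metrics").text)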
Example #7
def main():
    # Extraneous columns (as might be determined through feature selection)
    drop_cols = []

    # Load into Dataframes
    # x_<name> : features
    # y_<name> : labels
    x_train, y_train = utils.load_dataset(os.path.join('data',
                                                       'abalone_train.csv'),
                                          target='Type',
                                          drop_columns=drop_cols)
    x_test, y_test = utils.load_dataset(os.path.join('data',
                                                     'abalone_test.csv'),
                                        target='Type',
                                        drop_columns=drop_cols)

    # for testing only
    x_train["large_ring"] = (x_train["Rings"] > 10).astype(int)
    x_test["large_ring"] = (x_test["Rings"] > 10).astype(int)

    # MODEL 1: Baseline model
    # Use best parameters from a model selection and threshold tuning process
    best_regularizer = 1e-1
    best_th = 0.43
    model = utils.train_log_reg_model(x_train,
                                      y_train,
                                      seed=0,
                                      C=best_regularizer,
                                      upsample=False,
                                      verbose=True)
    model_name = "logreg_model"
    model_type = ModelTypes.LINEAR

    # TODO - Optional: Switch to random forest model
    # # MODEL 2: RANDOM FOREST
    # # Uses default threshold of 0.5 and model parameters
    # best_th = 0.5
    # model = utils.train_rf_model(x_train, y_train, seed=0, upsample=True, verbose=True)
    # model_name = "randomforest_model"
    # model_type = ModelTypes.TREE

    # # TODO - Optional: Switch to catboost model
    # # MODEL 3: CATBOOST
    # # Uses default threshold of 0.5 and model parameters
    # best_th = 0.5
    # model = utils.train_catboost_model(x_train, y_train, seed=0, upsample=True, verbose=True)
    # model_name = "catboost_model"
    # model_type = ModelTypes.TREE

    # Compute explainability and fairness metrics
    # TODO - Optional: can you find a way to save these outputs as artefacts in pickle form?
    (
        shap_values,
        base_shap_values,
        global_explainability,
        fairness_metrics,
    ) = compute_log_metrics(model=model,
                            x_train=x_train,
                            x_test=x_test,
                            y_test=y_test,
                            best_th=best_th,
                            model_name=model_name,
                            model_type=model_type)

    # TODO - Save the model artefact by filling in the blanks
    # So that the model is viewable on the Bedrock UI
    # Hint: fill in the file path that has been defined as a constant above
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        pickle.dump(model, model_file)

    # IMPORTANT: LOG TRAINING MODEL ON UI to compare to DEPLOYED MODEL
    #train_prob = model.predict_proba(x_train)[:, 1]
    train_pred = model.predict(x_train)

    # Add the Model Monitoring Service and export the metrics
    ModelMonitoringService.export_text(
        features=x_train.items(),
        inference=train_pred.tolist(),
    )

    print("Done!")