def main(): """Train pipeline""" model_data = pd.read_csv(FEATURES_DATA) print("\tSplitting train and validation data") x_train, x_val, y_train, y_val = train_test_split( model_data[FEATURE_COLS], model_data[TARGET_COL], test_size=0.2, ) print("\tTrain model") clf = lgb.LGBMClassifier( num_leaves=NUM_LEAVES, learning_rate=LR, n_estimators=N_ESTIMATORS, ) clf.fit(x_train, y_train) compute_log_metrics(clf, x_val, y_val) print("\tComputing metrics") selected = np.random.choice(model_data.shape[0], size=1000, replace=False) features = model_data[FEATURE_COLS].iloc[selected] inference = clf.predict_proba(features)[:, 1] ModelMonitoringService.export_text( features=features.iteritems(), inference=inference.tolist(), ) print("\tSaving model") with open("/artefact/" + OUTPUT_MODEL_NAME, "wb") as model_file: pickle.dump(clf, model_file)
import pickle
import time
from shutil import copyfile

import numpy as np
from bedrock_client.bedrock.metrics.service import ModelMonitoringService
from sklearn.model_selection import train_test_split

# load_data, get_feats_to_use, get_model, compute_log_metrics, TMP_BUCKET,
# TARGET, OUTPUT_MODEL_PATH and FEATURE_COLS_PATH are assumed to be defined
# elsewhere in this module.


def trainer(execution_date):
    """Entry point to perform training."""
    print("\nLoad train data")
    data = load_data(TMP_BUCKET + "credit_train/train.csv")
    data = data.fillna(0)
    print(" Train data shape:", data.shape)

    feature_cols = get_feats_to_use()
    train, valid = train_test_split(data, test_size=0.2, random_state=0)
    x_train = train[feature_cols]
    y_train = train[TARGET].values
    x_valid = valid[feature_cols]
    y_valid = valid[TARGET].values

    print("\nTrain model")
    start = time.time()
    clf = get_model()
    # Note: lightgbm>=4.0 moved verbose/early_stopping_rounds into callbacks
    # (lgb.log_evaluation, lgb.early_stopping); the keywords below are for 3.x.
    clf.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_valid, y_valid)],
            eval_metric="auc",
            verbose=200,
            early_stopping_rounds=200)
    print(" Time taken = {:.0f} s".format(time.time() - start))

    print("\nEvaluate")
    compute_log_metrics(clf, x_valid, y_valid)

    print("\nLog model monitoring metrics")
    start = time.time()
    selected = np.random.choice(x_train.shape[0], size=2000, replace=False)
    features = x_train.iloc[selected]
    inference = clf.predict_proba(features)[:, 1]
    ModelMonitoringService.export_text(
        # .items() replaces DataFrame.iteritems(), removed in pandas 2.0
        features=features.items(),
        inference=inference.tolist(),
    )
    print(" Time taken = {:.0f} s".format(time.time() - start))

    print("\nSave model")
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        pickle.dump(clf, model_file)

    # Save feature names
    with open(FEATURE_COLS_PATH, "wb") as file:
        pickle.dump(feature_cols, file)

    # To simulate redis, save to artefact
    copyfile("data/test.gz.parquet", "/artefact/test.gz.parquet")
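# A minimal sketch (not from the original pipeline) of how the artefacts saved
# above might be restored at serving time. load_artefacts and new_data are
# illustrative names, and the two *_PATH constants are the same ones assumed
# to be defined elsewhere.
import pickle


def load_artefacts():
    """Restore the trained classifier and the list of feature columns."""
    with open(OUTPUT_MODEL_PATH, "rb") as model_file:
        clf = pickle.load(model_file)
    with open(FEATURE_COLS_PATH, "rb") as cols_file:
        feature_cols = pickle.load(cols_file)
    return clf, feature_cols


# Hypothetical usage: score a new batch with the same columns used in training.
# clf, feature_cols = load_artefacts()
# probs = clf.predict_proba(new_data[feature_cols])[:, 1]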
import pickle

from bedrock_client.bedrock.metrics.service import ModelMonitoringService
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


def main():
    bunch = load_diabetes()
    X_train, X_test, Y_train, Y_test = train_test_split(bunch.data, bunch.target)

    model = LinearRegression()
    model.fit(X_train, Y_train)
    print("Score: %.2f" % model.score(X_test, Y_test))

    with open("./model.pkl", "wb") as f:
        pickle.dump(model, f)

    # Pair each feature name with its column of training values
    features = zip(bunch.feature_names, X_train.T)
    # features = [("age", [33, 23, 54, ...]), ("sex", [0, 1, 0]), ...]
    ModelMonitoringService.export_text(
        features=features,
        path="./histogram.prom",
    )
import os
import pickle

import numpy as np
from bedrock_client.bedrock.metrics.service import ModelMonitoringService

# utils, ModelTypes, compute_log_metrics and OUTPUT_MODEL_PATH are assumed to
# be defined or imported elsewhere in this project.


def main():
    # Extraneous columns (as might be determined through feature selection)
    drop_cols = ['ID']

    # Load into Dataframes
    # x_<name> : features
    # y_<name> : labels
    x_train, y_train = utils.load_dataset(
        os.path.join('data', 'creditdata', 'creditdata_train_v2.csv'),
        drop_columns=drop_cols)
    x_test, y_test = utils.load_dataset(
        os.path.join('data', 'creditdata', 'creditdata_test_v2.csv'),
        drop_columns=drop_cols)

    # MODEL 1: LOGISTIC REGRESSION
    # Use best parameters from a model selection and threshold tuning process
    best_regularizer = 1e-1
    best_th = 0.43
    model = utils.train_log_reg_model(x_train,
                                      y_train,
                                      seed=0,
                                      C=best_regularizer,
                                      upsample=True,
                                      verbose=True)
    model_name = "logreg_model"
    model_type = ModelTypes.LINEAR

    # TODO - Optional: Uncomment this later
    # MODEL 2: RANDOM FOREST
    # Uses default threshold of 0.5 and model parameters
    # best_th = 0.5
    # model = utils.train_rf_model(x_train, y_train, seed=0, upsample=True, verbose=True)
    # model_name = "randomforest_model"
    # model_type = ModelTypes.TREE

    # If model is in an sklearn pipeline, extract it
    (
        shap_values,
        base_shap_values,
        global_explainability,
        fairness_metrics,
    ) = compute_log_metrics(model=model,
                            x_train=x_train,
                            x_test=x_test,
                            y_test=y_test,
                            best_th=best_th,
                            model_name=model_name,
                            model_type=model_type)

    # TODO - Save the model artefact by filling in the blanks
    # so that the model is viewable on the Bedrock UI
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        pickle.dump(model, model_file)

    # IMPORTANT: LOG TRAINING MODEL ON UI to compare to DEPLOYED MODEL
    train_prob = model.predict_proba(x_train)[:, 1]
    train_pred = np.where(train_prob > best_th, 1, 0)

    # Add the Model Monitoring Service and export the metrics
    ModelMonitoringService.export_text(
        # .items() replaces DataFrame.iteritems(), removed in pandas 2.0
        features=x_train.items(),
        inference=train_pred.tolist(),
    )
    print("Done!")
from bedrock_client.bedrock.metrics.service import ModelMonitoringService
from flask import current_app


def init_background_threads():
    """Global objects with daemon threads will be stopped by gunicorn's
    --preload flag, so instantiate them here instead.
    """
    # Initialise the Bedrock Model Monitoring Service
    current_app.monitor = ModelMonitoringService()
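# A minimal sketch (the wiring is assumed, not shown above) of how
# init_background_threads() might be registered so that it runs after gunicorn
# forks its workers rather than in the preloaded master process. Flask's
# before_first_request hook, available in the Flask versions contemporary with
# this code (it was removed in Flask 2.3), defers the call until each worker
# handles its first request:
from flask import Flask

app = Flask(__name__)
app.before_first_request(init_background_threads)

# Request handlers can then reach the service as current_app.monitor, e.g.
# pid = current_app.monitor.log_prediction(...)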
import pickle

from bedrock_client.bedrock.metrics.service import ModelMonitoringService
from bedrock_client.bedrock.metrics.collector import BaselineMetricCollector
from flask import Flask, request

with open("./model.pkl", "rb") as f:
    model = pickle.load(f)

monitor = ModelMonitoringService(
    baseline_collector=BaselineMetricCollector(path="./histogram.prom"))

app = Flask(__name__)


@app.route("/", methods=["POST"])
def predict():
    features = request.json
    score = model.predict([features])[0]
    pid = monitor.log_prediction(
        request_body=request.data,
        features=features,
        output=score,
    )
    return {"result": score, "prediction_id": pid}


@app.route("/metrics", methods=["GET"])
def metrics():
    return monitor.export_http()[0]
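# A hedged usage example, not part of the app above: with the server running
# locally (port 5000 and the ten-feature payload are illustrative assumptions
# matching the diabetes model), POST a feature vector and then scrape the
# Prometheus metrics endpoint.
import requests

features = [0.03, 0.05, 0.06, 0.02, -0.04, -0.03, -0.04, -0.002, 0.02, -0.02]
resp = requests.post("http://localhost:5000/", json=features)
print(resp.json())  # e.g. {"result": 150.3, "prediction_id": "..."}

metrics_text = requests.get("http://localhost:5000/metrics").text
print(metrics_text[:200])  # Prometheus text exposition format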
import os
import pickle

from bedrock_client.bedrock.metrics.service import ModelMonitoringService

# utils, ModelTypes, compute_log_metrics and OUTPUT_MODEL_PATH are assumed to
# be defined or imported elsewhere in this project.


def main():
    # Extraneous columns (as might be determined through feature selection)
    drop_cols = []

    # Load into Dataframes
    # x_<name> : features
    # y_<name> : labels
    x_train, y_train = utils.load_dataset(
        os.path.join('data', 'abalone_train.csv'),
        target='Type',
        drop_columns=drop_cols)
    x_test, y_test = utils.load_dataset(
        os.path.join('data', 'abalone_test.csv'),
        target='Type',
        drop_columns=drop_cols)

    # for testing only
    x_train["large_ring"] = (x_train["Rings"] > 10).astype(int)
    x_test["large_ring"] = (x_test["Rings"] > 10).astype(int)

    # MODEL 1: Baseline model
    # Use best parameters from a model selection and threshold tuning process
    best_regularizer = 1e-1
    best_th = 0.43
    model = utils.train_log_reg_model(x_train,
                                      y_train,
                                      seed=0,
                                      C=best_regularizer,
                                      upsample=False,
                                      verbose=True)
    model_name = "logreg_model"
    model_type = ModelTypes.LINEAR

    # TODO - Optional: Switch to random forest model
    # # MODEL 2: RANDOM FOREST
    # # Uses default threshold of 0.5 and model parameters
    # best_th = 0.5
    # model = utils.train_rf_model(x_train, y_train, seed=0, upsample=True, verbose=True)
    # model_name = "randomforest_model"
    # model_type = ModelTypes.TREE

    # TODO - Optional: Switch to catboost model
    # # MODEL 3: CATBOOST
    # # Uses default threshold of 0.5 and model parameters
    # best_th = 0.5
    # model = utils.train_catboost_model(x_train, y_train, seed=0, upsample=True, verbose=True)
    # model_name = "catboost_model"
    # model_type = ModelTypes.TREE

    # Compute explainability and fairness metrics
    # TODO - Optional: can you find a way to save these outputs as artefacts in pickle form?
    (
        shap_values,
        base_shap_values,
        global_explainability,
        fairness_metrics,
    ) = compute_log_metrics(model=model,
                            x_train=x_train,
                            x_test=x_test,
                            y_test=y_test,
                            best_th=best_th,
                            model_name=model_name,
                            model_type=model_type)

    # TODO - Save the model artefact by filling in the blanks
    # so that the model is viewable on the Bedrock UI
    # Hint: fill in the file path that has been defined as a constant above
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        pickle.dump(model, model_file)

    # IMPORTANT: LOG TRAINING MODEL ON UI to compare to DEPLOYED MODEL
    # train_prob = model.predict_proba(x_train)[:, 1]
    train_pred = model.predict(x_train)

    # Add the Model Monitoring Service and export the metrics
    ModelMonitoringService.export_text(
        # .items() replaces DataFrame.iteritems(), removed in pandas 2.0
        features=x_train.items(),
        inference=train_pred.tolist(),
    )
    print("Done!")