def score(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    # For demo purposes we read data from Vantage, but in a real environment
    # it can be anything that pyspark can read (csv, parquet, avro, etc...)
    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    predict_df = DataFrame(data_conf["table"])

    # convert to pandas to use locally
    predict_df = predict_df.to_pandas()

    # do feature engineering / joins in spark here, or whatever else you're using pyspark for...

    print("Scoring")
    y_pred = model.predict(predict_df[model.feature_names])
    print("Finished Scoring")

    # create result dataframe
    y_pred = pd.DataFrame(y_pred, columns=["pred"])

    # wrap as pyspark df
    predictions = spark.createDataFrame(y_pred)

    # in the real world you would write the results back to HDFS, Teradata, S3, etc.
    predictions.write.mode("overwrite").save("/tmp/predictions")

    logging.info("Finished Saving Scoring")
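# ---------------------------------------------------------------------------
# Assumed imports/setup for the pyspark scoring variant above. This is only a
# sketch of what the function appears to rely on; the demo module defines its
# own imports, and the SparkSession handle ("spark") may be provided by the
# execution environment rather than created here.
import os
import logging

import joblib
import pandas as pd
from pyspark.sql import SparkSession
from teradataml import create_context, DataFrame

# if no SparkSession is injected by the runtime, create (or reuse) one
spark = SparkSession.builder.appName("pima-scoring").getOrCreate()
# ---------------------------------------------------------------------------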
def score(data_conf, model_conf, **kwargs): model = joblib.load("artifacts/input/model.joblib") create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) predict_df = DataFrame(data_conf["table"]) # convert to pandas to use locally predict_df = predict_df.to_pandas() print("Scoring") y_pred = model.predict(predict_df[model.feature_names]) print("Finished Scoring") # create result dataframe and store in Teradata y_pred = pd.DataFrame(y_pred, columns=["pred"]) y_pred["PatientId"] = predict_df["PatientId"].values copy_to_sql(df=y_pred, table_name=data_conf["predictions"], index=False, if_exists="replace")
def evaluate(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    # Read test dataset from Teradata.
    # As this is for demo purposes, we simulate the test dataset changing between executions
    # by introducing a random sample. Note that the sampling is performed in Teradata!
    test_df = DataFrame(data_conf["table"]).sample(frac=0.8)
    test_pdf = test_df.to_pandas()

    X_test = test_pdf[model.feature_names]
    y_test = test_pdf[model.target_name]

    print("Scoring")
    y_pred = model.predict(test_pdf[model.feature_names])

    evaluation = {
        'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),
        'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),
        'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),
        'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))
    }

    with open("artifacts/output/metrics.json", "w+") as f:
        json.dump(evaluation, f)

    metrics.plot_confusion_matrix(model, X_test, y_test)
    save_plot('Confusion Matrix')

    metrics.plot_roc_curve(model, X_test, y_test)
    save_plot('ROC Curve')

    # xgboost has its own feature importance plot support, but let's use shap as an explainability example
    import shap

    shap_explainer = shap.TreeExplainer(model['xgb'])
    shap_values = shap_explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, feature_names=model.feature_names,
                      show=False, plot_size=(12, 8), plot_type='bar')
    save_plot('SHAP Feature Importance')

    feature_importance = pd.DataFrame(list(zip(model.feature_names, np.abs(shap_values).mean(0))),
                                      columns=['col_name', 'feature_importance_vals'])
    feature_importance = feature_importance.set_index("col_name").T.to_dict(orient='records')[0]

    stats.record_stats(test_df,
                       features=model.feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
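# ---------------------------------------------------------------------------
# The evaluation code above calls a save_plot() helper that is not shown here.
# A minimal sketch of what such a helper could look like, assuming it simply
# titles the current matplotlib figure and writes it into artifacts/output/
# (the actual helper in the demo repository may differ):
import matplotlib.pyplot as plt


def save_plot(title):
    plt.title(title)
    fig = plt.gcf()
    filename = title.replace(" ", "_").lower()
    fig.savefig("artifacts/output/{}.png".format(filename), dpi=300)
    plt.clf()
# ---------------------------------------------------------------------------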
def train(data_conf, model_conf, **kwargs): hyperparams = model_conf["hyperParameters"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age"] target_name = "HasDiabetes" # read training dataset from Teradata and convert to pandas train_df = DataFrame(data_conf["table"]) train_df = train_df.select([feature_names + [target_name]]) train_pdf = train_df.to_pandas() # split data into X and y X_train = train_pdf.drop(target_name, 1) y_train = train_pdf[target_name] print("Starting training...") # fit model to training data model = Pipeline([('scaler', MinMaxScaler()), ('xgb', XGBClassifier(eta=hyperparams["eta"], max_depth=hyperparams["max_depth"]))]) # xgboost saves feature names but lets store on pipeline for easy access later model.feature_names = feature_names model.target_name = target_name model.fit(X_train, y_train) print("Finished training") # export model artefacts joblib.dump(model, "artifacts/output/model.joblib") # we can also save as pmml so it can be used for In-Vantage scoring etc. xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name, pmml_f_name="artifacts/output/model.pmml") print("Saved trained model") from xgboost import plot_importance model["xgb"].get_booster().feature_names = feature_names plot_importance(model["xgb"].get_booster(), max_num_features=10) save_plot("feature_importance.png") feature_importance = model["xgb"].get_booster().get_score(importance_type="weight") stats.record_stats(train_df, features=feature_names, predictors=["HasDiabetes"], categorical=["HasDiabetes"], importance=feature_importance, category_labels={"HasDiabetes": {0: "false", 1: "true"}})
def evaluate(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    # Read test dataset from Teradata.
    # As this is for demo purposes, we simulate the test dataset changing between executions
    # by introducing a random sample. Note that the sampling is performed in Teradata!
    test_df = DataFrame(data_conf["table"]).sample(frac=0.8)
    test_df = test_df.to_pandas()

    X_test = test_df[model.feature_names]
    y_test = test_df[model.target_name]

    print("Scoring")
    y_pred = model.predict(test_df[model.feature_names])

    evaluation = {
        'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),
        'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),
        'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),
        'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))
    }

    with open("artifacts/output/metrics.json", "w+") as f:
        json.dump(evaluation, f)

    metrics.plot_confusion_matrix(model, X_test, y_test)
    save_plot('Confusion Matrix')

    metrics.plot_roc_curve(model, X_test, y_test)
    save_plot('ROC Curve')

    # xgboost has its own feature importance plot support, but let's use shap as an explainability example
    import shap

    shap_explainer = shap.TreeExplainer(model['xgb'])
    shap_values = shap_explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, feature_names=model.feature_names,
                      show=False, plot_size=(12, 8), plot_type='bar')
    save_plot('SHAP Feature Importance')
def train(data_conf, model_conf, **kwargs): hyperparams = model_conf["hyperParameters"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"]) feature_names = [ "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age" ] target_name = "HasDiabetes" # read training dataset from Teradata and convert to pandas train_df = DataFrame(data_conf["table"]) train_df = train_df.select([feature_names + [target_name]]) train_df = train_df.to_pandas() # split data into X and y X_train = train_df.drop(target_name, 1) y_train = train_df[target_name] print("Starting training...") # fit model to training data model = Pipeline([('scaler', MinMaxScaler()), ('xgb', XGBClassifier(eta=hyperparams["eta"], max_depth=hyperparams["max_depth"]))]) # xgboost saves feature names but lets store on pipeline for easy access later model.feature_names = feature_names model.target_name = target_name model.fit(X_train, y_train) print("Finished training") # export model artefacts joblib.dump(model, "artifacts/output/model.joblib") # we can also save as pmml so it can be used for In-Vantage scoring etc. xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name, pmml_f_name="artifacts/output/model.pmml") print("Saved trained model")
def evaluate(data_conf, model_conf, **kwargs): model_version = kwargs["model_version"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"]) def eval_partition(partition): model_artefact = partition.loc[partition['n_row'] == 1, 'model_artefact'].iloc[0] model = dill.loads(base64.b64decode(model_artefact)) X_test = partition[model.features] y_test = partition[['Y1']] y_pred = model.predict(X_test) partition_id = partition.partition_ID.iloc[0] # record whatever partition level information you want like rows, data stats, metrics, explainability, etc partition_metadata = json.dumps({ "num_rows": partition.shape[0], "metrics": { "MAE": "{:.2f}".format(metrics.mean_absolute_error(y_test, y_pred)), "MSE": "{:.2f}".format(metrics.mean_squared_error(y_test, y_pred)), "R2": "{:.2f}".format(metrics.r2_score(y_test, y_pred)) } }) return np.array( [[partition_id, partition.shape[0], partition_metadata]]) # we join the model artefact to the 1st row of the data table so we can load it in the partition query = f""" SELECT d.*, CASE WHEN n_row=1 THEN m.model_artefact ELSE null END AS model_artefact FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row FROM {data_conf["table"]} x) AS d LEFT JOIN aoa_sto_models m ON d.partition_id = m.partition_id WHERE m.model_version = '{model_version}' """ df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="model_eval") eval_df = df.map_partition(lambda partition: eval_partition(partition), partition_by="partition_id", returns=[["partition_id", "VARCHAR(255)"], ["num_rows", "BIGINT"], ["partition_metadata", "CLOB"]]) # materialize as we reuse result eval_df = DataFrame(eval_df._table_name, materialize=True) save_metadata(eval_df) save_evaluation_metrics(eval_df, ["MAE", "MSE", "R2"]) print("Finished evaluation")
def train(data_conf, model_conf, **kwargs): model_version = kwargs["model_version"] hyperparams = model_conf["hyperParameters"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) cleanup_cli(model_version) def train_partition(partition, model_version, hyperparams): numeric_features = ["X" + str(i) for i in range(1, 10)] for i in numeric_features: partition[i] = partition[i].astype("float") numeric_transformer = Pipeline( steps=[("imputer", SimpleImputer( strategy="median")), ("scaler", RobustScaler()), ("pca", PCA(0.95))]) categorical_features = ["flag"] for i in categorical_features: partition[i] = partition[i].astype("category") categorical_transformer = Pipeline( steps=[("imputer", SimpleImputer(strategy="constant", fill_value=0) ), ("onehot", OneHotEncoder(handle_unknown="ignore"))]) preprocessor = ColumnTransformer( transformers=[("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features)]) features = numeric_features + categorical_features pipeline = Pipeline([ ("preprocessor", preprocessor), ("rf", RandomForestRegressor(max_depth=hyperparams["max_depth"])) ]) pipeline.fit(partition[features], partition[['Y1']]) pipeline.features = features partition_id = partition.partition_ID.iloc[0] artefact = base64.b64encode(dill.dumps(pipeline)) # record whatever partition level information you want like rows, data stats, explainability, etc partition_metadata = json.dumps({ "num_rows": partition.shape[0], "hyper_parameters": hyperparams }) return np.array([[ partition_id, model_version, partition.shape[0], partition_metadata, artefact ]]) print("Starting training...") query = "SELECT * FROM {table} WHERE fold_ID='train'".format( table=data_conf["table"]) df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="model_train") model_df = df.map_partition(lambda partition: train_partition( partition, model_version, hyperparams), partition_by="partition_id", returns=[["partition_id", "VARCHAR(255)"], ["model_version", "VARCHAR(255)"], ["num_rows", "BIGINT"], ["partition_metadata", "CLOB"], ["model_artefact", "CLOB"]]) # materialize as we reuse result model_df = DataFrame(model_df._table_name, materialize=True) # append to models table model_df.to_sql("aoa_sto_models", if_exists="append") save_metadata(model_df) print("Finished training")