def score(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    # For demo purposes we read data from Vantage, but in a real environment
    # it could be anything that PySpark can read (CSV, Parquet, Avro, etc.)
    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf
                   and data_conf["schema"] != "" else None)

    predict_df = DataFrame(data_conf["table"])

    # convert to pandas to use locally
    predict_df = predict_df.to_pandas()

    # do feature engineering / joins in Spark here, or whatever else you're using PySpark for...

    print("Scoring")
    y_pred = model.predict(predict_df[model.feature_names])

    print("Finished Scoring")

    # create result dataframe
    y_pred = pd.DataFrame(y_pred, columns=["pred"])

    # wrap as a PySpark DataFrame (assumes `spark` is an active SparkSession)
    predictions = spark.createDataFrame(y_pred)

    # in the real world you would write the results back to HDFS, Teradata, S3, etc.
    predictions.write.mode("overwrite").save("/tmp/predictions")

    logging.info("Finished saving scoring results")
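
These snippets are shown without their module-level imports. A minimal preamble sketch for Example #1, reconstructed from the names used above (the SparkSession setup is an assumption, not part of the original source; the teradataml imports match the calls in the snippet):

import os
import logging

import joblib
import pandas as pd
from teradataml import DataFrame, create_context
from pyspark.sql import SparkSession

# assumed: `spark` is an active SparkSession in the scoring environment
spark = SparkSession.builder.getOrCreate()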
Example #2
def score(data_conf, model_conf, **kwargs):
    model = joblib.load("artifacts/input/model.joblib")

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf
                   and data_conf["schema"] != "" else None)

    predict_df = DataFrame(data_conf["table"])

    # convert to pandas to use locally
    predict_df = predict_df.to_pandas()

    print("Scoring")
    y_pred = model.predict(predict_df[model.feature_names])

    print("Finished Scoring")

    # create result dataframe and store in Teradata
    y_pred = pd.DataFrame(y_pred, columns=["pred"])
    y_pred["PatientId"] = predict_df["PatientId"].values
    copy_to_sql(df=y_pred,
                table_name=data_conf["predictions"],
                index=False,
                if_exists="replace")
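
For context, the data_conf dictionary these entry points receive carries at least the keys used above ("table", optional "schema", and, for Example #2, "predictions"). An illustrative shape; the table names are placeholders, not from the original source:

# Hypothetical data_conf; keys inferred from usage above, values are placeholders
data_conf = {
    "table": "PATIENT_FEATURES",        # placeholder: source table read via DataFrame(...)
    "schema": "MY_DB",                  # optional; empty or absent falls back to the user's default database
    "predictions": "PATIENT_PREDICTIONS"  # placeholder: target table for copy_to_sql
}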
Example #3
def evaluate(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    # Read test dataset from Teradata
    # As this is for demo purposes, we simulate the test dataset changing between executions
    # by introducing a random sample. Note that the sampling is performed in Teradata!
    test_df = DataFrame(data_conf["table"]).sample(frac=0.8)
    test_pdf = test_df.to_pandas()

    X_test = test_pdf[model.feature_names]
    y_test = test_pdf[model.target_name]

    print("Scoring")
    y_pred = model.predict(test_pdf[model.feature_names])

    evaluation = {
        'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),
        'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),
        'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),
        'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))
    }

    with open("artifacts/output/metrics.json", "w+") as f:
        json.dump(evaluation, f)

    # note: plot_confusion_matrix / plot_roc_curve were removed in scikit-learn 1.2;
    # on newer versions use ConfusionMatrixDisplay / RocCurveDisplay instead
    metrics.plot_confusion_matrix(model, X_test, y_test)
    save_plot('Confusion Matrix')

    metrics.plot_roc_curve(model, X_test, y_test)
    save_plot('ROC Curve')

    # xgboost has its own feature importance plot support, but let's use SHAP as an explainability example
    import shap

    # note: the 'xgb' step was fit on MinMax-scaled features, so for exact
    # attributions you would pass X_test through the pipeline's scaler first
    shap_explainer = shap.TreeExplainer(model['xgb'])
    shap_values = shap_explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, feature_names=model.feature_names,
                      show=False, plot_size=(12, 8), plot_type='bar')
    save_plot('SHAP Feature Importance')

    feature_importance = pd.DataFrame(list(zip(model.feature_names, np.abs(shap_values).mean(0))),
                                      columns=['col_name', 'feature_importance_vals'])
    feature_importance = feature_importance.set_index("col_name").T.to_dict(orient='records')[0]

    stats.record_stats(test_df,
                       features=model.feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
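
save_plot is a helper not shown in these snippets. A plausible minimal stand-in (an assumption, not the actual AOA utility) that titles the current matplotlib figure and writes it to the output artifacts folder:

import matplotlib.pyplot as plt

def save_plot(title):
    # hypothetical stand-in for the save_plot helper used above
    plt.title(title)
    filename = title.lower().replace(" ", "_")
    if not filename.endswith(".png"):
        filename += ".png"
    plt.savefig(f"artifacts/output/{filename}", bbox_inches="tight")
    plt.clf()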
Example #4
def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_pdf = train_df.to_pandas()

    # split data into X and y
    X_train = train_pdf.drop(columns=target_name)  # positional axis args were removed in pandas 2.0
    y_train = train_pdf[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names, but let's store them on the pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as PMML so it can be used for In-Vantage scoring, etc.
    xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")

    from xgboost import plot_importance
    model["xgb"].get_booster().feature_names = feature_names
    plot_importance(model["xgb"].get_booster(), max_num_features=10)
    save_plot("feature_importance.png")

    feature_importance = model["xgb"].get_booster().get_score(importance_type="weight")
    stats.record_stats(train_df,
                       features=feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
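
The hyperParameters lookup at the top of train implies a model_conf of roughly this shape (the values shown are placeholders, not from the original source):

# Hypothetical model_conf; only "hyperParameters" is read by the snippets above
model_conf = {
    "hyperParameters": {
        "eta": 0.2,      # placeholder XGBoost learning rate
        "max_depth": 6   # placeholder tree depth
    }
}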
Example #5
def evaluate(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    # Read test dataset from Teradata
    # As this is for demo purposes, we simulate the test dataset changing between executions
    # by introducing a random sample. Note that the sampling is performed in Teradata!
    test_df = DataFrame(data_conf["table"]).sample(frac=0.8)
    test_df = test_df.to_pandas()

    X_test = test_df[model.feature_names]
    y_test = test_df[model.target_name]

    print("Scoring")
    y_pred = model.predict(test_df[model.feature_names])

    evaluation = {
        'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),
        'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),
        'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),
        'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))
    }

    with open("artifacts/output/metrics.json", "w+") as f:
        json.dump(evaluation, f)

    # note: plot_confusion_matrix / plot_roc_curve were removed in scikit-learn 1.2;
    # on newer versions use ConfusionMatrixDisplay / RocCurveDisplay instead
    metrics.plot_confusion_matrix(model, X_test, y_test)
    save_plot('Confusion Matrix')

    metrics.plot_roc_curve(model, X_test, y_test)
    save_plot('ROC Curve')

    # xgboost has its own feature importance plot support, but let's use SHAP as an explainability example
    import shap

    # note: the 'xgb' step was fit on MinMax-scaled features, so for exact
    # attributions you would pass X_test through the pipeline's scaler first
    shap_explainer = shap.TreeExplainer(model['xgb'])
    shap_values = shap_explainer.shap_values(X_test)

    shap.summary_plot(shap_values,
                      X_test,
                      feature_names=model.feature_names,
                      show=False,
                      plot_size=(12, 8),
                      plot_type='bar')
    save_plot('SHAP Feature Importance')
Example #6
def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_df = train_df.to_pandas()

    # split data into X and y
    X_train = train_df.drop(columns=target_name)  # positional axis args were removed in pandas 2.0
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names, but let's store them on the pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as PMML so it can be used for In-Vantage scoring, etc.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
Example #7
def evaluate(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    def eval_partition(partition):
        model_artefact = partition.loc[partition['n_row'] == 1,
                                       'model_artefact'].iloc[0]
        model = dill.loads(base64.b64decode(model_artefact))

        X_test = partition[model.features]
        y_test = partition[['Y1']]

        y_pred = model.predict(X_test)

        partition_id = partition.partition_ID.iloc[0]

        # record whatever partition-level information you want: row counts, data stats, metrics, explainability, etc.
        partition_metadata = json.dumps({
            "num_rows": partition.shape[0],
            "metrics": {
                "MAE": "{:.2f}".format(metrics.mean_absolute_error(y_test, y_pred)),
                "MSE": "{:.2f}".format(metrics.mean_squared_error(y_test, y_pred)),
                "R2": "{:.2f}".format(metrics.r2_score(y_test, y_pred))
            }
        })

        return np.array([[partition_id, partition.shape[0], partition_metadata]])

    # we join the model artefact to the 1st row of the data table so we can load it in the partition
    query = f"""
    SELECT d.*, CASE WHEN n_row=1 THEN m.model_artefact ELSE null END AS model_artefact 
        FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row FROM {data_conf["table"]} x) AS d
        LEFT JOIN aoa_sto_models m
        ON d.partition_id = m.partition_id
        WHERE m.model_version = '{model_version}'
    """

    df = DistDataFrame(query=query,
                       dist_mode=DistMode.STO,
                       sto_id="model_eval")
    eval_df = df.map_partition(lambda partition: eval_partition(partition),
                               partition_by="partition_id",
                               returns=[["partition_id", "VARCHAR(255)"],
                                        ["num_rows", "BIGINT"],
                                        ["partition_metadata", "CLOB"]])

    # materialize as we reuse result
    eval_df = DataFrame(eval_df._table_name, materialize=True)

    save_metadata(eval_df)
    save_evaluation_metrics(eval_df, ["MAE", "MSE", "R2"])

    print("Finished evaluation")
Example #8
def train(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf
                   and data_conf["schema"] != "" else None)

    cleanup_cli(model_version)

    def train_partition(partition, model_version, hyperparams):
        numeric_features = ["X" + str(i) for i in range(1, 10)]
        for i in numeric_features:
            partition[i] = partition[i].astype("float")

        numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median")),
                                              ("scaler", RobustScaler()),
                                              ("pca", PCA(0.95))])

        categorical_features = ["flag"]
        for i in categorical_features:
            partition[i] = partition[i].astype("category")

        categorical_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="constant", fill_value=0)),
                                                  ("onehot", OneHotEncoder(handle_unknown="ignore"))])

        preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                                       ("cat", categorical_transformer, categorical_features)])

        features = numeric_features + categorical_features
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("rf", RandomForestRegressor(max_depth=hyperparams["max_depth"]))
        ])
        pipeline.fit(partition[features], partition[['Y1']])
        pipeline.features = features

        partition_id = partition.partition_ID.iloc[0]
        artefact = base64.b64encode(dill.dumps(pipeline))

        # record whatever partition-level information you want: row counts, data stats, explainability, etc.
        partition_metadata = json.dumps({
            "num_rows": partition.shape[0],
            "hyper_parameters": hyperparams
        })

        return np.array([[partition_id, model_version, partition.shape[0],
                          partition_metadata, artefact]])

    print("Starting training...")

    query = "SELECT * FROM {table} WHERE fold_ID='train'".format(
        table=data_conf["table"])
    df = DistDataFrame(query=query,
                       dist_mode=DistMode.STO,
                       sto_id="model_train")
    model_df = df.map_partition(lambda partition: train_partition(partition, model_version, hyperparams),
                                partition_by="partition_id",
                                returns=[["partition_id", "VARCHAR(255)"],
                                         ["model_version", "VARCHAR(255)"],
                                         ["num_rows", "BIGINT"],
                                         ["partition_metadata", "CLOB"],
                                         ["model_artefact", "CLOB"]])
    # materialize as we reuse result
    model_df = DataFrame(model_df._table_name, materialize=True)

    # append to models table
    model_df.to_sql("aoa_sto_models", if_exists="append")

    save_metadata(model_df)

    print("Finished training")