Example #1
def score(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    def score_partition(partition):
        model_artefact = partition.loc[partition['n_row'] == 1,
                                       'model_artefact'].iloc[0]
        model = dill.loads(base64.b64decode(model_artefact))

        X = partition[model.features]

        return model.predict(X)

    # we join the model artefact to the 1st row of the data table so we can load it in the partition
    query = f"""
    SELECT d.*, CASE WHEN n_row=1 THEN m.model_artefact ELSE null END AS model_artefact 
        FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row FROM {data_conf["table"]} x) AS d
        LEFT JOIN aoa_sto_models m
        ON d.partition_id = m.partition_id
        WHERE m.model_version = '{model_version}'
    """

    df = DistDataFrame(query=query,
                       dist_mode=DistMode.STO,
                       sto_id="my_model_score")
    scored_df = df.map_partition(lambda partition: score_partition(partition),
                                 partition_by="partition_id",
                                 returns=[["prediction", "VARCHAR(255)"]])

    scored_df.to_sql(data_conf["predictions"], if_exists="append")
Example #2
def score(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    # For demo purposes we read data from Vantage, but in a real environment
    # it can be anything that pyspark can read (csv, parquet, avro, etc...)
    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf
                   and data_conf["schema"] != "" else None)

    predict_df = DataFrame(data_conf["table"])

    # convert to pandas to use locally
    predict_df = predict_df.to_pandas()

    # do feature engineering / joins in Spark, or whatever else you're using pyspark for...

    print("Scoring")
    y_pred = model.predict(predict_df[model.feature_names])

    print("Finished Scoring")

    # create result dataframe
    y_pred = pd.DataFrame(y_pred, columns=["pred"])

    # wrap as pyspark df
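    # NOTE: `spark` is assumed to be a SparkSession created elsewhere in this
    # script (e.g. SparkSession.builder.getOrCreate()); it is not defined in this snippet.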
    predictions = spark.createDataFrame(y_pred)

    # in the real world you would write the results back to HDFS, Teradata, S3, etc.
    predictions.write.mode("overwrite").save("/tmp/predictions")

    logging.info("Finished Saving Scoring")
Example #3
def score(data_conf, model_conf, **kwargs):
    model = joblib.load("artifacts/input/model.joblib")

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf
                   and data_conf["schema"] != "" else None)

    predict_df = DataFrame(data_conf["table"])

    # convert to pandas to use locally
    predict_df = predict_df.to_pandas()

    print("Scoring")
    y_pred = model.predict(predict_df[model.feature_names])

    print("Finished Scoring")

    # create result dataframe and store in Teradata
    y_pred = pd.DataFrame(y_pred, columns=["pred"])
    y_pred["PatientId"] = predict_df["PatientId"].values
    copy_to_sql(df=y_pred,
                table_name=data_conf["predictions"],
                index=False,
                if_exists="replace")
Example #4
def evaluate(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    def eval_partition(partition):
        model_artefact = partition.loc[partition['n_row'] == 1,
                                       'model_artefact'].iloc[0]
        model = dill.loads(base64.b64decode(model_artefact))

        X_test = partition[model.features]
        y_test = partition[['Y1']]

        y_pred = model.predict(X_test)

        partition_id = partition.partition_ID.iloc[0]

        # record whatever partition level information you want like rows, data stats, metrics, explainability, etc
        partition_metadata = json.dumps({
            "num_rows": partition.shape[0],
            "metrics": {
                "MAE":
                "{:.2f}".format(metrics.mean_absolute_error(y_test, y_pred)),
                "MSE":
                "{:.2f}".format(metrics.mean_squared_error(y_test, y_pred)),
                "R2": "{:.2f}".format(metrics.r2_score(y_test, y_pred))
            }
        })

        return np.array(
            [[partition_id, partition.shape[0], partition_metadata]])

    # we join the model artefact to the 1st row of the data table so we can load it in the partition
    query = f"""
    SELECT d.*, CASE WHEN n_row=1 THEN m.model_artefact ELSE null END AS model_artefact 
        FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row FROM {data_conf["table"]} x) AS d
        LEFT JOIN aoa_sto_models m
        ON d.partition_id = m.partition_id
        WHERE m.model_version = '{model_version}'
    """

    df = DistDataFrame(query=query,
                       dist_mode=DistMode.STO,
                       sto_id="model_eval")
    eval_df = df.map_partition(lambda partition: eval_partition(partition),
                               partition_by="partition_id",
                               returns=[["partition_id", "VARCHAR(255)"],
                                        ["num_rows", "BIGINT"],
                                        ["partition_metadata", "CLOB"]])

    # materialize as we reuse result
    eval_df = DataFrame(eval_df._table_name, materialize=True)

    save_metadata(eval_df)
    save_evaluation_metrics(eval_df, ["MAE", "MSE", "R2"])

    print("Finished evaluation")
Example #5
def evaluate(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    # Read test dataset from Teradata
    # As this is for demo purposes, we simulate the test dataset changing between executions
    # by introducing a random sample. Note that the sampling is performed in Teradata!
    test_df = DataFrame(data_conf["table"]).sample(frac=0.8)
    test_pdf = test_df.to_pandas()

    X_test = test_pdf[model.feature_names]
    y_test = test_pdf[model.target_name]

    print("Scoring")
    y_pred = model.predict(test_pdf[model.feature_names])

    evaluation = {
        'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),
        'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),
        'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),
        'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))
    }

    with open("artifacts/output/metrics.json", "w+") as f:
        json.dump(evaluation, f)

    metrics.plot_confusion_matrix(model, X_test, y_test)
    save_plot('Confusion Matrix')

    metrics.plot_roc_curve(model, X_test, y_test)
    save_plot('ROC Curve')

    # xgboost has its own feature importance plot support but lets use shap as explainability example
    import shap

    shap_explainer = shap.TreeExplainer(model['xgb'])
    shap_values = shap_explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, feature_names=model.feature_names,
                      show=False, plot_size=(12, 8), plot_type='bar')
    save_plot('SHAP Feature Importance')

    feature_importance = pd.DataFrame(list(zip(model.feature_names, np.abs(shap_values).mean(0))),
                                      columns=['col_name', 'feature_importance_vals'])
    feature_importance = feature_importance.set_index("col_name").T.to_dict(orient='records')[0]

    stats.record_stats(test_df,
                       features=model.feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
Example #6
def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None)

    feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age"]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_pdf = train_df.to_pandas()

    # split data into X and y
    X_train = train_pdf.drop(columns=[target_name])
    y_train = train_pdf[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb', XGBClassifier(eta=hyperparams["eta"],
                                            max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")

    from xgboost import plot_importance
    model["xgb"].get_booster().feature_names = feature_names
    plot_importance(model["xgb"].get_booster(), max_num_features=10)
    save_plot("feature_importance.png")

    feature_importance = model["xgb"].get_booster().get_score(importance_type="weight")
    stats.record_stats(train_df,
                       features=feature_names,
                       predictors=["HasDiabetes"],
                       categorical=["HasDiabetes"],
                       importance=feature_importance,
                       category_labels={"HasDiabetes": {0: "false", 1: "true"}})
Example #7
def connect(**kwargs):
    """Create connection"""
    if connected():
        td.remove_context()

    try:
        td.create_context(**kwargs)
    except Exception:
        # something failed, try again
        td.create_context(**kwargs)
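
# Hypothetical usage sketch (not part of the original snippet): `connect` simply
# forwards its keyword arguments to td.create_context, so it takes the usual
# connection parameters; the values below are placeholders.
connect(host="<HOSTNAME>", username="<UID>", password="<PWD>")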
Example #8
def evaluate(data_conf, model_conf, **kwargs):
    model = joblib.load('artifacts/input/model.joblib')

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    # Read test dataset from Teradata
    # As this is for demo purposes, we simulate the test dataset changing between executions
    # by introducing a random sample. Note that the sampling is performed in Teradata!
    test_df = DataFrame(data_conf["table"]).sample(frac=0.8)
    test_df = test_df.to_pandas()

    X_test = test_df[model.feature_names]
    y_test = test_df[model.target_name]

    print("Scoring")
    y_pred = model.predict(test_df[model.feature_names])

    evaluation = {
        'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),
        'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),
        'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),
        'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))
    }

    with open("artifacts/output/metrics.json", "w+") as f:
        json.dump(evaluation, f)

    metrics.plot_confusion_matrix(model, X_test, y_test)
    save_plot('Confusion Matrix')

    metrics.plot_roc_curve(model, X_test, y_test)
    save_plot('ROC Curve')

    # xgboost has its own feature importance plot support but lets use shap as explainability example
    import shap

    shap_explainer = shap.TreeExplainer(model['xgb'])
    shap_values = shap_explainer.shap_values(X_test)

    shap.summary_plot(shap_values,
                      X_test,
                      feature_names=model.feature_names,
                      show=False,
                      plot_size=(12, 8),
                      plot_type='bar')
    save_plot('SHAP Feature Importance')
Example #9
def load_in_batches(dataframe, tableName, totalBatches):
    try:
        con = create_context(host='td........org',
                             user=credentials.short_name_usr_ntid,
                             password=credentials.user_ntid_pswrd,
                             database='user_dw',
                             logmech="LDAP")

        total_rows = len(dataframe.index)
        print('total_rows', total_rows)

        def split_equal(value, parts):
            value = float(value)
            return [int(i * value / parts) for i in range(1, parts + 1)]

        index_list = split_equal(total_rows, totalBatches)

        i = 0
        for j in index_list:
            print('Inserting Dataframe', 'df[' + str(i) + ':' + str(j) + ']')
            rows = dataframe[i:j]
            copy_to_sql(rows, table_name=tableName, if_exists='append')
            # df[i:j] excludes row j, so the next batch must start at j (not j + 1)
            i = j

    finally:
        remove_context()  # if connection not closed the script will hang
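
# Hypothetical usage sketch (not part of the original snippet): split a CSV's
# rows into 10 roughly equal batches and append them to a staging table. The
# file and table names are placeholders.
import pandas as pd

input_df = pd.read_csv("input_data.csv")
load_in_batches(input_df, tableName="stg_input_data", totalBatches=10)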
Example #10
def train(data_conf, model_conf, **kwargs):
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    feature_names = [
        "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns",
        "BMI", "DiPedFunc", "Age"
    ]
    target_name = "HasDiabetes"

    # read training dataset from Teradata and convert to pandas
    train_df = DataFrame(data_conf["table"])
    train_df = train_df.select([feature_names + [target_name]])
    train_df = train_df.to_pandas()

    # split data into X and y
    X_train = train_df.drop(columns=[target_name])
    y_train = train_df[target_name]

    print("Starting training...")

    # fit model to training data
    model = Pipeline([('scaler', MinMaxScaler()),
                      ('xgb',
                       XGBClassifier(eta=hyperparams["eta"],
                                     max_depth=hyperparams["max_depth"]))])
    # xgboost saves feature names but lets store on pipeline for easy access later
    model.feature_names = feature_names
    model.target_name = target_name

    model.fit(X_train, y_train)

    print("Finished training")

    # export model artefacts
    joblib.dump(model, "artifacts/output/model.joblib")

    # we can also save as pmml so it can be used for In-Vantage scoring etc.
    xgboost_to_pmml(pipeline=model,
                    col_names=feature_names,
                    target_name=target_name,
                    pmml_f_name="artifacts/output/model.pmml")

    print("Saved trained model")
Example #11
def train(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"] if "schema" in data_conf
                   and data_conf["schema"] != "" else None)

    cleanup_cli(model_version)

    def train_partition(partition, model_version, hyperparams):
        numeric_features = ["X" + str(i) for i in range(1, 10)]
        for i in numeric_features:
            partition[i] = partition[i].astype("float")

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", RobustScaler()),
            ("pca", PCA(0.95))
        ])

        categorical_features = ["flag"]
        for i in categorical_features:
            partition[i] = partition[i].astype("category")

        categorical_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features)
        ])

        features = numeric_features + categorical_features
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("rf", RandomForestRegressor(max_depth=hyperparams["max_depth"]))
        ])
        pipeline.fit(partition[features], partition[['Y1']])
        pipeline.features = features

        partition_id = partition.partition_ID.iloc[0]
        artefact = base64.b64encode(dill.dumps(pipeline))

        # record whatever partition level information you want like rows, data stats, explainability, etc
        partition_metadata = json.dumps({
            "num_rows": partition.shape[0],
            "hyper_parameters": hyperparams
        })

        return np.array([[
            partition_id, model_version, partition.shape[0],
            partition_metadata, artefact
        ]])

    print("Starting training...")

    query = "SELECT * FROM {table} WHERE fold_ID='train'".format(
        table=data_conf["table"])
    df = DistDataFrame(query=query,
                       dist_mode=DistMode.STO,
                       sto_id="model_train")
    model_df = df.map_partition(lambda partition: train_partition(
        partition, model_version, hyperparams),
                                partition_by="partition_id",
                                returns=[["partition_id", "VARCHAR(255)"],
                                         ["model_version", "VARCHAR(255)"],
                                         ["num_rows", "BIGINT"],
                                         ["partition_metadata", "CLOB"],
                                         ["model_artefact", "CLOB"]])
    # materialize as we reuse result
    model_df = DataFrame(model_df._table_name, materialize=True)

    # append to models table
    model_df.to_sql("aoa_sto_models", if_exists="append")

    save_metadata(model_df)

    print("Finished training")
Example #12
    plt.ylabel(yLabel) if yLabel is not None else plt.ylabel("Count")
    plt.xticks(index, data[xTicksColumnName], rotation=30)
    plt.title(title) if title is not None else plt.title("Bar Plot")
    plt.show()


###
### Connection
###

# Establish connection to Teradata Vantage server (uses the Teradata SQL Driver
# for Python). Before you execute the following statement, replace the variables
# <HOSTNAME>, <UID> and <PWD> with your target Vantage system hostname (or
# IP address), and your database user ID and password, respectively.
td_context = create_context(host="<HOSTNAME>",
                            username="******",
                            password="******")

# Notes and alternatives:
# 1. In any connection function, you can specify for an argument the getpass()
#    function of the Python standard library. First, you will need to execute:
#    import getpass
#    getpass() enables you to type your password secretly during runtime
#    without having to hard-code it in the script.
# Example: Specifying the argument password = getpass.getpass("Password: ")
# 2. To connect to a Vantage system that uses LDAP authentication, specify the
#    additional argument logmech="LDAP" as follows:
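# A minimal sketch combining both notes (hypothetical host and user ID; the
# password is prompted for at runtime instead of being hard-coded):
import getpass

td_context = create_context(host="<HOSTNAME>",
                            username="<UID>",
                            password=getpass.getpass("Password: "),
                            logmech="LDAP")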
Example #13
def evaluate(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]
    model_id = kwargs["model_id"]

    engine = create_context(host=os.environ["AOA_CONN_HOST"],
                            username=os.environ["AOA_CONN_USERNAME"],
                            password=os.environ["AOA_CONN_PASSWORD"])

    cursor = engine.raw_connection().cursor()
    conn = get_connection()

    with open("artifacts/input/model.pmml", "rb") as f:
        model_bytes = f.read()

    # we don't want to insert this into the models that can be used yet so add to temporary table and use there
    cursor.execute("""
    CREATE VOLATILE TABLE ivsm_models_tmp(
        model_version VARCHAR(255),
        model_id VARCHAR(255),
        model BLOB(2097088000)
    ) ON COMMIT PRESERVE ROWS;
    """)
    cursor.execute(
        f"INSERT INTO ivsm_models_tmp(model_version, model_id, model) "
        "values(?,?,?)", (model_version, model_id, model_bytes))

    scores = pd.read_sql(
        f"""
    SELECT PatientId, HasDiabetes as y_test, CAST(CAST(score_result AS JSON).JSONExtractValue('$.predicted_HasDiabetes') AS INT) as y_pred FROM IVSM_SCORE(
                ON (SELECT * FROM {data_conf["table"]}) AS DataTable
                ON (SELECT model_id, model FROM ivsm_models_tmp WHERE model_version = '{model_version}') AS ModelTable DIMENSION
                USING
                    ModelID('{model_id}')
                    ColumnsToPreserve('PatientId', 'HasDiabetes')
                    ModelType('PMML')
            ) sc;
    """, conn)

    y_pred = scores[["y_pred"]]
    y_test = scores[["y_test"]]

    evaluation = {
        'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)),
        'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)),
        'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)),
        'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred))
    }

    with open("artifacts/output/metrics.json", "w+") as f:
        json.dump(evaluation, f)

    # create confusion matrix plot
    cf = metrics.confusion_matrix(y_test, y_pred)

    plt.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest')
    plt.colorbar()
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks([0, 1], ['0', '1'])
    plt.yticks([0, 1], ['0', '1'])

    thresh = cf.max() / 2.
    for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])):
        plt.text(j,
                 i,
                 format(cf[i, j], 'd'),
                 horizontalalignment='center',
                 color='white' if cf[i, j] > thresh else 'black')

    fig = plt.gcf()
    fig.savefig('artifacts/output/confusion_matrix', dpi=500)
    plt.clf()
Example #14
from datetime import datetime, timedelta
from teradataml.context.context import *
import ssl
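# NOTE: the next line disables HTTPS certificate verification globally; this is a
# workaround for self-signed certificates and is not recommended for production use.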
ssl._create_default_https_context = ssl._create_unverified_context

from teradataml.analytics.mle.Arima import Arima
from teradataml.analytics.mle.ArimaPredict import ArimaPredict
from teradataml.analytics.mle.VarMax import VarMax
import params

# Connection

vantage = create_context(host=params.MyHost,
                         username=params.MyUser,
                         password=params.Password,
                         temp_database_name=params.SchemaName,
                         logmech=params.LogMech)

#############################################################
# Fetching Training Data for Cases in State
#############################################################

vac1_u = "SELECT * from " + params.SchemaName + ".US_CONF_STATE_VAC_SMAVG"
print(vac1_u)
#############################################################
# Converting to DataFrame
#############################################################

vac1_u_df = DataFrame.from_query(vac1_u)