def score(data_conf, model_conf, **kwargs): model_version = kwargs["model_version"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"]) def score_partition(partition): model_artefact = partition.loc[partition['n_row'] == 1, 'model_artefact'].iloc[0] model = dill.loads(base64.b64decode(model_artefact)) X = partition[model.features] return model.predict(X) # we join the model artefact to the 1st row of the data table so we can load it in the partition query = f""" SELECT d.*, CASE WHEN n_row=1 THEN m.model_artefact ELSE null END AS model_artefact FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row FROM {data_conf["table"]} x) AS d LEFT JOIN aoa_sto_models m ON d.partition_id = m.partition_id WHERE m.model_version = '{model_version}' """ df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="my_model_score") scored_df = df.map_partition(lambda partition: score_partition(partition), partition_by="partition_id", returns=[["prediction", "VARCHAR(255)"]]) scored_df.to_sql(data_conf["predictions"], if_exists="append")
def score(data_conf, model_conf, **kwargs): model = joblib.load('artifacts/input/model.joblib') # For demo purposes we read data from Vantage, but in a real environment # it can be anything that pyspark can read (csv, parquet, avro, etc...) create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) predict_df = DataFrame(data_conf["table"]) # convert to pandas to use locally predict_df = predict_df.to_pandas() # do feature eng in spark / joins whatever reason you're using pyspark... print("Scoring") y_pred = model.predict(predict_df[model.feature_names]) print("Finished Scoring") # create result dataframe y_pred = pd.DataFrame(y_pred, columns=["pred"]) # wrap as pyspark df predictions = spark.createDataFrame(y_pred) # in a real world you would write the results back to HDFS, Teradata, S3 etc. predictions.write.mode("overwrite").save("/tmp/predictions") logging.info("Finished Saving Scoring")
def score(data_conf, model_conf, **kwargs): model = joblib.load("artifacts/input/model.joblib") create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) predict_df = DataFrame(data_conf["table"]) # convert to pandas to use locally predict_df = predict_df.to_pandas() print("Scoring") y_pred = model.predict(predict_df[model.feature_names]) print("Finished Scoring") # create result dataframe and store in Teradata y_pred = pd.DataFrame(y_pred, columns=["pred"]) y_pred["PatientId"] = predict_df["PatientId"].values copy_to_sql(df=y_pred, table_name=data_conf["predictions"], index=False, if_exists="replace")
def evaluate(data_conf, model_conf, **kwargs): model_version = kwargs["model_version"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"]) def eval_partition(partition): model_artefact = partition.loc[partition['n_row'] == 1, 'model_artefact'].iloc[0] model = dill.loads(base64.b64decode(model_artefact)) X_test = partition[model.features] y_test = partition[['Y1']] y_pred = model.predict(X_test) partition_id = partition.partition_ID.iloc[0] # record whatever partition level information you want like rows, data stats, metrics, explainability, etc partition_metadata = json.dumps({ "num_rows": partition.shape[0], "metrics": { "MAE": "{:.2f}".format(metrics.mean_absolute_error(y_test, y_pred)), "MSE": "{:.2f}".format(metrics.mean_squared_error(y_test, y_pred)), "R2": "{:.2f}".format(metrics.r2_score(y_test, y_pred)) } }) return np.array( [[partition_id, partition.shape[0], partition_metadata]]) # we join the model artefact to the 1st row of the data table so we can load it in the partition query = f""" SELECT d.*, CASE WHEN n_row=1 THEN m.model_artefact ELSE null END AS model_artefact FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row FROM {data_conf["table"]} x) AS d LEFT JOIN aoa_sto_models m ON d.partition_id = m.partition_id WHERE m.model_version = '{model_version}' """ df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="model_eval") eval_df = df.map_partition(lambda partition: eval_partition(partition), partition_by="partition_id", returns=[["partition_id", "VARCHAR(255)"], ["num_rows", "BIGINT"], ["partition_metadata", "CLOB"]]) # materialize as we reuse result eval_df = DataFrame(eval_df._table_name, materialize=True) save_metadata(eval_df) save_evaluation_metrics(eval_df, ["MAE", "MSE", "R2"]) print("Finished evaluation")
def evaluate(data_conf, model_conf, **kwargs): model = joblib.load('artifacts/input/model.joblib') create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) # Read test dataset from Teradata # As this is for demo purposes, we simulate the test dataset changing between executions # by introducing a random sample. Note that the sampling is performed in Teradata! test_df = DataFrame(data_conf["table"]).sample(frac=0.8) test_pdf = test_df.to_pandas() X_test = test_pdf[model.feature_names] y_test = test_pdf[model.target_name] print("Scoring") y_pred = model.predict(test_pdf[model.feature_names]) evaluation = { 'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)), 'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)), 'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)), 'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred)) } with open("artifacts/output/metrics.json", "w+") as f: json.dump(evaluation, f) metrics.plot_confusion_matrix(model, X_test, y_test) save_plot('Confusion Matrix') metrics.plot_roc_curve(model, X_test, y_test) save_plot('ROC Curve') # xgboost has its own feature importance plot support but lets use shap as explainability example import shap shap_explainer = shap.TreeExplainer(model['xgb']) shap_values = shap_explainer.shap_values(X_test) shap.summary_plot(shap_values, X_test, feature_names=model.feature_names, show=False, plot_size=(12, 8), plot_type='bar') save_plot('SHAP Feature Importance') feature_importance = pd.DataFrame(list(zip(model.feature_names, np.abs(shap_values).mean(0))), columns=['col_name', 'feature_importance_vals']) feature_importance = feature_importance.set_index("col_name").T.to_dict(orient='records')[0] stats.record_stats(test_df, features=model.feature_names, predictors=["HasDiabetes"], categorical=["HasDiabetes"], importance=feature_importance, category_labels={"HasDiabetes": {0: "false", 1: "true"}})
def train(data_conf, model_conf, **kwargs): hyperparams = model_conf["hyperParameters"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) feature_names = ["NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age"] target_name = "HasDiabetes" # read training dataset from Teradata and convert to pandas train_df = DataFrame(data_conf["table"]) train_df = train_df.select([feature_names + [target_name]]) train_pdf = train_df.to_pandas() # split data into X and y X_train = train_pdf.drop(target_name, 1) y_train = train_pdf[target_name] print("Starting training...") # fit model to training data model = Pipeline([('scaler', MinMaxScaler()), ('xgb', XGBClassifier(eta=hyperparams["eta"], max_depth=hyperparams["max_depth"]))]) # xgboost saves feature names but lets store on pipeline for easy access later model.feature_names = feature_names model.target_name = target_name model.fit(X_train, y_train) print("Finished training") # export model artefacts joblib.dump(model, "artifacts/output/model.joblib") # we can also save as pmml so it can be used for In-Vantage scoring etc. xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name, pmml_f_name="artifacts/output/model.pmml") print("Saved trained model") from xgboost import plot_importance model["xgb"].get_booster().feature_names = feature_names plot_importance(model["xgb"].get_booster(), max_num_features=10) save_plot("feature_importance.png") feature_importance = model["xgb"].get_booster().get_score(importance_type="weight") stats.record_stats(train_df, features=feature_names, predictors=["HasDiabetes"], categorical=["HasDiabetes"], importance=feature_importance, category_labels={"HasDiabetes": {0: "false", 1: "true"}})
def connect(**kwargs): """Create connection""" if connected(): td.remove_context() try: td.create_context(**kwargs) except: # print(e) # something failed, try again td.create_context(**kwargs)
def evaluate(data_conf, model_conf, **kwargs): model = joblib.load('artifacts/input/model.joblib') create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"]) # Read test dataset from Teradata # As this is for demo purposes, we simulate the test dataset changing between executions # by introducing a random sample. Note that the sampling is performed in Teradata! test_df = DataFrame(data_conf["table"]).sample(frac=0.8) test_df = test_df.to_pandas() X_test = test_df[model.feature_names] y_test = test_df[model.target_name] print("Scoring") y_pred = model.predict(test_df[model.feature_names]) evaluation = { 'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)), 'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)), 'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)), 'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred)) } with open("artifacts/output/metrics.json", "w+") as f: json.dump(evaluation, f) metrics.plot_confusion_matrix(model, X_test, y_test) save_plot('Confusion Matrix') metrics.plot_roc_curve(model, X_test, y_test) save_plot('ROC Curve') # xgboost has its own feature importance plot support but lets use shap as explainability example import shap shap_explainer = shap.TreeExplainer(model['xgb']) shap_values = shap_explainer.shap_values(X_test) shap.summary_plot(shap_values, X_test, feature_names=model.feature_names, show=False, plot_size=(12, 8), plot_type='bar') save_plot('SHAP Feature Importance')
def load_in_batches(dataframe, tableName, totalBatches): try: con = create_context(host='td........org', user=credentials.short_name_usr_ntid, password=credentials.user_ntid_pswrd, database='user_dw', logmech="LDAP") total_rows = len(dataframe.index) print('total_rows', total_rows) def split_equal(value, parts): value = float(value) return [int(i * value / parts) for i in range(1, parts + 1)] index_list = split_equal(total_rows, totalBatches) i = 0 for j in index_list: #print(j) print('Inserting Dataframe', 'df[' + str(i) + ':' + str(j) + ']') rows = dataframe[i:j] copy_to_sql(rows, table_name=tableName, if_exists='append') i = j + 1 finally: remove_context() # if connection not closed the script will hang
def train(data_conf, model_conf, **kwargs): hyperparams = model_conf["hyperParameters"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"]) feature_names = [ "NumTimesPrg", "PlGlcConc", "BloodP", "SkinThick", "TwoHourSerIns", "BMI", "DiPedFunc", "Age" ] target_name = "HasDiabetes" # read training dataset from Teradata and convert to pandas train_df = DataFrame(data_conf["table"]) train_df = train_df.select([feature_names + [target_name]]) train_df = train_df.to_pandas() # split data into X and y X_train = train_df.drop(target_name, 1) y_train = train_df[target_name] print("Starting training...") # fit model to training data model = Pipeline([('scaler', MinMaxScaler()), ('xgb', XGBClassifier(eta=hyperparams["eta"], max_depth=hyperparams["max_depth"]))]) # xgboost saves feature names but lets store on pipeline for easy access later model.feature_names = feature_names model.target_name = target_name model.fit(X_train, y_train) print("Finished training") # export model artefacts joblib.dump(model, "artifacts/output/model.joblib") # we can also save as pmml so it can be used for In-Vantage scoring etc. xgboost_to_pmml(pipeline=model, col_names=feature_names, target_name=target_name, pmml_f_name="artifacts/output/model.pmml") print("Saved trained model")
def train(data_conf, model_conf, **kwargs): model_version = kwargs["model_version"] hyperparams = model_conf["hyperParameters"] create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"], database=data_conf["schema"] if "schema" in data_conf and data_conf["schema"] != "" else None) cleanup_cli(model_version) def train_partition(partition, model_version, hyperparams): numeric_features = ["X" + str(i) for i in range(1, 10)] for i in numeric_features: partition[i] = partition[i].astype("float") numeric_transformer = Pipeline( steps=[("imputer", SimpleImputer( strategy="median")), ("scaler", RobustScaler()), ("pca", PCA(0.95))]) categorical_features = ["flag"] for i in categorical_features: partition[i] = partition[i].astype("category") categorical_transformer = Pipeline( steps=[("imputer", SimpleImputer(strategy="constant", fill_value=0) ), ("onehot", OneHotEncoder(handle_unknown="ignore"))]) preprocessor = ColumnTransformer( transformers=[("num", numeric_transformer, numeric_features), ("cat", categorical_transformer, categorical_features)]) features = numeric_features + categorical_features pipeline = Pipeline([ ("preprocessor", preprocessor), ("rf", RandomForestRegressor(max_depth=hyperparams["max_depth"])) ]) pipeline.fit(partition[features], partition[['Y1']]) pipeline.features = features partition_id = partition.partition_ID.iloc[0] artefact = base64.b64encode(dill.dumps(pipeline)) # record whatever partition level information you want like rows, data stats, explainability, etc partition_metadata = json.dumps({ "num_rows": partition.shape[0], "hyper_parameters": hyperparams }) return np.array([[ partition_id, model_version, partition.shape[0], partition_metadata, artefact ]]) print("Starting training...") query = "SELECT * FROM {table} WHERE fold_ID='train'".format( table=data_conf["table"]) df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="model_train") model_df = df.map_partition(lambda partition: train_partition( partition, model_version, hyperparams), partition_by="partition_id", returns=[["partition_id", "VARCHAR(255)"], ["model_version", "VARCHAR(255)"], ["num_rows", "BIGINT"], ["partition_metadata", "CLOB"], ["model_artefact", "CLOB"]]) # materialize as we reuse result model_df = DataFrame(model_df._table_name, materialize=True) # append to models table model_df.to_sql("aoa_sto_models", if_exists="append") save_metadata(model_df) print("Finished training")
plt.ylabel(yLabel) if yLabel is not None else plt.ylabel("Count") plt.xticks(index, data[xTicksColumnName], rotation=30) plt.title(title) if title is not None else plt.title("Bar Plot") plt.show() ### ### Connection ### # Establish connection to Teradata Vantage server (uses the Teradata SQL Driver # for Python). Before you execute the following statement, replace the variables # <HOSTNAME>, <UID> and <PWD> with your target Vantage system hostname (or # IP address), and your database user ID and password, respectively. td_context = create_context(host="<HOSTNAME>", username="******", password="******") # Notes and alternatives: # 1. In any connection function, you can specify for an argument the getpass() # function of the Python standard library. First, you will need to execute: # import getpass # getpass() enables you to type your password secretly during runtime # without having to hard-code it in the script. # Example: Specifying the argument password = getpass.getpass("Password: "******"Password: "******"LDAP" as follows:
def evaluate(data_conf, model_conf, **kwargs): model_version = kwargs["model_version"] model_id = kwargs["model_id"] engine = create_context(host=os.environ["AOA_CONN_HOST"], username=os.environ["AOA_CONN_USERNAME"], password=os.environ["AOA_CONN_PASSWORD"]) cursor = engine.raw_connection().cursor() conn = get_connection() with open("artifacts/input/model.pmml", "rb") as f: model_bytes = f.read() # we don't want to insert this into the models that can be used yet so add to temporary table and use there cursor.execute(""" CREATE VOLATILE TABLE ivsm_models_tmp( model_version VARCHAR(255), model_id VARCHAR(255), model BLOB(2097088000) ) ON COMMIT PRESERVE ROWS; """) cursor.execute( f"INSERT INTO ivsm_models_tmp(model_version, model_id, model) " "values(?,?,?)", (model_version, model_id, model_bytes)) scores = pd.read_sql( f""" SELECT PatientId, HasDiabetes as y_test, CAST(CAST(score_result AS JSON).JSONExtractValue('$.predicted_HasDiabetes') AS INT) as y_pred FROM IVSM_SCORE( ON (SELECT * FROM {data_conf["table"]}) AS DataTable ON (SELECT model_id, model FROM ivsm_models_tmp WHERE model_version = '{model_version}') AS ModelTable DIMENSION USING ModelID('{model_id}') ColumnsToPreserve('PatientId', 'HasDiabetes') ModelType('PMML') ) sc; """, conn) y_pred = scores[["y_pred"]] y_test = scores[["y_test"]] evaluation = { 'Accuracy': '{:.2f}'.format(metrics.accuracy_score(y_test, y_pred)), 'Recall': '{:.2f}'.format(metrics.recall_score(y_test, y_pred)), 'Precision': '{:.2f}'.format(metrics.precision_score(y_test, y_pred)), 'f1-score': '{:.2f}'.format(metrics.f1_score(y_test, y_pred)) } with open("artifacts/output/metrics.json", "w+") as f: json.dump(evaluation, f) # create confusion matrix plot cf = metrics.confusion_matrix(y_test, y_pred) plt.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest') plt.colorbar() plt.title('Confusion Matrix') plt.xlabel('Predicted') plt.ylabel('Actual') plt.xticks([0, 1], ['0', '1']) plt.yticks([0, 1], ['0', '1']) thresh = cf.max() / 2. for i, j in itertools.product(range(cf.shape[0]), range(cf.shape[1])): plt.text(j, i, format(cf[i, j], 'd'), horizontalalignment='center', color='white' if cf[i, j] > thresh else 'black') fig = plt.gcf() fig.savefig('artifacts/output/confusion_matrix', dpi=500) plt.clf()
from datetime import datetime, timedelta from datetime import datetime from teradataml.context.context import * import ssl ssl._create_default_https_context = ssl._create_unverified_context from teradataml.analytics.mle.Arima import Arima from teradataml.analytics.mle.ArimaPredict import ArimaPredict from teradataml.analytics.mle.VarMax import VarMax import params # Connection vantage = create_context(host=params.MyHost, username=params.MyUser, password=params.Password, temp_database_name=params.SchemaName, logmech=params.LogMech) ############################################################# #Fetching Training Data for Cases in State ############################################################# vac1_u = "SELECT * from " + params.SchemaName + ".US_CONF_STATE_VAC_SMAVG" print(vac1_u) ############################################################# #Converting to Dataframe ############################################################# vac1_u_df = DataFrame.from_query(vac1_u)