import base64
import json
import os

import dill
import numpy as np
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from teradataml import DataFrame, create_context

# NOTE: the import paths below for the AOA / STO helpers (DistDataFrame,
# DistMode, save_metadata, save_evaluation_metrics, cleanup_cli) are
# assumptions -- adjust them to match your ModelOps installation.
from aoa.sto.util import cleanup_cli, save_evaluation_metrics, save_metadata
from aoa.sto import DistDataFrame, DistMode


def score(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    def score_partition(partition):
        # the serialized model is only present on the 1st row of each partition
        model_artefact = partition.loc[partition['n_row'] == 1, 'model_artefact'].iloc[0]
        model = dill.loads(base64.b64decode(model_artefact))

        X = partition[model.features]

        return model.predict(X)

    # we join the model artefact to the 1st row of the data table so we can load it in the partition
    query = f"""
    SELECT d.*, CASE WHEN n_row = 1 THEN m.model_artefact ELSE NULL END AS model_artefact
    FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row
          FROM {data_conf["table"]} x) AS d
    LEFT JOIN aoa_sto_models m ON d.partition_id = m.partition_id
    WHERE m.model_version = '{model_version}'
    """

    df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="my_model_score")
    scored_df = df.map_partition(lambda partition: score_partition(partition),
                                 partition_by="partition_id",
                                 returns=[["prediction", "VARCHAR(255)"]])

    scored_df.to_sql(data_conf["predictions"], if_exists="append")
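
# A minimal, self-contained sketch of the artefact-extraction pattern that
# score_partition (and eval_partition below) rely on. Everything here -- the
# toy model and the hand-built partition -- is fabricated for illustration
# only; in production the partition arrives from the ROW_NUMBER() query with
# the dill-serialized, base64-encoded model joined onto the n_row = 1 row.
def _demo_artefact_extraction():
    import pandas as pd

    toy_model = {"weights": [1, 2, 3]}  # stand-in for a fitted sklearn pipeline
    encoded = base64.b64encode(dill.dumps(toy_model)).decode("ascii")

    partition = pd.DataFrame({
        "partition_id": ["p1", "p1", "p1"],
        "n_row": [1, 2, 3],                       # ROW_NUMBER() from the query
        "model_artefact": [encoded, None, None],  # artefact only on the 1st row
    })

    # same extraction as in score_partition / eval_partition
    artefact = partition.loc[partition["n_row"] == 1, "model_artefact"].iloc[0]
    restored = dill.loads(base64.b64decode(artefact))
    assert restored == toy_model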
def evaluate(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"])

    def eval_partition(partition):
        model_artefact = partition.loc[partition['n_row'] == 1, 'model_artefact'].iloc[0]
        model = dill.loads(base64.b64decode(model_artefact))

        X_test = partition[model.features]
        y_test = partition[['Y1']]

        y_pred = model.predict(X_test)

        partition_id = partition.partition_ID.iloc[0]

        # record whatever partition level information you want like rows, data stats, metrics, explainability, etc
        partition_metadata = json.dumps({
            "num_rows": partition.shape[0],
            "metrics": {
                "MAE": "{:.2f}".format(metrics.mean_absolute_error(y_test, y_pred)),
                "MSE": "{:.2f}".format(metrics.mean_squared_error(y_test, y_pred)),
                "R2": "{:.2f}".format(metrics.r2_score(y_test, y_pred))
            }
        })

        return np.array([[partition_id, partition.shape[0], partition_metadata]])

    # we join the model artefact to the 1st row of the data table so we can load it in the partition
    query = f"""
    SELECT d.*, CASE WHEN n_row = 1 THEN m.model_artefact ELSE NULL END AS model_artefact
    FROM (SELECT x.*, ROW_NUMBER() OVER (PARTITION BY x.partition_id ORDER BY x.partition_id) AS n_row
          FROM {data_conf["table"]} x) AS d
    LEFT JOIN aoa_sto_models m ON d.partition_id = m.partition_id
    WHERE m.model_version = '{model_version}'
    """

    df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="model_eval")
    eval_df = df.map_partition(lambda partition: eval_partition(partition),
                               partition_by="partition_id",
                               returns=[["partition_id", "VARCHAR(255)"],
                                        ["num_rows", "BIGINT"],
                                        ["partition_metadata", "CLOB"]])

    # materialize as we reuse result
    eval_df = DataFrame(eval_df._table_name, materialize=True)

    save_metadata(eval_df)
    save_evaluation_metrics(eval_df, ["MAE", "MSE", "R2"])

    print("Finished evaluation")
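
# A standalone sketch of the per-partition metadata round-trip: eval_partition
# serializes the metrics to JSON so downstream tooling can read them back per
# partition. The y values below are toy numbers, not real model output, and
# nothing here touches the database.
def _demo_metadata_round_trip():
    y_test = [3.0, 2.5, 4.0, 5.1]
    y_pred = [2.8, 2.7, 3.6, 5.0]

    # same structure that eval_partition writes to the partition_metadata CLOB
    partition_metadata = json.dumps({
        "num_rows": len(y_test),
        "metrics": {
            "MAE": "{:.2f}".format(metrics.mean_absolute_error(y_test, y_pred)),
            "MSE": "{:.2f}".format(metrics.mean_squared_error(y_test, y_pred)),
            "R2": "{:.2f}".format(metrics.r2_score(y_test, y_pred)),
        }
    })

    # a consumer can recover individual metrics for this partition
    parsed = json.loads(partition_metadata)
    print("R2 for this partition:", parsed["metrics"]["R2"])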
def train(data_conf, model_conf, **kwargs):
    model_version = kwargs["model_version"]
    hyperparams = model_conf["hyperParameters"]

    create_context(host=os.environ["AOA_CONN_HOST"],
                   username=os.environ["AOA_CONN_USERNAME"],
                   password=os.environ["AOA_CONN_PASSWORD"],
                   database=data_conf["schema"]
                   if "schema" in data_conf and data_conf["schema"] != "" else None)

    cleanup_cli(model_version)

    def train_partition(partition, model_version, hyperparams):
        numeric_features = ["X" + str(i) for i in range(1, 10)]
        for i in numeric_features:
            partition[i] = partition[i].astype("float")

        numeric_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", RobustScaler()),
            ("pca", PCA(0.95))
        ])

        categorical_features = ["flag"]
        for i in categorical_features:
            partition[i] = partition[i].astype("category")

        categorical_transformer = Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
            ("onehot", OneHotEncoder(handle_unknown="ignore"))
        ])

        preprocessor = ColumnTransformer(transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features)
        ])

        features = numeric_features + categorical_features
        pipeline = Pipeline([
            ("preprocessor", preprocessor),
            ("rf", RandomForestRegressor(max_depth=hyperparams["max_depth"]))
        ])
        pipeline.fit(partition[features], partition[['Y1']])
        pipeline.features = features

        partition_id = partition.partition_ID.iloc[0]
        artefact = base64.b64encode(dill.dumps(pipeline))

        # record whatever partition level information you want like rows, data stats, explainability, etc
        partition_metadata = json.dumps({
            "num_rows": partition.shape[0],
            "hyper_parameters": hyperparams
        })

        return np.array([[partition_id, model_version, partition.shape[0],
                          partition_metadata, artefact]])

    print("Starting training...")

    query = "SELECT * FROM {table} WHERE fold_ID='train'".format(table=data_conf["table"])

    df = DistDataFrame(query=query, dist_mode=DistMode.STO, sto_id="model_train")
    model_df = df.map_partition(lambda partition: train_partition(partition, model_version, hyperparams),
                                partition_by="partition_id",
                                returns=[["partition_id", "VARCHAR(255)"],
                                         ["model_version", "VARCHAR(255)"],
                                         ["num_rows", "BIGINT"],
                                         ["partition_metadata", "CLOB"],
                                         ["model_artefact", "CLOB"]])

    # materialize as we reuse result
    model_df = DataFrame(model_df._table_name, materialize=True)

    # append to models table
    model_df.to_sql("aoa_sto_models", if_exists="append")

    save_metadata(model_df)

    print("Finished training")
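
# A hypothetical end-to-end driver tying the three functions together. All
# names below (table, schema, prediction table, hyperparameter value, model
# version) are illustrative assumptions; substitute the values from your own
# data_conf / model_conf. It also assumes the AOA_CONN_HOST, AOA_CONN_USERNAME
# and AOA_CONN_PASSWORD environment variables are set.
if __name__ == "__main__":
    data_conf = {
        "table": "my_training_data",      # needs partition_id and fold_ID columns
        "predictions": "my_predictions",  # scoring output table
        "schema": "my_schema",
    }
    model_conf = {"hyperParameters": {"max_depth": 5}}

    train(data_conf, model_conf, model_version="v1")     # fits one model per partition
    evaluate(data_conf, model_conf, model_version="v1")  # records per-partition metrics
    score(data_conf, model_conf, model_version="v1")     # writes predictions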