def explain(datasource, select, explainer, model_params, result_table, model):
    if model_params is None:
        model_params = {}

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model, Model), \
            "not supported model type %s" % type(model)

    bst = xgb.Booster()
    bst.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(bst, datasource, select, summary_params, result_table,
                     model)

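# A minimal usage sketch (not part of the original module) of the XGBoost
# explain() above: it accepts either a Model object or a saved model name, and
# only keys prefixed with "summary." are forwarded to the SHAP summary plot.
# The DSN, table names, and model name below are hypothetical placeholders.
def _example_xgboost_explain():
    datasource = "mysql://root:root@tcp(127.0.0.1:3306)/"  # hypothetical DSN
    explain(datasource=datasource,
            select="SELECT * FROM iris.train",  # hypothetical input table
            explainer="",  # "" or "TreeExplainer" selects the SHAP path
            model_params={"summary.plot_type": "dot"},  # becomes summary_params["plot_type"]
            result_table="iris.explain_result",  # hypothetical output table
            model="sqlflow_models.my_xgb_model")  # loaded via Model.load_from_db
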
def submit_local_pred(datasource, select, result_table, pred_label_name, load):
    model = Model.load_from_db(datasource, load)
    if model.get_type() == EstimatorType.XGBOOST:
        xgboost_pred(datasource, select, result_table, pred_label_name, model)
    else:
        raise NotImplementedError("not implemented model type: %s" %
                                  model.get_type())

def evaluate_step(datasource,
                  select,
                  result_table,
                  model,
                  label_name,
                  model_params,
                  pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model, Model), \
            "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)

def get_saved_model_type_and_estimator(datasource, model_name):
    """Get the OSS model type and estimator name. The model can be:
    1. PAI ML models: the model is saved by PAI
    2. XGBoost: on OSS with the model file xgboost_model_desc
    3. PAI TensorFlow models: on OSS with the meta file tensorflow_model_desc

    Args:
        datasource: the DBMS connection URI.
        model_name: the model to get info of.

    Returns:
        If the model is a TensorFlow model, return the model type and the
        estimator name. If the model is XGBoost or another PAI model, just
        return the model type.
    """
    # FIXME(typhoonzero): if the model does not exist on OSS, assume it's a
    # random forest model; should use a general method to fetch the model and
    # check the model type.
    meta = Model.load_metadata_from_db(datasource, model_name)
    return meta.get_type(), meta.get_meta("class_name")

def submit_local_pred(datasource,
                      original_sql,
                      select,
                      model_name,
                      label_column,
                      model_params,
                      result_table,
                      user=""):
    model = Model.load_from_db(datasource, model_name)
    if model.get_type() == EstimatorType.XGBOOST:
        pred_func = xgboost_pred
    else:
        pred_func = tf_pred

    pred_func(datasource=datasource,
              select=select,
              result_table=result_table,
              pred_label_name=label_column,
              model=model)

def submit_local_explain(datasource,
                         original_sql,
                         select,
                         model_name,
                         model_params,
                         result_table,
                         explainer="TreeExplainer",
                         user=""):
    model = Model.load_from_db(datasource, model_name)
    if model.get_type() == EstimatorType.XGBOOST:
        explain_func = xgboost_explain
    else:
        explain_func = tf_explain

    explain_func(datasource=datasource,
                 select=select,
                 explainer=explainer,
                 model_params=model_params,
                 result_table=result_table,
                 model=model)

def submit_local_evaluate(datasource,
                          original_sql,
                          select,
                          pred_label_name,
                          model_name,
                          model_params,
                          result_table,
                          user=""):
    model = Model.load_from_db(datasource, model_name)
    if model.get_type() == EstimatorType.XGBOOST:
        evaluate_func = xgboost_evaluate
    else:
        evaluate_func = tf_evaluate

    evaluate_func(datasource=datasource,
                  select=select,
                  result_table=result_table,
                  model=model,
                  pred_label_name=pred_label_name,
                  model_params=model_params)

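# A minimal sketch (not part of the original source) of how the local entry
# points above are meant to be called: each loads the saved Model, branches on
# EstimatorType, and forwards the same keyword arguments to the XGBoost or
# TensorFlow implementation. The DSN, table names, and model name below are
# hypothetical placeholders.
def _example_submit_local_evaluate():
    datasource = "mysql://root:root@tcp(127.0.0.1:3306)/"  # hypothetical DSN
    submit_local_evaluate(datasource=datasource,
                          original_sql="",  # unused by the local path
                          select="SELECT * FROM iris.test",  # hypothetical data
                          pred_label_name="class",  # label column in the data
                          model_name="sqlflow_models.my_model",  # hypothetical saved model
                          model_params={"validation.metrics": "Accuracy"},
                          result_table="iris.evaluate_result")  # hypothetical output
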
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """Evaluate a trained XGBoost model and write the metrics into
    result_table.

    Args:
        datasource (str): the database connection string.
        select (str): the SQL statement that selects the evaluation data.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.
        result_column_names (list[str]): the result table column names.
        pai_table (str): the PAI table to read data from, if running on PAI.

    Returns:
        None.
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model, Model), \
            "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]
    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()

def explain(datasource, select, explainer, model_params, result_table, model): """ Do explanation to a trained TensorFlow model. Args: datasource (str): the database connection string. select (str): the input data to predict. explainer (str): the explainer to explain the model. Not used in TensorFlow models. model_params (dict): the parameters for evaluation. result_table (str): the output data table. model (Model|str): the model object or where to load the model. Returns: None. """ if isinstance(model, six.string_types): model = Model.load_from_db(datasource, model) else: assert isinstance(model, Model), "not supported model type %s" % type(model) plot_type = model_params.get("summary.plot_type", "bar") train_attributes = model.get_meta("attributes") train_fc_map = model.get_meta("features") train_label_desc = model.get_meta("label").get_field_desc()[0] estimator_string = model.get_meta("class_name") save = "model_save" field_descs = get_ordered_field_descs(train_fc_map) feature_column_names = [fd.name for fd in field_descs] feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True)) for fd in field_descs]) feature_columns = compile_ir_feature_columns(train_fc_map, model.get_type()) label_name = model_params.get("label_col", train_label_desc.name) train_label_desc.name = label_name label_meta = train_label_desc.to_dict(dtype_to_string=True) if result_table: conn = db.connect_with_data_source(datasource) if estimator_string.startswith("BoostedTrees"): column_defs = [ "feature %s" % DataType.to_db_field_type(conn.driver, DataType.STRING), "dfc %s" % DataType.to_db_field_type(conn.driver, DataType.FLOAT32), "gain %s" % DataType.to_db_field_type(conn.driver, DataType.FLOAT32), ] else: selected_cols = db.selected_cols(conn, select) if label_name in selected_cols: selected_cols.remove(label_name) name_to_shape = dict([(fd.name, fd.shape) for fd in field_descs]) column_defs = [] float_field_type = DataType.to_db_field_type( conn.driver, DataType.FLOAT32) for name in selected_cols: shape = name_to_shape.get(name, None) if shape is None: raise ValueError("cannot find column %s" % name) size = int(np.prod(shape)) if size == 1: column_def = "%s %s" % (name, float_field_type) column_defs.append(column_def) else: for i in six.moves.range(size): column_def = "%s_%d %s" % (name, i, float_field_type) column_defs.append(column_def) drop_sql = "DROP TABLE IF EXISTS %s;" % result_table create_sql = "CREATE TABLE %s (%s);" % (result_table, ",".join(column_defs)) conn.execute(drop_sql) conn.execute(create_sql) conn.close() _explain(datasource=datasource, estimator_string=estimator_string, select=select, feature_columns=feature_columns, feature_column_names=feature_column_names, feature_metas=feature_metas, label_meta=label_meta, model_params=train_attributes, save=save, plot_type=plot_type, result_table=result_table) with open('summary.png', 'rb') as f: img = f.read() img = base64.b64encode(img) if six.PY3: img = img.decode('utf-8') img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \ % img print(img)
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model, Model), \
            "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()

def evaluate(datasource,
             select,
             result_table,
             model,
             pred_label_name=None,
             model_params=None):
    """Evaluate a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the SQL statement that selects the evaluation data.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model, Model), \
            "not supported model type %s" % type(model)

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    train_label_desc.name = pred_label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose)

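# A minimal sketch (not part of the original source) of the "validation.*"
# parameter convention consumed by the TensorFlow evaluate() above: metrics
# are given as a comma-separated string, and steps/batch_size/verbose are
# plain scalars. All names below are hypothetical placeholders.
def _example_tf_evaluate_params():
    model_params = {
        "validation.metrics": "Accuracy,Precision",  # split on "," and stripped
        "validation.steps": None,  # evaluate the whole data set
        "validation.batch_size": 32,
        "validation.verbose": 0,
    }
    evaluate(datasource="mysql://root:root@tcp(127.0.0.1:3306)/",  # hypothetical DSN
             select="SELECT * FROM iris.test",  # hypothetical data
             result_table="iris.evaluate_result",  # hypothetical output table
             model="sqlflow_models.my_dnn_model",  # hypothetical saved model
             pred_label_name="class",
             model_params=model_params)
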
def submit_pai_predict(datasource,
                       original_sql,
                       select,
                       model,
                       label_name,
                       pred_params,
                       result_table,
                       user=""):
    """This function packs the needed params and resources into a tarball and
    submits a prediction task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO PREDICT" statement.
        select: string
            SQL statement to get the prediction data set.
        model: string
            Model to load and do prediction with.
        label_name: string
            Name of the label column, if it does not exist in select.
        pred_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the prediction result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table.count(".") == 0:
        result_table = "%s.%s" % (project, result_table)

    model_metas = Model.load_metadata_from_db(datasource, model)
    model_type = model_metas.get_type()
    estimator = model_metas.get_meta("class_name")
    setup_predict_entry(params, model_type)

    train_label = model_metas.get_meta("label")
    if train_label is not None:
        train_label_desc = train_label.get_field_desc()[0]
    else:
        train_label_desc = None

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        del params["label_name"]
        params["pai_table"] = data_table
        params["result_column_names"] = result_column_names
        params["train_label_idx"] = train_label_idx
        params["extra_result_cols"] = extra_result_cols

        if try_pai_local_run(params, oss_model_path):
            return

        with temp_file.TemporaryDirectory(prefix="sqlflow", dir="/tmp") as cwd:
            prepare_archive(cwd, estimator, oss_model_path, params)
            cmd = get_pai_predict_cmd(
                datasource, project, oss_model_path, model, data_table,
                result_table, model_type, pred_params,
                "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                "file://" + os.path.join(cwd, PARAMS_FILE))
            submit_pai_task(cmd, datasource)

def explain_step(datasource,
                 select,
                 explainer,
                 model_params,
                 result_table,
                 model,
                 pai_table=None,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    """Explain a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the SQL statement that selects the input data.
        explainer (str): the explainer to explain the model.
                         Not used in TensorFlow models.
        model_params (dict): the parameters for the explanation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model, Model), \
            "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if pai_table:
        assert oss_dest, "oss_dest must be given when submit to PAI"
    else:
        assert oss_dest is None

    if os.environ.get('DISPLAY', '') == '':
        print('no display found. Using non-interactive Agg backend')
        matplotlib.use('Agg')

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             pai_table=pai_table,
             plot_type=plot_type,
             result_table=result_table,
             oss_dest=oss_dest,
             oss_ak=oss_ak,
             oss_sk=oss_sk,
             oss_endpoint=oss_endpoint,
             oss_bucket_name=oss_bucket_name)

    print_image_as_base64_html('summary.png')

def submit_pai_explain(datasource,
                       original_sql,
                       select,
                       model,
                       model_params,
                       result_table,
                       explainer="TreeExplainer",
                       user=""):
    """This function packs the needed params and resources into a tarball and
    submits an explain task to PAI.

    Args:
        datasource: string
            Like: maxcompute://ak:[email protected]/api?
                  curr_project=test_ci&scheme=http
        original_sql: string
            Original "TO EXPLAIN" statement.
        select: string
            SQL statement to get the explain data set.
        model: string
            Model to load and do explanation with.
        model_params: dict
            Params for training, corresponding to the WITH clause.
        result_table: string
            The table name to save the explain result.
        user: string
            A string to identify the user, used to load the model from the
            user's directory.
    """
    params = dict(locals())

    # format resultTable name to "db.table" to let the codegen form a
    # submitting argument of format "odps://project/tables/table_name"
    project = table_ops.get_project(datasource)
    if result_table:
        if result_table.count(".") == 0:
            result_table = "%s.%s" % (project, result_table)
        params["result_table"] = result_table

    # used to save the explain image
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    params["oss_dest"] = "explain_images/%s/%s" % (user, timestamp)
    add_env_to_params(params, "SQLFLOW_OSS_AK", "oss_ak")
    add_env_to_params(params, "SQLFLOW_OSS_SK", "oss_sk")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_ENDPOINT", "oss_endpoint")
    add_env_to_params(params, "SQLFLOW_OSS_ALISA_BUCKET", "oss_bucket_name")

    meta = Model.load_metadata_from_db(datasource, model)
    model_type = meta.get_type()
    estimator = meta.get_meta("class_name")
    label_name = model_params.get("label_col")
    if label_name is None:
        label_column = meta.get_meta("label")
        if label_column is not None:
            label_name = label_column.get_field_desc()[0].name

    setup_explain_entry(params, model_type)

    oss_model_path = pai_model.get_oss_model_save_path(datasource,
                                                       model,
                                                       user=user)

    # TODO(typhoonzero): Do **NOT** create tmp table when the select statement
    # is like: "SELECT fields,... FROM table"
    with table_ops.create_tmp_tables_guard(select, datasource) as data_table:
        params["pai_table"] = data_table

        # Create explain result table
        if result_table:
            conn = db.connect_with_data_source(datasource)
            feature_columns = meta.get_meta("features")
            estimator_string = meta.get_meta("class_name")
            field_descs = get_ordered_field_descs(feature_columns)
            feature_column_names = [fd.name for fd in field_descs]
            create_explain_table(conn, meta.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)
            conn.close()

        if not try_pai_local_run(params, oss_model_path):
            with temp_file.TemporaryDirectory(prefix="sqlflow",
                                              dir="/tmp") as cwd:
                prepare_archive(cwd, estimator, oss_model_path, params)
                cmd = get_pai_explain_cmd(
                    datasource, project, oss_model_path, model, data_table,
                    result_table, model_type, model_params,
                    "file://" + os.path.join(cwd, JOB_ARCHIVE_FILE),
                    "file://" + os.path.join(cwd, PARAMS_FILE), label_name)
                submit_pai_task(cmd, datasource)

    if result_table:
        print('Saved result into: {}'.format(result_table))
    else:
        print_oss_image(params["oss_dest"], params["oss_ak"],
                        params["oss_sk"], params["oss_endpoint"],
                        params["oss_bucket_name"])

def evaluate(datasource,
             select,
             result_table,
             load,
             pred_label_name=None,
             validation_metrics=["accuracy_score"]):
    """Evaluate a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the SQL statement that selects the evaluation data.
        result_table (str): the output data table.
        load (str): where the trained model is stored.
        pred_label_name (str): the label column name.
        validation_metrics (list[str]): the evaluation metric names.

    Returns:
        None.
    """
    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into the DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = _create_evaluate_table(conn, result_table,
                                                 validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=train_label_desc.to_dict(),
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()