def train(original_sql, model_image, estimator_string, datasource, select,
          validation_select, model_params, train_params, validation_params,
          feature_column_map, label_column, save, load=None):
    if load:
        # Loading the pre-trained model from the database restores its files
        # locally, so train_step below can warm-start from the local
        # "model_save" directory instead of training from scratch.
        Model.load_from_db(datasource, load)
        load = "model_save"
    else:
        load = None
    train_step(original_sql=original_sql,
               model_image=model_image,
               estimator_string=estimator_string,
               datasource=datasource,
               select=select,
               validation_select=validation_select,
               model_params=model_params,
               train_params=train_params,
               validation_params=validation_params,
               feature_column_map=feature_column_map,
               label_column=label_column,
               save=save,
               load=load)

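# Usage sketch (illustrative assumptions only: the DSN, table names, and
# hyper-parameters below are placeholders, not taken from the source).
# Passing `load` continues training from a previously saved model; omit it
# to train from scratch.
train(original_sql="SELECT ... TO TRAIN xgboost.gbtree ...",
      model_image="sqlflow:step",
      estimator_string="xgboost.gbtree",
      datasource="mysql://root:root@tcp(127.0.0.1:3306)/",
      select="SELECT * FROM iris.train",
      validation_select="SELECT * FROM iris.test",
      model_params={"objective": "multi:softprob", "num_class": 3},
      train_params={"num_boost_round": 10},
      validation_params={},
      feature_column_map=None,
      label_column=None,
      save="sqlflow_models.my_xgb_model",
      load="sqlflow_models.my_xgb_model")
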
def explain(datasource, select, explainer, model_params, result_table, model):
    if model_params is None:
        model_params = {}

    # Parameters prefixed with "summary." are routed to the SHAP summary
    # plot rather than to the explainer itself.
    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    # NOTE: the XGBoost model file name "my_model" is hard coded in
    # xgboost/train.py.
    bst = xgb.Booster()
    bst.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # When explainer is "" or "TreeExplainer", use SHAP by default.
        shap_explain(bst, datasource, select, summary_params, result_table,
                     model)

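# A minimal sketch of the "summary." prefix convention used above: keys such
# as "summary.plot_type" are stripped of the prefix and forwarded to the SHAP
# summary plot, while the remaining keys stay with the explainer. The
# parameter values here are illustrative assumptions.
_model_params = {"summary.plot_type": "dot", "summary.alpha": 1, "nthread": 4}
_summary_params = {
    k[len("summary."):]: v
    for k, v in _model_params.items() if k.startswith("summary.")
}
assert _summary_params == {"plot_type": "dot", "alpha": 1}
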
def submit_local_explain(datasource, original_sql, select, model,
                         model_params, result_table,
                         explainer="TreeExplainer", user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        explain_func = xgboost_explain
    else:
        explain_func = tf_explain

    if result_table:
        feature_columns = model.get_meta("features")
        estimator_string = model.get_meta("class_name")
        field_descs = get_ordered_field_descs(feature_columns)
        feature_column_names = [fd.name for fd in field_descs]
        with db.connect_with_data_source(datasource) as conn:
            create_explain_table(conn, model.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)

    explain_func(datasource=datasource,
                 select=select,
                 explainer=explainer,
                 model_params=model_params,
                 result_table=result_table,
                 model=model)
    if not result_table:
        print_image_as_base64_html("summary.png")

def submit_local_pred(datasource, select, result_table, pred_label_name,
                      load):
    model = Model.load_from_db(datasource, load)
    if model.get_type() == EstimatorType.XGBOOST:
        xgboost_pred(datasource, select, result_table, pred_label_name, model)
    else:
        raise NotImplementedError("not implemented model type: %s" %
                                  model.get_type())

def submit_local_evaluate(datasource, original_sql, select, label_name,
                          model, model_params, result_table, user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        evaluate_func = xgboost_evaluate
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        evaluate_func = tf_evaluate
        validation_metrics = model_params.get("validation.metrics",
                                              "Accuracy")

    conn = db.connect_with_data_source(datasource)
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)
    conn.close()

    evaluate_func(datasource=datasource,
                  select=select,
                  result_table=result_table,
                  model=model,
                  label_name=label_name,
                  model_params=model_params,
                  result_column_names=result_column_names)

def evaluate_step(datasource, select, result_table, model, label_name,
                  model_params, pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)

def submit_local_pred(datasource, original_sql, select, model_name,
                      label_column, model_params, result_table, user=""):
    model = Model.load_from_db(datasource, model_name)
    if model.get_type() == EstimatorType.XGBOOST:
        pred_func = xgboost_pred
    else:
        pred_func = tf_pred

    pred_func(datasource=datasource,
              select=select,
              result_table=result_table,
              pred_label_name=label_column,
              model=model)

def submit_local_explain(datasource, original_sql, select, model_name,
                         model_params, result_table,
                         explainer="TreeExplainer", user=""):
    model = Model.load_from_db(datasource, model_name)
    if model.get_type() == EstimatorType.XGBOOST:
        explain_func = xgboost_explain
    else:
        explain_func = tf_explain

    explain_func(datasource=datasource,
                 select=select,
                 explainer=explainer,
                 model_params=model_params,
                 result_table=result_table,
                 model=model)

def submit_local_evaluate(datasource, original_sql, select, pred_label_name,
                          model_name, model_params, result_table, user=""):
    model = Model.load_from_db(datasource, model_name)
    if model.get_type() == EstimatorType.XGBOOST:
        evaluate_func = xgboost_evaluate
    else:
        evaluate_func = tf_evaluate

    evaluate_func(datasource=datasource,
                  select=select,
                  result_table=result_table,
                  model=model,
                  pred_label_name=pred_label_name,
                  model_params=model_params)

def submit_local_pred(datasource, original_sql, select, model, label_name,
                      pred_params, result_table, user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        pred_func = xgboost_pred
    else:
        pred_func = tf_pred

    if model.get_meta("label") is None:
        train_label_desc = None
    else:
        train_label_desc = model.get_meta("label").get_field_desc()[0]

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("predict.extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    with db.connect_with_data_source(datasource) as conn:
        result_column_names, train_label_idx = create_predict_table(
            conn, select, result_table, train_label_desc, label_name,
            extra_result_cols)

    pred_func(datasource=datasource,
              select=select,
              result_table=result_table,
              result_column_names=result_column_names,
              train_label_idx=train_label_idx,
              model=model,
              extra_result_cols=extra_result_cols)

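# Minimal sketch of how "predict.extra_outputs" is parsed above: a
# comma-separated attribute value becomes a list of extra result columns,
# with empty entries dropped. The parameter value is an illustrative
# assumption.
_pred_params = {"predict.extra_outputs": "prob, leaf_index"}
_extra_result_cols = [
    c.strip()
    for c in _pred_params.get("predict.extra_outputs", "").split(",")
    if c.strip()
]
assert _extra_result_cols == ["prob", "leaf_index"]
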
def explain(datasource, select, explainer, model_params, result_table, model,
            pai_table="", oss_model_path="", oss_dest=None, oss_ak=None,
            oss_sk=None, oss_endpoint=None, oss_bucket_name=None):
    """Explain a trained XGBoost model with the selected data.

    Args:
        datasource (str): the database connection string.
        select (str): SQL statement to fetch the explanation data set.
        explainer (str): the explainer to use, "XGBoostExplainer" or
            "TreeExplainer" (SHAP, the default).
        model_params (dict): explain parameters; keys prefixed with
            "summary." configure the SHAP summary plot.
        result_table (str): the table to store the explanation result.
        model (Model|str): the model object or where to load the model.
        pai_table (str): the input table on PAI, if submitted to PAI.
        oss_model_path (str): the OSS path of the model, used on PAI.
        oss_dest/oss_ak/oss_sk/oss_endpoint/oss_bucket_name: OSS access
            parameters used to upload the summary plot when on PAI.
    """
    if model_params is None:
        model_params = {}

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    fc_map_ir = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    is_pai = True if pai_table else False

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, is_pai, pai_table,
                               transform_fn)

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # When explainer is "" or "TreeExplainer", use SHAP by default.
        shap_explain(bst,
                     datasource,
                     dataset,
                     summary_params,
                     result_table,
                     is_pai=is_pai,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name)

def explain(datasource, select, explainer, model_params, result_table, model,
            pai_table="", oss_model_path=""):
    """Do XGBoost model explanation. This function uses the selected data to
    explain the model stored at oss_model_path or loaded from the model zoo
    database.

    Args:
        datasource: the datasource to load the explain data.
        select: SQL statement to get the explain data set.
        explainer: the explainer to use, "XGBoostExplainer" or
            "TreeExplainer" (SHAP, the default).
        model_params: explain parameters; keys prefixed with "summary."
            configure the SHAP summary plot.
        result_table: table to store the explanation result.
        model: the model object or where to load the model.
        pai_table: the input table on PAI, if submitted to PAI.
        oss_model_path: path to the model to be explained, used on PAI.
    """
    if model_params is None:
        model_params = {}

    summary_params = dict()
    for k in model_params:
        if k.startswith("summary."):
            summary_key = k.replace("summary.", "")
            summary_params[summary_key] = model_params[k]

    is_pai = True if pai_table != "" else False
    if is_pai:
        # NOTE(typhoonzero): the xgboost model file "my_model" is hard coded
        # in xgboost/train.py
        oss.load_file(oss_model_path, "my_model")
        (estimator, model_params, train_params, feature_field_meta,
         feature_column_names, label_desc,
         fc_map_ir) = oss.load_metas(oss_model_path, "xgboost_model_desc")
        label_meta = label_desc.to_dict(dtype_to_string=True)
    else:
        if isinstance(model, six.string_types):
            model = Model.load_from_db(datasource, model)
        else:
            assert isinstance(
                model, Model), "not supported model type %s" % type(model)

        fc_map_ir = model.get_meta("features")
        label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
            dtype_to_string=True)

    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    # This function only handles XGBoost models, so compile the feature
    # columns for XGBOOST directly; on PAI, `model` may still be a string
    # and has no get_type() method.
    compiled_fc = compile_ir_feature_columns(fc_map_ir, EstimatorType.XGBOOST)
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    bst = xgb.Booster()
    bst.load_model("my_model")

    if explainer == "XGBoostExplainer":
        xgb_native_explain(bst, datasource, result_table)
    else:
        # When explainer is "" or "TreeExplainer", use SHAP by default.
        shap_explain(bst, datasource, dataset, summary_params, result_table)

def pred(datasource, select, result_table, pred_label_name, model):
    """
    Do prediction using a trained model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        pred_label_name (str): the output label name to predict.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")
    conn = db.connect_with_data_source(datasource)

    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, pred_label_name)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=None,
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            raw_data_dir=raw_data_dir)  # NOTE: default to use external memory

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            feature_file_name = os.path.join(
                tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()

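# Usage sketch for `pred` (illustrative; the DSN and table names are
# assumptions, not taken from the source). `model` may be either the name of
# a saved model, which is then loaded via Model.load_from_db, or an
# already-loaded Model object:
pred(datasource="mysql://root:root@tcp(127.0.0.1:3306)/",
     select="SELECT * FROM iris.test",
     result_table="iris.predict_result",
     pred_label_name="class",
     model="sqlflow_models.my_xgb_model")
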
def explain_step(datasource, select, explainer, model_params, result_table,
                 model, pai_table=None, oss_dest=None, oss_ak=None,
                 oss_sk=None, oss_endpoint=None, oss_bucket_name=None):
    """
    Do explanation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
                         Not used in TensorFlow models.
        model_params (dict): the parameters for evaluation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if pai_table:
        assert oss_dest, "oss_dest must be given when submitting to PAI"
    else:
        assert oss_dest is None

    if os.environ.get('DISPLAY', '') == '':
        print('no display found. Using non-interactive Agg backend')
        matplotlib.use('Agg')

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             pai_table=pai_table,
             plot_type=plot_type,
             result_table=result_table,
             oss_dest=oss_dest,
             oss_ak=oss_ak,
             oss_sk=oss_sk,
             oss_endpoint=oss_endpoint,
             oss_bucket_name=oss_bucket_name)
    print_image_as_base64_html('summary.png')

def explain(datasource, select, explainer, model_params, result_table, model):
    """
    Do explanation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        explainer (str): the explainer to explain the model.
                         Not used in TensorFlow models.
        model_params (dict): the parameters for evaluation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        if estimator_string.startswith("BoostedTrees"):
            # BoostedTrees models export per-feature directional feature
            # contributions (dfc) and gain.
            column_defs = [
                "feature %s" %
                DataType.to_db_field_type(conn.driver, DataType.STRING),
                "dfc %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
                "gain %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
            ]
        else:
            # Other models export one float column per flattened input
            # feature dimension.
            selected_cols = db.selected_cols(conn, select)
            if label_name in selected_cols:
                selected_cols.remove(label_name)

            name_to_shape = dict([(fd.name, fd.shape) for fd in field_descs])
            column_defs = []
            float_field_type = DataType.to_db_field_type(
                conn.driver, DataType.FLOAT32)
            for name in selected_cols:
                shape = name_to_shape.get(name, None)
                if shape is None:
                    raise ValueError("cannot find column %s" % name)

                size = int(np.prod(shape))
                if size == 1:
                    column_def = "%s %s" % (name, float_field_type)
                    column_defs.append(column_def)
                else:
                    for i in six.moves.range(size):
                        column_def = "%s_%d %s" % (name, i, float_field_type)
                        column_defs.append(column_def)

        drop_sql = "DROP TABLE IF EXISTS %s;" % result_table
        create_sql = "CREATE TABLE %s (%s);" % (result_table,
                                                ",".join(column_defs))
        conn.execute(drop_sql)
        conn.execute(create_sql)
        conn.close()

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             plot_type=plot_type,
             result_table=result_table)

    with open('summary.png', 'rb') as f:
        img = f.read()
    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
        % img
    print(img)

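# The inline base64 embedding above is presumably what helpers like
# `print_image_as_base64_html` (called elsewhere in this module) wrap. A
# minimal sketch, assuming only the behavior visible in the code above; the
# helper name below is hypothetical:
def _print_image_as_base64_html_sketch(image_path):
    # Read the PNG, base64-encode it, and print an HTML <img> wrapper so a
    # notebook-style frontend can render the image inline.
    with open(image_path, 'rb') as f:
        img = base64.b64encode(f.read())
    if six.PY3:
        img = img.decode('utf-8')
    print("<div align='center'>"
          "<img src='data:image/png;base64,%s' /></div>" % img)
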
def predict_step(datasource, select, result_table, label_name, model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()

def evaluate(datasource, select, result_table, model, pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()

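# Usage sketch for `evaluate` (illustrative; the DSN, table names, and
# metric names are assumptions, not taken from the source).
# "validation.metrics" takes a comma-separated list that is split and
# stripped before the evaluation result table is created:
evaluate(datasource="mysql://root:root@tcp(127.0.0.1:3306)/",
         select="SELECT * FROM iris.test",
         result_table="iris.evaluate_result",
         model="sqlflow_models.my_xgb_model",
         pred_label_name="class",
         model_params={"validation.metrics": "accuracy_score, f1_score"})
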
def evaluate(datasource, select, result_table, model, label_name=None,
             model_params=None, result_column_names=[], pai_table=None):
    """Evaluate a trained XGBoost model with the selected data.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to evaluate.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.
        result_column_names (list[str]): the column names of result_table.
        pai_table (str): the input table on PAI, if submitted to PAI.

    Returns:
        None.
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]
    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()

def evaluate(datasource, select, result_table, load, pred_label_name=None,
             validation_metrics=["accuracy_score"]):
    """
    Do evaluation to a trained XGBoost model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        load (str): where the trained model stores.
        pred_label_name (str): the label column name.
        validation_metrics (list[str]): the evaluation metric names.

    Returns:
        None.
    """
    model = Model.load_from_db(datasource, load)
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    if pred_label_name:
        train_label_desc.name = pred_label_name

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict()) for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    bst = xgb.Booster()
    bst.load_model("my_model")

    conn = db.connect_with_data_source(datasource)
    result_column_names = _create_evaluate_table(conn, result_table,
                                                 validation_metrics)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=train_label_desc.to_dict(),
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn)

        for i, pred_dmatrix in enumerate(dpred):
            feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()

def evaluate(datasource, select, result_table, model, pred_label_name=None,
             model_params=None):
    """
    Do evaluation to a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to predict.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.
        pred_label_name (str): the label column name.
        model_params (dict): the parameters for evaluation.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = pred_label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose)