def shap_explain(booster, datasource, dataset, summary_params, result_table="", is_pai=False, oss_dest=None, oss_ak=None, oss_sk=None, oss_endpoint=None, oss_bucket_name=None): tree_explainer = shap.TreeExplainer(booster) shap_values = tree_explainer.shap_values(dataset) if result_table: if is_pai: conn = PaiIOConnection.from_table(result_table) else: conn = db.connect_with_data_source(datasource) # TODO(typhoonzero): the shap_values is may be a # list of shape [3, num_samples, num_features], # use the first dimension here, should find out # when to use the other two. When shap_values is # not a list it can be directly used. if isinstance(shap_values, list): to_write = shap_values[0] else: to_write = shap_values columns = list(dataset.columns) with db.buffered_db_writer(conn, result_table, columns) as w: for row in to_write: w.write(list(row)) conn.close() if summary_params.get("plot_type") == "decision": shap_interaction_values = tree_explainer.shap_interaction_values( dataset) expected_value = tree_explainer.expected_value if isinstance(shap_interaction_values, list): shap_interaction_values = shap_interaction_values[0] if isinstance(expected_value, list): expected_value = expected_value[0] plot_func = lambda: shap.decision_plot( # noqa: E731 expected_value, shap_interaction_values, dataset, show=False, feature_display_range=slice(None, -40, -1), alpha=1) else: plot_func = lambda: shap.summary_plot( # noqa: E731 shap_values, dataset, show=False, **summary_params) explainer.plot_and_save(plot_func, oss_dest=oss_dest, oss_ak=oss_ak, oss_sk=oss_sk, oss_endpoint=oss_endpoint, oss_bucket_name=oss_bucket_name, filename='summary')
def test_field_type(self):
    self.assertGreater(len(MYSQL_FIELD_TYPE_DICT), 0)

    addr = os.getenv("SQLFLOW_TEST_DB_MYSQL_ADDR", "localhost:3306")
    conn = connect_with_data_source(
        "mysql://*****:*****@tcp(%s)/?maxAllowedPacket=0" % addr)
    cursor = conn.cursor()

    table_name = "iris.test_mysql_field_type_table"
    drop_table_sql = "DROP TABLE IF EXISTS %s" % table_name
    create_table_sql = "CREATE TABLE IF NOT EXISTS " + \
        table_name + "(a %s)"
    select_sql = "SELECT * FROM %s" % table_name

    for int_type, str_type in MYSQL_FIELD_TYPE_DICT.items():
        if str_type in ["VARCHAR", "CHAR"]:
            str_type += "(255)"

        cursor.execute(drop_table_sql)
        cursor.execute(create_table_sql % str_type)
        cursor.execute(select_sql)
        int_type_actual = cursor.description[0][1]
        cursor.execute(drop_table_sql)

        self.assertEqual(int_type_actual, int_type,
                         "%s not match" % str_type)


def load_db_data_to_data_frame(datasource, select):
    """
    Load database data to a pandas.DataFrame.

    Args:
        datasource (str): the database connection URI.
        select (str): the select SQL statement.

    Returns:
        A pandas.DataFrame object which contains all queried data.
    """
    conn = db.connect_with_data_source(datasource)
    generator = verifier.fetch_samples(conn, select, n=-1)
    names = generator.field_names
    dtypes = []
    for dtype in generator.field_types:
        if dtype in ['VARCHAR', 'CHAR', 'TEXT', 'STRING']:
            dtypes.append(np.str)
        else:
            dtypes.append(np.float64)

    df = pd.DataFrame(columns=names)
    for i, rows in enumerate(generator()):
        df.loc[i] = rows

    for name, dtype in zip(names, dtypes):
        df[name] = df[name].astype(dtype)

    conn.close()
    return df


def write_with_generator(datasource, table, gen):
    """Write data into a table; the written data comes
    from the input generator.

    Args:
        datasource: string
            The connection string to connect DBMS.
        table: string
            The table name to write into.
        gen: Generator
            The generator that generates the data to insert into table.
    """
    conn = connect_with_data_source(datasource)
    _drop_table_if_exists(conn, table)
    _create_table(conn, table)
    idx = 0

    with buffered_db_writer(conn, table, ["id", "block"]) as w:
        for d in gen():
            block = base64.b64encode(d)
            row = [idx, block]
            w.write(row)
            idx += 1

    conn.close()


def save_solved_result_in_db(solved_result, data_frame, variables,
                             result_value_name, datasource, result_table):
    column_names = []
    for col in data_frame.columns:
        found = False
        for var in variables:
            if var.lower() == col.lower():
                found = True
                break

        if found:
            column_names.append(col)

    data_frame = data_frame[[*column_names]]

    if len(variables) == 1 and \
            variables[0].lower() == result_value_name.lower():
        result_value_name += "_value"

    column_names.append(result_value_name)
    data_frame[result_value_name] = solved_result

    conn = db.connect_with_data_source(datasource)
    with db.buffered_db_writer(conn.driver, conn, result_table,
                               column_names) as w:
        for i in six.moves.range(len(data_frame)):
            rows = list(data_frame.loc[i])
            w.write(rows)

    print('Solved result is:')
    print(data_frame)
    print('Saved in {}.'.format(result_table))


def submit_local_evaluate(datasource,
                          original_sql,
                          select,
                          label_name,
                          model,
                          model_params,
                          result_table,
                          user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        evaluate_func = xgboost_evaluate
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        evaluate_func = tf_evaluate
        validation_metrics = model_params.get("validation.metrics",
                                              "Accuracy")

    conn = db.connect_with_data_source(datasource)
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)
    conn.close()

    evaluate_func(datasource=datasource,
                  select=select,
                  result_table=result_table,
                  model=model,
                  label_name=label_name,
                  model_params=model_params,
                  result_column_names=result_column_names)


def xgb_native_explain(booster, datasource, result_table):
    if not result_table:
        raise ValueError("XGBoostExplainer must be used with INTO to output "
                         "the result to a table.")

    gain_map = booster.get_score(importance_type="gain")
    fscore_map = booster.get_fscore()
    conn = db.connect_with_data_source(datasource)

    all_feature_keys = list(gain_map.keys())
    all_feature_keys.sort()
    columns = ["feature", "fscore", "gain"]
    dtypes = [
        DataType.to_db_field_type(conn.driver, DataType.STRING),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
    ]
    _create_table(conn, result_table, columns, dtypes)

    with db.buffered_db_writer(conn, result_table, columns) as w:
        for fkey in all_feature_keys:
            row = [fkey, fscore_map[fkey], gain_map[fkey]]
            w.write(list(row))

    conn.close()


def submit_local_explain(datasource,
                         original_sql,
                         select,
                         model,
                         model_params,
                         result_table,
                         explainer="TreeExplainer",
                         user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        explain_func = xgboost_explain
    else:
        explain_func = tf_explain

    if result_table:
        feature_columns = model.get_meta("features")
        estimator_string = model.get_meta("class_name")
        field_descs = get_ordered_field_descs(feature_columns)
        feature_column_names = [fd.name for fd in field_descs]
        with db.connect_with_data_source(datasource) as conn:
            create_explain_table(conn, model.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)

    explain_func(datasource=datasource,
                 select=select,
                 explainer=explainer,
                 model_params=model_params,
                 result_table=result_table,
                 model=model)
    if not result_table:
        print_image_as_base64_html("summary.png")


def test_hive(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "hive":
        host = "127.0.0.1"
        port = "10000"
        conn = connect(driver,
                       "iris",
                       user="******",
                       password="******",
                       host=host,
                       port=port)
        self._do_test(driver,
                      conn,
                      hdfs_namenode_addr="127.0.0.1:8020",
                      hive_location="/sqlflow")
        conn.close()

        conn = connect_with_data_source(
            "hive://*****:*****@127.0.0.1:10000/iris")
        self._do_test(driver, conn)
        self._do_test_hive_specified_db(driver,
                                        conn,
                                        hdfs_namenode_addr="127.0.0.1:8020",
                                        hive_location="/sqlflow")
        conn.close()


def read_with_generator(datasource, table):
    """Read data from a table; this function returns a generator
    to yield the data.

    Args:
        datasource: string
            The connection string to connect DBMS.
        table: string
            The table name to read from.

    Returns: Generator
        The generator yields the row data of the table.
    """
    conn = connect_with_data_source(datasource)
    sql = "SELECT id, block FROM {0} ORDER BY id".format(table)
    cursor = conn.cursor()
    cursor.execute(sql)
    fetch_size = 100

    def reader():
        while True:
            rows = cursor.fetchmany(size=fetch_size)
            if not rows:
                break
            for r in rows:
                yield base64.b64decode(r[1])

        conn.close()

    return reader


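# A minimal round-trip usage sketch for the write_with_generator /
# read_with_generator pair above. This is not part of the original sources:
# the datasource URI and the table name "sqlfs.demo" are illustrative only,
# and the helper name _sqlfs_roundtrip_example is made up.
def _sqlfs_roundtrip_example(
        datasource="mysql://user:pass@tcp(localhost:3306)/",
        table="sqlfs.demo"):
    def gen():
        # yield a few small binary blocks to store
        for payload in (b"alpha", b"beta", b"gamma"):
            yield payload

    # write the blocks as base64-encoded rows (id, block) ...
    write_with_generator(datasource, table, gen)
    # ... then stream them back in id order and reassemble the bytes
    reader = read_with_generator(datasource, table)
    return b"".join(reader())  # expected: b"alphabetagamma"

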
def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
                                      result_table, label_column):
    """Get a command to submit a PAI RandomForest explain task

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: name of the result table, PAI will automatically
            create this table
        label_column: name of the label column

    Returns:
        A string which is a PAI cmd
    """
    # NOTE(typhoonzero): for PAI random forests predicting, we can not load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; an error will
    # be reported by the PAI job if the columns do not match.
    if not label_column:
        return ("must specify WITH label_column when using "
                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    schema = db.get_table_schema(conn, data_table)
    columns = [f[0] for f in schema]
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    return (
        """pai -name feature_importance -project algo_public """
        """-DmodelName="%s" -DinputTableName="%s" -DoutputTableName="%s" """
        """-DlabelColName="%s" -DfeatureColNames="%s" """
    ) % (model_name, data_table, result_table, label_column,
         ",".join(columns))


def get_explain_random_forests_cmd(datasource, model_name, data_table,
                                   result_table, label_column):
    """Get PAI random forest explanation command

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: result table name
        label_column: name of the label column

    Returns:
        a PAI cmd to explain the data using the given model
    """
    # NOTE(typhoonzero): for PAI random forests predicting, we can not load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; an error will
    # be reported by the PAI job if the columns do not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    # drop result table if exists
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    schema = db.get_table_schema(conn, data_table)
    fields = [f[0] for f in schema if f[0] != label_column]
    return ('''pai -name feature_importance -project algo_public '''
            '''-DmodelName="%s" -DinputTableName="%s" '''
            '''-DoutputTableName="%s" -DlabelColName="%s" '''
            '''-DfeatureColNames="%s" ''') % (model_name, data_table,
                                              result_table, label_column,
                                              ",".join(fields))


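# Illustrative only (values assumed, not taken from the sources): with
# model_name="my_rf_model", data_table="db.input", result_table="db.result"
# and label_column="class", the command returned by
# get_explain_random_forests_cmd is shaped roughly like:
#
#   pai -name feature_importance -project algo_public
#       -DmodelName="my_rf_model" -DinputTableName="db.input"
#       -DoutputTableName="db.result" -DlabelColName="class"
#       -DfeatureColNames="sepal_length,sepal_width,petal_length,petal_width"
#
# where the feature column list is every column of data_table except the
# label column, as computed from the table schema above.

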
def test_field_type(self):
    self.assertGreater(len(MYSQL_FIELD_TYPE_DICT), 0)

    conn = connect_with_data_source(testing.get_datasource())
    table_name = "iris.test_mysql_field_type_table"
    drop_table_sql = "DROP TABLE IF EXISTS %s" % table_name
    create_table_sql = "CREATE TABLE IF NOT EXISTS " + \
        table_name + "(a %s)"
    select_sql = "SELECT * FROM %s" % table_name

    for int_type, str_type in MYSQL_FIELD_TYPE_DICT.items():
        if str_type in ["VARCHAR", "CHAR"]:
            str_type += "(255)"

        conn.execute(drop_table_sql)
        conn.execute(create_table_sql % str_type)
        # we are meant to use the low-level cursor here to check the
        # type value against the real value returned by MySQL
        cursor = conn.cursor()
        cursor.execute(select_sql)
        int_type_actual = cursor.description[0][1]
        cursor.close()
        conn.execute(drop_table_sql)

        self.assertEqual(int_type_actual, int_type,
                         "%s not match" % str_type)


def _create_result_table(datasource, select, variables, result_value_name,
                         variable_type, result_table):
    if variable_type.endswith('Integers') or variable_type == "Binary":
        result_type = DataType.INT64
    elif variable_type.endswith('Reals'):
        result_type = DataType.FLOAT32
    else:
        raise ValueError("unsupported variable type %s" % variable_type)

    conn = db.connect_with_data_source(datasource)
    name_and_types = dict(db.selected_columns_and_types(conn, select))
    columns = []
    for var in variables:
        field_type = db.to_db_field_type(conn.driver, name_and_types.get(var))
        columns.append("%s %s" % (var, field_type))

    if len(variables) == 1 and \
            variables[0].lower() == result_value_name.lower():
        result_value_name += "_value"

    columns.append("%s %s" % (result_value_name,
                              DataType.to_db_field_type(
                                  conn.driver, result_type)))
    column_str = ",".join(columns)

    conn.execute("DROP TABLE IF EXISTS %s" % result_table)
    create_sql = "CREATE TABLE %s (%s)" % (result_table, column_str)
    conn.execute(create_sql)
    conn.close()


def run_optimize(datasource, select, variables, result_value_name,
                 variable_type, objective, direction, constraints, solver,
                 result_table, submitter, user_id):
    if submitter == "local":
        _create_result_table(datasource, select, variables,
                             result_value_name, variable_type, result_table)
        return run_optimize_locally(datasource=datasource,
                                    select=select,
                                    variables=variables,
                                    variable_type=variable_type,
                                    result_value_name=result_value_name,
                                    objective=objective,
                                    direction=direction,
                                    constraints=constraints,
                                    solver=solver,
                                    result_table=result_table)
    else:
        with create_tmp_tables_guard(select, datasource) as train_table:
            with db.connect_with_data_source(datasource) as conn:
                schema = conn.get_table_schema(train_table)
                columns = [s[0] for s in schema]
                conn.execute("DROP TABLE IF EXISTS %s;" % result_table)

            return run_optimize_on_optflow(train_table=train_table,
                                           columns=columns,
                                           variables=variables,
                                           variable_type=variable_type,
                                           result_value_name=result_value_name,
                                           objective=objective,
                                           direction=direction,
                                           constraints=constraints,
                                           solver=solver,
                                           result_table=result_table,
                                           user_id=user_id)


def shap_explain(booster, datasource, dataset, summary_params, result_table):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension here,
        # should find out when to use the other two. When shap_values is
        # not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
        % img
    print(img)


def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            if feature_metas[name]["delimiter_kv"]:
                # extract two features from generator data.
                feature_types.append(
                    (get_dtype(feature_metas[name]["dtype"]),
                     get_dtype(feature_metas[name]["dtype_weight"]),
                     tf.int64))
                shapes.append((None, None, None))
            else:
                feature_types.append((tf.int64, tf.int32, tf.int64))
                shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn, select, label_meta)
        selected_cols = db.selected_cols(conn, select)
        gen = tf_generator(gen, selected_cols, feature_column_names,
                           feature_metas)

    # Clustering models do not have a label.
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)


def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             pai_table=""):
    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    is_pai = True if pai_table else False
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=is_pai,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    if is_estimator:
        with open("exported_path", "r") as fid:
            exported_path = str(fid.read())

        model_params["warm_start_from"] = exported_path
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             conn)
        conn.close()


def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         train_label_meta,
         pred_label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("Start predicting XGBoost model...")

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols,
                                 train_label_name, pred_label_name,
                                 feature_column_names, feature_metas, is_pai,
                                 conn, result_table, hdfs_namenode_addr,
                                 hive_location, hdfs_user, hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table: %s" % result_table)


def shap_explain(datasource,
                 select,
                 feature_field_meta,
                 feature_column_names,
                 label_meta,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 pai_explain_table="",
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None,
                 transform_fn=None,
                 feature_column_code=""):
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension here,
        # should find out when to use the other two. When shap_values is
        # not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        write_shap_values(to_write, conn, result_table, feature_column_names)

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), oss_dest, oss_ak, oss_sk,
            oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), oss_dest,
            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)


def pred_imp(datasource,
             select,
             feature_metas,
             feature_column_names,
             train_label_meta,
             pred_label_meta,
             result_table,
             is_pai=False,
             pai_table="",
             model_params=None,
             train_params=None,
             transform_fn=None,
             feature_column_code="",
             rank=0,
             nworkers=1):
    print("rank={} nworkers={}".format(rank, nworkers))
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        rank=rank,
        nworkers=nworkers,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("{} Start predicting XGBoost model...".format(datetime.now()))

    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols,
                                 train_label_name, pred_label_name,
                                 feature_column_names, feature_metas, is_pai,
                                 conn, result_table, rank)
        feature_file_id += 1
    print("{} Done predicting. Predict table: {}".format(
        datetime.now(), result_table))


def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass=""):
    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=False,
                                  pai_table="",
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    if is_estimator:
        model_params["model_dir"] = save
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    # write result metrics to a table
    conn = connect_with_data_source(datasource)
    driver = conn.driver
    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics,
                             metric_name_list,
                             result_table,
                             driver,
                             conn,
                             hdfs_namenode_addr=hdfs_namenode_addr,
                             hive_location=hive_location,
                             hdfs_user=hdfs_user,
                             hdfs_pass=hdfs_pass)


def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            explainer="TreeExplainer",
            result_table="",
            is_pai=False,
            pai_explain_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    if explainer == "XGBoostExplainer":
        if result_table == "":
            raise ValueError("XGBoostExplainer must be used with INTO to "
                             "output the result to a table.")
        bst = xgb.Booster()
        bst.load_model("my_model")
        gain_map = bst.get_score(importance_type="gain")
        fscore_map = bst.get_fscore()
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)

        all_feature_keys = list(gain_map.keys())
        all_feature_keys.sort()
        with db.buffered_db_writer(conn, result_table,
                                   ["feature", "fscore", "gain"], 100) as w:
            for fkey in all_feature_keys:
                row = [fkey, fscore_map[fkey], gain_map[fkey]]
                w.write(list(row))
    else:
        # when explainer is "" or "TreeExplainer", use SHAP by default.
        shap_explain(datasource,
                     select,
                     feature_field_meta,
                     feature_column_names,
                     label_meta,
                     summary_params,
                     result_table=result_table,
                     is_pai=is_pai,
                     pai_explain_table=pai_explain_table,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name,
                     transform_fn=transform_fn,
                     feature_column_code=feature_column_code)


def evaluate_step(datasource, select, result_table, model, label_name,
                  model_params, pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)


def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         pred_params={},
         save="",
         batch_size=1,
         pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table: %s" % result_table)


def drop_tables(tables, datasource):
    """Drop the given tables in datasource"""
    conn = db.connect_with_data_source(datasource)
    try:
        for table in tables:
            if table != "":
                drop_sql = "DROP TABLE IF EXISTS %s" % table
                conn.execute(drop_sql)
    except:  # noqa: E722
        # odps will clear the table itself, so even if we fail here
        # we do not need to raise an error
        print("Encountered an error while dropping tmp table")


def create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column):
    """Create explain result table from the given datasource

    Args:
        datasource: current datasource
        data_table: input data table name
        result_table: table name to store the result
        model_type: type of the model to use
        estimator: estimator class if the model is a TensorFlow estimator
        label_column: column name of the prediction label
    """
    conn = db.connect_with_data_source(datasource)
    drop_stmt = "DROP TABLE IF EXISTS %s" % result_table
    conn.execute(drop_stmt)

    create_stmt = ""
    if model_type == EstimatorType.PAIML:
        return
    elif model_type == EstimatorType.TENSORFLOW:
        if estimator.startswith("BoostedTrees"):
            column_def = ""
            if conn.driver == "mysql":
                column_def = "(feature VARCHAR(255), dfc FLOAT, gain FLOAT)"
            else:
                # Hive & MaxCompute
                column_def = "(feature STRING, dfc STRING, gain STRING)"
            create_stmt = "CREATE TABLE IF NOT EXISTS %s %s;" % (result_table,
                                                                 column_def)
        else:
            if not label_column:
                raise SQLFlowDiagnostic(
                    "need to specify WITH label_col=label_col_name "
                    "when explaining deep models")
            create_stmt = get_create_shap_result_sql(conn, data_table,
                                                     result_table,
                                                     label_column)
    elif model_type == EstimatorType.XGBOOST:
        if not label_column:
            raise SQLFlowDiagnostic(
                "need to specify WITH label_col=label_col_name "
                "when explaining xgboost models")
        create_stmt = get_create_shap_result_sql(conn, data_table,
                                                 result_table, label_column)
    else:
        raise SQLFlowDiagnostic(
            "not supported modelType %d for creating Explain result table" %
            model_type)

    if not conn.execute(create_stmt):
        raise SQLFlowDiagnostic("Can't create explain result table")


def test_mysql(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        self._do_test(driver, conn)

        conn = connect_with_data_source(testing_mysql_db_url())
        self._do_test(driver, conn)


def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    model_params['model_dir'] = save
    model_params.update(feature_columns)

    def _input_fn():
        dataset = input_fn(select, datasource, feature_column_names,
                           feature_metas, label_meta)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    conn = connect_with_data_source(datasource)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn.driver,
                              conn, hdfs_namenode_addr, hive_location,
                              hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn.driver, conn,
                     hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass,
                     oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)


def read_with_generator_and_metadata(datasource,
                                     table,
                                     buff_size=256,
                                     meta_only=False):
    """Read data from a table; this function returns a generator to yield
    the data, and the metadata dict.

    Args:
        datasource: string
            The connection string to connect DBMS.
        table: string
            The table name to read from.
        buff_size: int
            The buffer size to read data.
        meta_only: bool
            Only read the metadata.

    Returns: tuple(Generator, dict)
        The generator yields the row data of the table,
        and the model metadata dict.
    """
    conn = connect_with_data_source(datasource)
    r = SQLFSReader(conn, table, not meta_only)
    metadata = _read_metadata(r)

    if meta_only:
        r.close()
        return None, metadata

    def reader():
        try:
            while True:
                buffer = r.read(buff_size)
                if not buffer:
                    break
                yield buffer
        finally:
            reader.close()

    def close():
        if not reader.is_closed:
            r.close()
            conn.close()
            reader.is_closed = True

    reader.is_closed = False
    reader.close = close

    return reader, metadata
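

# A minimal sketch (not part of the original sources) showing how the
# read_with_generator_and_metadata API above is consumed; the helper name
# _read_model_blob_example and its arguments are illustrative only. It reads
# the metadata dict, streams the stored buffers, and closes the reader
# explicitly (the close() helper above guards against double-closing).
def _read_model_blob_example(datasource, table):
    reader, metadata = read_with_generator_and_metadata(datasource, table)
    try:
        blob = b"".join(buffer for buffer in reader())
    finally:
        reader.close()
    return metadata, blob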