def _do_test(self, driver, conn, hdfs_namenode_addr="", hive_location=""):
    table_name = "test_db"
    table_schema = ["label", "features"]
    values = [(1, '5,6,1,2')] * 10

    execute(driver, conn, self.drop_statement)
    if driver == "hive":
        execute(driver, conn, self.hive_create_statement)
    else:
        execute(driver, conn, self.create_statement)

    with buffered_db_writer(driver,
                            conn,
                            table_name,
                            table_schema,
                            buff_size=10,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location) as w:
        for row in values:
            w.write(row)

    field_names, data = execute(driver, conn, self.select_statement)

    expect_features = ['5,6,1,2'] * 10
    expect_labels = [1] * 10

    self.assertEqual(field_names, ['features', 'label'])
    self.assertEqual(expect_features, data[0])
    self.assertEqual(expect_labels, data[1])
def save_solved_result_in_db(solved_result, data_frame, variables,
                             result_value_name, datasource, result_table):
    column_names = []
    for col in data_frame.columns:
        found = False
        for var in variables:
            if var.lower() == col.lower():
                found = True
                break

        if found:
            column_names.append(col)

    data_frame = data_frame[[*column_names]]

    if len(variables) == 1 and variables[0].lower() == result_value_name.lower():
        result_value_name += "_value"

    column_names.append(result_value_name)
    data_frame[result_value_name] = solved_result

    conn = db.connect_with_data_source(datasource)
    with db.buffered_db_writer(conn.driver, conn, result_table,
                               column_names) as w:
        for i in six.moves.range(len(data_frame)):
            rows = list(data_frame.loc[i])
            w.write(rows)

    print('Solved result is:')
    print(data_frame)
    print('Saved in {}.'.format(result_table))
def write_dfc_result(dfc_mean, gain, result_table, driver, conn,
                     feature_column_names, hdfs_namenode_addr, hive_location,
                     hdfs_user, hdfs_pass):
    with buffered_db_writer(conn, result_table, ["feature", "dfc", "gain"],
                            100) as w:
        for row_name in feature_column_names:
            w.write([row_name, dfc_mean.loc[row_name], gain[row_name]])
def _do_test_hive_specified_db(self,
                               driver,
                               conn,
                               hdfs_namenode_addr="",
                               hive_location=""):
    create_db = '''create database if not exists test_db'''
    create_tbl = '''create table test_db.tbl (features string, label int) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
    drop_tbl = '''drop table if exists test_db.tbl'''
    select_tbl = '''select * from test_db.tbl'''
    table_schema = ["label", "features"]
    values = [(1, '5,6,1,2')] * 10

    execute(driver, conn, create_db)
    execute(driver, conn, drop_tbl)
    execute(driver, conn, create_tbl)

    with buffered_db_writer(driver,
                            conn,
                            "test_db.tbl",
                            table_schema,
                            buff_size=10,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location) as w:
        for row in values:
            w.write(row)

    field_names, data = execute(driver, conn, select_tbl)

    expect_features = ['5,6,1,2'] * 10
    expect_labels = [1] * 10

    self.assertEqual(field_names, ['features', 'label'])
    self.assertEqual(expect_features, data[0])
    self.assertEqual(expect_labels, data[1])
def _do_test_hive_specified_db(self, conn):
    create_db = '''create database if not exists test_db'''
    create_tbl = '''create table test_db.tbl (features string, label int) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
    drop_tbl = '''drop table if exists test_db.tbl'''
    select_tbl = '''select * from test_db.tbl'''
    table_schema = ["label", "features"]
    values = [(1, '5,6,1,2')] * 10

    self.assertTrue(conn.execute(create_db))
    self.assertTrue(conn.execute(drop_tbl))
    self.assertTrue(conn.execute(create_tbl))

    with buffered_db_writer(conn, "test_db.tbl", table_schema,
                            buff_size=10) as w:
        for row in values:
            w.write(row)

    field_names, data = execute(conn, select_tbl)

    expect_result = [('5,6,1,2', 1)] * 10
    self.assertEqual(field_names, ['features', 'label'])
    self.assertEqual(expect_result, data)
def shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)

    if result_table:
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): the shap_values may be a
        # list of shape [3, num_samples, num_features],
        # use the first dimension here, should find out
        # when to use the other two. When shap_values is
        # not a list it can be directly used.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    explainer.plot_and_save(plot_func,
                            oss_dest=oss_dest,
                            oss_ak=oss_ak,
                            oss_sk=oss_sk,
                            oss_endpoint=oss_endpoint,
                            oss_bucket_name=oss_bucket_name,
                            filename='summary')
def _store_predict_result(preds, result_table, result_column_names,
                          train_label_idx, feature_file_name, conn):
    """
    Save the prediction result in the table.

    Args:
        preds: the prediction result to save.
        result_table (str): the result table name.
        result_column_names (list[str]): the result column names.
        train_label_idx (int): the index of the trained label inside
            result_column_names.
        feature_file_name (str): the file path where the features are dumped.
        conn: the database connection object.

    Returns:
        None.
    """
    with db.buffered_db_writer(conn, result_table, result_column_names) as w:
        with open(feature_file_name, "r") as feature_file_read:
            line_no = 0
            for line in feature_file_read.readlines():
                if not line:
                    break

                row = [
                    item for i, item in enumerate(
                        line.strip().split(DMATRIX_FILE_SEP))
                    if i != train_label_idx
                ]
                row.append(str(preds[line_no]))
                w.write(row)
                line_no += 1
def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                             selected_cols, train_label_name, pred_label_name,
                             feature_column_names, feature_metas, is_pai,
                             conn, result_table, hdfs_namenode_addr,
                             hive_location, hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)

    # TODO(yancey1989): should save train_params and model_params
    # not only on PAI submitter
    # TODO(yancey1989): output the original result for various
    # objective function.
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # using the original prediction result of predict API by default
            pass
    else:
        # prediction output with multi-class job has two dimensions, this
        # is a temporary way, can remove this else branch when we can load
        # the model meta not only on PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt.raw", "r")
    else:
        feature_file_read = open(
            "predict.raw.dir/predict.txt_%d" % feature_file_id, "r")

    result_column_names = selected_cols[:]
    # remove train_label_name from result column, if train_label_name == ""
    # or the train_label_name is not selected, the index should be -1
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del result_column_names[train_label_index]
    result_column_names.append(pred_label_name)

    line_no = 0
    with db.buffered_db_writer(conn, result_table, result_column_names,
                               100) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            # FIXME(typhoonzero): how to output columns that are not used
            # as features, like ids?
            row = [
                item for i, item in enumerate(line.strip().split("/"))
                if i != train_label_index
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
def write_with_generator(datasource, table, gen):
    """Write data into a table; the written data comes from the input
    generator.

    Args:
        datasource: string
            The connection string to connect to the DBMS.
        table: string
            The name of the table to write to.
        gen: Generator
            The generator that generates the data to insert into the table.
    """
    conn = connect_with_data_source(datasource)
    _drop_table_if_exists(conn, table)
    _create_table(conn, table)
    idx = 0

    with buffered_db_writer(conn, table, ["id", "block"]) as w:
        for d in gen():
            block = base64.b64encode(d)
            row = [idx, block]
            w.write(row)
            idx += 1

    conn.close()
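# A minimal usage sketch for write_with_generator above. It assumes the same
# module-level helpers (connect_with_data_source, _drop_table_if_exists,
# _create_table, buffered_db_writer) are importable; the datasource URI and
# table name are hypothetical placeholders, not values from the original code.
def _example_write_with_generator():
    def block_gen():
        # yield bytes-like chunks; write_with_generator base64-encodes each
        # chunk before inserting it as an (id, block) row
        for chunk in (b"part-0-bytes", b"part-1-bytes"):
            yield chunk

    write_with_generator("mysql://root:root@tcp(127.0.0.1:3306)/sqlflow",
                         "sqlflow_models.example_blocks", block_gen)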
def xgb_native_explain(booster, datasource, result_table):
    if not result_table:
        raise ValueError(
            "XGBoostExplainer must use with INTO to output result to a table.")

    gain_map = booster.get_score(importance_type="gain")
    fscore_map = booster.get_fscore()
    conn = db.connect_with_data_source(datasource)

    all_feature_keys = list(gain_map.keys())
    all_feature_keys.sort()
    columns = ["feature", "fscore", "gain"]
    dtypes = [
        DataType.to_db_field_type(conn.driver, DataType.STRING),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
    ]
    _create_table(conn, result_table, columns, dtypes)

    with db.buffered_db_writer(conn, result_table, columns) as w:
        for fkey in all_feature_keys:
            row = [fkey, fscore_map[fkey], gain_map[fkey]]
            w.write(list(row))

    conn.close()
def write_shap_values(shap_values, driver, conn, result_table,
                      feature_column_names, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    with db.buffered_db_writer(driver, conn, result_table,
                               feature_column_names, 100, hdfs_namenode_addr,
                               hive_location, hdfs_user, hdfs_pass) as w:
        for row in shap_values:
            w.write(list(row))
def __init__(self, conn, table):
    _drop_table_if_exists(conn, table)
    _create_table(conn, table)
    self.context_manager = buffered_db_writer(conn, table, ["id", "block"])
    self.writer = self.context_manager.__enter__()
    self.row_idx = 0
    self.buffer = b''
def write_shap_values(shap_values, conn, result_table, feature_column_names):
    with db.buffered_db_writer(conn, result_table, feature_column_names,
                               100) as w:
        for row in shap_values:
            # NOTE(typhoonzero): assume all shap explain values are float, and
            # there's no INT or other types of values yet.
            row_float = [float(c) for c in row]
            w.write(list(row_float))
def shap_explain(booster, datasource, dataset, summary_params, result_table):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): the shap_values may be a
        # list of shape [3, num_samples, num_features],
        # use the first dimension here, should find out
        # when to use the other two. When shap_values is
        # not a list it can be directly used.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
        % img
    print(img)
def write_result_metrics(result_metrics, metric_name_list, result_table,
                         conn):
    # NOTE: assume that the result table is already created with columns:
    # loss | metric_names ...
    column_names = metric_name_list
    with buffered_db_writer(conn, result_table, column_names, 100) as w:
        row = []
        for key in metric_name_list:
            row.append(result_metrics[key])
        w.write(row)
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            explainer="TreeExplainer",
            result_table="",
            is_pai=False,
            pai_explain_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    if explainer == "XGBoostExplainer":
        if result_table == "":
            raise ValueError(
                """XGBoostExplainer must use with INTO to output result to a table.""")
        bst = xgb.Booster()
        bst.load_model("my_model")
        gain_map = bst.get_score(importance_type="gain")
        fscore_map = bst.get_fscore()
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)

        all_feature_keys = list(gain_map.keys())
        all_feature_keys.sort()
        with db.buffered_db_writer(conn, result_table,
                                   ["feature", "fscore", "gain"], 100) as w:
            for fkey in all_feature_keys:
                row = [fkey, fscore_map[fkey], gain_map[fkey]]
                w.write(list(row))
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(datasource,
                     select,
                     feature_field_meta,
                     feature_column_names,
                     label_meta,
                     summary_params,
                     result_table=result_table,
                     is_pai=is_pai,
                     pai_explain_table=pai_explain_table,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name,
                     transform_fn=transform_fn,
                     feature_column_code=feature_column_code)
def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
                              model_params, feature_column_names, label_meta,
                              is_pai, conn, result_table):
    preds = bst.predict(dpred)
    if model_params:
        obj = model_params["objective"]
        # binary:hinge output class labels
        if obj.startswith("binary:logistic"):
            preds = (preds > 0.5).astype(int)
        # multi:softmax output class labels
        elif obj.startswith("multi:softprob"):
            preds = np.argmax(np.array(preds), axis=1)
        # TODO(typhoonzero): deal with binary:logitraw when needed.
    else:
        # prediction output with multi-class job has two dimensions, this
        # is a temporary way, can remove this else branch when we can load
        # the model meta not only on PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    y_test_list = []
    for line in feature_file_read:
        row = [i for i in line.strip().split(DMATRIX_FILE_SEP)]
        # DMatrix store label in the first column
        if label_meta["dtype"] == "float32" or label_meta[
                "dtype"] == DataType.FLOAT32:
            label = float(row[0])
        elif label_meta["dtype"] == "int64" or label_meta[
                "dtype"] == "int32" or label_meta["dtype"] == DataType.INT64:
            label = int(row[0])
        else:
            raise ValueError("unsupported label dtype: %s" %
                             label_meta["dtype"])
        y_test_list.append(label)
    y_test = np.array(y_test_list)

    evaluate_results = dict()
    for metric_name in validation_metrics:
        if metric_name not in SKLEARN_METRICS:
            raise ValueError("unsupported metric: %s" % metric_name)
        metric_func = getattr(sklearn.metrics, metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write evaluation result to result table
    result_columns = ["loss"] + validation_metrics
    with db.buffered_db_writer(conn, result_table, result_columns, 100) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
                              model_params, feature_column_names, label_meta,
                              is_pai, conn, result_table, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)
    # FIXME(typhoonzero): copied from predict.py
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # using the original prediction result of predict API by default
            pass
    else:
        # prediction output with multi-class job has two dimensions, this
        # is a temporary way, can remove this else branch when we can load
        # the model meta not only on PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    y_test_list = []
    for line in feature_file_read:
        row = [i for i in line.strip().split("\t")]
        # DMatrix store label in the first column
        if label_meta["dtype"] == "float32":
            label = float(row[0])
        elif label_meta["dtype"] == "int64" or label_meta["dtype"] == "int32":
            label = int(row[0])
        else:
            raise ValueError("unsupported label dtype: %s" %
                             label_meta["dtype"])
        y_test_list.append(label)
    y_test = np.array(y_test_list)

    evaluate_results = dict()
    for metric_name in validation_metrics:
        if metric_name not in SKLEARN_METRICS:
            raise ValueError("unsupported metric: %s" % metric_name)
        metric_func = getattr(sklearn.metrics, metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write evaluation result to result table
    result_columns = ["loss"] + validation_metrics
    with db.buffered_db_writer(conn, result_table, result_columns, 100) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
def write_result_metrics(result_metrics, metric_name_list, result_table,
                         driver, conn, hdfs_namenode_addr, hive_location,
                         hdfs_user, hdfs_pass):
    # NOTE: assume that the result table is already created with columns:
    # loss | metric_names ...
    column_names = metric_name_list
    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        row = []
        for key in metric_name_list:
            row.append(result_metrics[key])
        w.write(row)
def save_solved_result_in_db(solved_result, data_frame, variables,
                             result_value_name, datasource, result_table):
    """
    Save the solved result of the Pyomo model into the database.

    Args:
        solved_result (tuple(numpy.ndarray, float)): a numpy array which
            holds the solved x, and a float value which holds the objective
            function value.
        data_frame (pandas.DataFrame): the input table data.
        variables (list[str]): the variable names to be optimized.
        result_value_name (str): the result value name to be optimized.
        datasource (str): the database connection URI.
        result_table (str): the table name to save the solved results.

    Returns:
        None
    """
    column_names = []
    for col in data_frame.columns:
        found = False
        for var in variables:
            if var.lower() == col.lower():
                found = True
                break

        if found:
            column_names.append(col)

    data_frame = data_frame[[*column_names]]

    result_value_name = generate_unique_result_value_name(
        columns=data_frame.columns,
        result_value_name=result_value_name,
        variables=variables)
    column_names.append(result_value_name)
    data_frame[result_value_name] = solved_result[0]

    conn = db.connect_with_data_source(datasource)
    with db.buffered_db_writer(conn, result_table, column_names) as w:
        for i in six.moves.range(len(data_frame)):
            rows = list(data_frame.loc[i])
            w.write(rows)

    print('Solved result is:')
    print(data_frame)
    print('Saved in {}.'.format(result_table))
    print('Objective value is {}'.format(solved_result[1]))
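# A minimal call sketch for save_solved_result_in_db above, using a tiny
# hypothetical optimization result: the DataFrame, variable names, datasource
# URI and result table below are illustrative placeholders, not values taken
# from the original code.
def _example_save_solved_result():
    import numpy as np
    import pandas as pd

    data_frame = pd.DataFrame({
        "product": ["sports", "toys"],
        "price": [10.0, 20.0],
    })
    solved_x = np.array([180, 270])  # solved values, one per input row
    objective_value = 6300.0  # objective function value

    save_solved_result_in_db(
        (solved_x, objective_value),
        data_frame,
        variables=["product"],
        result_value_name="amount",
        datasource="mysql://root:root@tcp(127.0.0.1:3306)/opt",
        result_table="opt.solved_result")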
def _store_evaluate_result(preds, feature_file_name, label_desc, result_table,
                           result_column_names, validation_metrics, conn):
    """
    Save the evaluation result in the table.

    Args:
        preds: the prediction result.
        feature_file_name (str): the file path where the features are dumped.
        label_desc (FieldDesc): the label FieldDesc object.
        result_table (str): the result table name.
        result_column_names (list[str]): the result column names.
        validation_metrics (list[str]): the evaluation metric names.
        conn: the database connection object.

    Returns:
        None.
    """
    y_test = []
    with open(feature_file_name, 'r') as f:
        for line in f.readlines():
            row = [i for i in line.strip().split("\t")]
            # DMatrix store label in the first column
            if label_desc.dtype == DataType.INT64:
                y_test.append(int(row[0]))
            elif label_desc.dtype == DataType.FLOAT32:
                y_test.append(float(row[0]))
            else:
                raise TypeError("unsupported data type {}".format(
                    label_desc.dtype))

    y_test = np.array(y_test)

    evaluate_results = dict()
    for metric_name in validation_metrics:
        metric_name = metric_name.strip()
        if metric_name not in SKLEARN_METRICS:
            raise ValueError("unsupported metrics %s" % metric_name)

        metric_func = getattr(sklearn.metrics, metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write evaluation result to result table
    with db.buffered_db_writer(conn, result_table, result_column_names) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
def _do_test(self, conn):
    table_name = "test_db"
    table_schema = ["features", "label"]
    values = [('5,6,1,2', 1)] * 10

    conn.execute(self.drop_statement)
    if conn.driver == "hive":
        conn.execute(self.hive_create_statement)
    else:
        conn.execute(self.create_statement)

    with buffered_db_writer(conn, table_name, table_schema,
                            buff_size=10) as w:
        for row in values:
            w.write(row)

    field_names, data = execute(conn, self.select_statement)

    self.assertEqual(table_schema, field_names)
    self.assertEqual(values, data)
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  train_label_name, result_col_name, datasource, select,
                  hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):
    classifier = init_model_with_feature_column(estimator, model_params)
    classifier_pkg = sys.modules[estimator.__module__]

    conn = None
    if is_pai:
        driver = "pai_maxcompute"
    else:
        conn = db.connect_with_data_source(datasource)
        driver = conn.driver

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table)
        selected_cols = feature_column_names
    else:
        gen = db.db_generator(conn, select)
        selected_cols = db.selected_cols(conn, select)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(gen, selected_cols, feature_column_names,
                              feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    # features and predict results to insert into result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict one batch to initialize parameters
    # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    train_label_index = selected_cols.index(train_label_name)
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together to see
            # if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
def shap_explain(booster, datasource, select, summary_params, result_table,
                 model):
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label").get_field_desc()[0].to_dict(
        dtype_to_string=True)

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    # NOTE: in the current implementation, we are generating a transform_fn
    # from the COLUMN clause. The transform_fn is executed during the process
    # of dumping the original data into DMatrix SVM file.
    compiled_fc = compile_ir_feature_columns(train_fc_map, model.get_type())
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *compiled_fc["feature_columns"])

    dataset = xgb_shap_dataset(datasource, select, feature_column_names,
                               label_meta, feature_metas, transform_fn)

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): the shap_values may be a
        # list of shape [3, num_samples, num_features],
        # use the first dimension here, should find out
        # when to use the other two. When shap_values is
        # not a list it can be directly used.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
        % img
    print(img)
def estimator_predict(result_table, feature_column_names, feature_metas,
                      train_label_name, result_col_name, conn,
                      predict_generator, selected_cols):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["delimiter_kv"] != "":
                keys = x[0][i][0].flatten()
                weights = x[0][i][1].flatten()
                weight_dtype_str = feature_metas[feature_name]["dtype_weight"]
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    raise ValueError(
                        "not supported key-value feature with key type float")
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(keys))
                elif (dtype_str == "string" or dtype_str == DataType.STRING):
                    example.features.feature[
                        feature_name].bytes_list.value.extend(list(keys))
                if (weight_dtype_str == "float32"
                        or weight_dtype_str == "float64"
                        or weight_dtype_str == DataType.FLOAT32):
                    example.features.feature["_".join(
                        [feature_name,
                         "weight"])].float_list.value.extend(list(weights))
                else:
                    raise ValueError(
                        "not supported key value column weight data type: %s"
                        % weight_dtype_str)
            else:
                # NOTE(typhoonzero): sparse feature will get
                # (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    example.features.feature[
                        feature_name].float_list.value.extend(list(values))
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(values))
        else:
            if (dtype_str == "float32" or dtype_str == "float64"
                    or dtype_str == DataType.FLOAT32):
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif (dtype_str == "int32" or dtype_str == "int64"
                  or dtype_str == DataType.INT64):
                example.features.feature[feature_name].int64_list.value.extend(
                    (int(x[0][i][0]), ))
            elif dtype_str == "string" or dtype_str == DataType.STRING:
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(conn, result_table, write_cols, 100) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row,
                                                 selected_cols,
                                                 feature_column_names,
                                                 feature_metas,
                                                 is_xgboost=False)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, conn, predict_generator, selected_cols,
                  extra_result_cols):
    pop_optimizer_and_loss(model_params)
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    def to_feature_sample(row, selected_cols):
        features = {}
        for name in feature_column_names:
            row_val = row[selected_cols.index(name)]
            if feature_metas[name].get("delimiter_kv", "") != "":
                # kv list that should be parsed to two features.
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(
                        row_val[0], tf.ones_like(tf.reshape(row_val[0], [-1])),
                        row_val[2])
                    features["_".join([name,
                                       "weight"])] = tf.SparseTensor(*row_val)
                else:
                    raise ValueError(
                        "not supported DENSE column with key:value"
                        "list format.")
            else:
                if feature_metas[name]["is_sparse"]:
                    features[name] = tf.SparseTensor(*row_val)
                else:
                    features[name] = tf.constant(([row_val], ))
        return features

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        load_keras_model_weights(classifier, save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)
    column_names.extend(extra_result_cols)

    with db.buffered_db_writer(conn, result_table, column_names, 100) as w:
        for row, _ in predict_generator():
            features = to_feature_sample(row, column_names)
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)

            if extra_result_cols:
                assert isinstance(
                    result, tuple
                ), "TO PREDICT must return a " \
                   "tuple when predict.extra_outputs is not empty"
                assert len(extra_result_cols) + 1 <= len(
                    result
                ), "TO PREDICT must return at least " \
                   "%d items instead of %d" % (len(extra_result_cols) + 1,
                                               len(result))
                extra_pred_outputs = result[1:len(extra_result_cols) + 1]
                result = result[0:1]
            else:
                extra_pred_outputs = None

            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row.append(encode_pred_result(result))
            if extra_pred_outputs is not None:
                row.extend([encode_pred_result(p) for p in extra_pred_outputs])
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            w.write(row)
    del pred_dataset
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, train_label_name,
                  result_col_name, driver, conn, predict_generator,
                  selected_cols, hdfs_namenode_addr, hive_location, hdfs_user,
                  hdfs_pass):
    classifier = init_model_with_feature_column(estimator, model_params)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(predict_generator, selected_cols,
                              feature_column_names, feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    if not hasattr(classifier, 'sqlflow_predict_one'):
        # NOTE: load_weights should be called by keras models only.
        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict one batch to initialize parameters. See:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models # noqa: E501
        classifier.predict_on_batch(one_batch)
        classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except:  # noqa: E722
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            if hasattr(classifier, 'sqlflow_predict_one'):
                result = classifier.sqlflow_predict_one(features)
            else:
                result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result

            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
def write_dfc_result(dfc_mean, gain, result_table, conn,
                     feature_column_names):
    with buffered_db_writer(conn, result_table, ["feature", "dfc", "gain"],
                            100) as w:
        for row_name in feature_column_names:
            w.write([row_name, dfc_mean.loc[row_name], gain[row_name]])
def write_shap_values(shap_values, conn, result_table, feature_column_names):
    with db.buffered_db_writer(conn, result_table, feature_column_names,
                               100) as w:
        for row in shap_values:
            w.write(list(row))
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, driver, conn, predict_generator,
                      selected_cols, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices,values,shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and
                # linear_feature_columns param.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:  # noqa: E722
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:  # noqa: E722
                        pass
                if idx == -1:
                    raise ValueError(
                        "can not find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)