def _do_test_hive_specified_db(self,
                               driver,
                               conn,
                               hdfs_namenode_addr="",
                               hive_location=""):
    create_db = '''create database test_db'''
    create_tbl = '''create table test_db.tbl (features string, label int) ROW FORMAT DELIMITED FIELDS TERMINATED BY "\001"'''
    drop_tbl = '''drop table if exists test_db.tbl'''
    select_tbl = '''select * from test_db.tbl'''
    table_schema = ["label", "features"]
    values = [(1, '5,6,1,2')] * 10
    execute(driver, conn, create_db)
    execute(driver, conn, drop_tbl)
    execute(driver, conn, create_tbl)
    with buffered_db_writer(driver,
                            conn,
                            "test_db.tbl",
                            table_schema,
                            buff_size=10,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location) as w:
        for row in values:
            w.write(row)
    field_names, data = execute(driver, conn, select_tbl)
    expect_features = ['5,6,1,2'] * 10
    expect_labels = [1] * 10
    self.assertEqual(field_names, ['features', 'label'])
    self.assertEqual(expect_features, data[0])
    self.assertEqual(expect_labels, data[1])
def _do_test(self, driver, conn, hdfs_namenode_addr="", hive_location=""):
    table_name = "test_db"
    table_schema = ["label", "features"]
    values = [(1, '5,6,1,2')] * 10
    execute(driver, conn, self.drop_statement)
    if driver == "hive":
        execute(driver, conn, self.hive_create_statement)
    else:
        execute(driver, conn, self.create_statement)
    with buffered_db_writer(driver,
                            conn,
                            table_name,
                            table_schema,
                            buff_size=10,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location) as w:
        for row in values:
            w.write(row)
    field_names, data = execute(driver, conn, self.select_statement)
    expect_features = ['5,6,1,2'] * 10
    expect_labels = [1] * 10
    self.assertEqual(field_names, ['features', 'label'])
    self.assertEqual(expect_features, data[0])
    self.assertEqual(expect_labels, data[1])
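# A minimal usage sketch of the buffered_db_writer pattern exercised by the
# two tests above, outside the unittest harness. The table name and row
# contents are hypothetical; rows are buffered and flushed to the backend in
# batches of buff_size.
def _example_buffered_write(driver, conn):
    with buffered_db_writer(driver, conn, "test_db.tbl",
                            ["label", "features"], buff_size=10) as w:
        for i in range(100):
            w.write((i % 2, "5,6,1,2"))  # one row per call; flushed every 10 rows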
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, label_meta,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    classifier = estimator(**model_params)
    conn = connect_with_data_source(datasource)

    def fast_input_fn(generator):
        feature_types = []
        for name in feature_column_names:
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        def _inner_input_fn():
            if is_pai:
                dataset = pai_maxcompute_input_fn(pai_table, datasource,
                                                  feature_column_names,
                                                  feature_metas, label_meta)
            else:
                dataset = tf.data.Dataset.from_generator(
                    generator,
                    (tuple(feature_types),
                     eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper)
            dataset = dataset.batch(1).cache()
            iterator = dataset.make_one_shot_iterator()
            features = iterator.get_next()
            return features

        return _inner_input_fn

    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    fast_predictor = FastPredict(classifier, fast_input_fn)

    with buffered_db_writer(conn.driver, conn, result_table, column_names,
                            100, hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in db_generator(conn.driver, conn, select,
                                     feature_column_names,
                                     label_meta["feature_name"],
                                     feature_metas)():
            # materialize the prediction once; repeatedly calling
            # list() on the result would exhaust it if it is a generator
            result = list(fast_predictor.predict(features))
            row = []
            for idx, _ in enumerate(feature_column_names):
                val = features[0][idx][0]
                row.append(str(val))
            if "class_ids" in result[0]:
                row.append(str(result[0]["class_ids"][0]))
            else:
                # regression predictions
                row.append(str(result[0]["predictions"][0]))
            w.write(row)
def write_dfc_result(dfc_mean, gain, result_table, conn, feature_column_names,
                     hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):
    with buffered_db_writer(conn.driver, conn, result_table,
                            ["feature", "dfc", "gain"], 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for row_name in feature_column_names:
            w.write([row_name, dfc_mean.loc[row_name], gain[row_name]])
def write_shap_values(shap_values, driver, conn, result_table,
                      feature_column_names, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    with buffered_db_writer(driver, conn, result_table, feature_column_names,
                            100, hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for row in shap_values:
            w.write(list(row))
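# A hedged usage sketch for the two explain writers above: dfc_mean is assumed
# to be a pandas Series indexed by feature name (e.g. mean directional feature
# contributions from a BoostedTrees model) and shap_values a 2-D array of
# shape (num_rows, num_features); the result table names are hypothetical.
def _example_write_explain_results(conn, feature_column_names, dfc_mean, gain,
                                   shap_values):
    write_dfc_result(dfc_mean, gain, "iris.dfc_result", conn,
                     feature_column_names, "", "", "", "")
    write_shap_values(shap_values, conn.driver, conn, "iris.shap_result",
                      feature_column_names, "", "", "", "")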
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table=""):
    # TODO(typhoonzero): support running on PAI without MaxCompute AK/SK connection.
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    label_name = label_meta["feature_name"]
    dpred = xgb_dataset(datasource, 'predict.txt', select, feature_metas,
                        feature_column_names, None, is_pai, pai_table, True)
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    print("Start predicting XGBoost model...")
    preds = bst.predict(dpred)
    # TODO(Yancey1989): use the train parameters to decide between a
    # regression model and a classifier model
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)
    feature_file_read = open("predict.txt", "r")

    # copy so that appending the label does not mutate the caller's list
    result_column_names = feature_column_names[:]
    result_column_names.append(label_name)

    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
        conn = None
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [i.split(":")[1] for i in line.replace("\n", "").split("\t")]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)
def keras_predict(estimator, model_params, save, result_table,
                  feature_column_names, feature_metas, label_meta, datasource,
                  select, hdfs_namenode_addr, hive_location, hdfs_user,
                  hdfs_pass):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]
    conn = connect_with_data_source(datasource)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        gen = db_generator(conn.driver, conn, select, feature_column_names,
                           label_meta["feature_name"], feature_metas)
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    # features and predict results to insert into the result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict on one batch to initialize parameters, see:
    # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch[0])
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    column_names = feature_column_names[:]
    column_names.append(label_meta["feature_name"])
    with buffered_db_writer(conn.driver, conn, result_table, column_names,
                            100, hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features[0])
            result = classifier_pkg.prepare_prediction_column(result[0])
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[0][name].numpy()[0][0]
                row.append(str(val))
            row.append(str(result))
            w.write(row)
    del pred_dataset
def pred(datasource,
         select,
         feature_field_meta,
         label_field_meta,
         result_table,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)
    feature_column_names = [k["name"] for k in feature_field_meta]
    label_name = label_field_meta["name"]
    feature_specs = {k['name']: k for k in feature_field_meta}
    dpred = xgb_dataset(conn, 'predict.txt', select, feature_column_names,
                        label_name, feature_specs)
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    preds = bst.predict(dpred)
    # TODO(Yancey1989): use the train parameters to decide between a
    # regression model and a classifier model
    if len(preds.shape) == 2:
        # classifier result
        preds = np.argmax(np.array(preds), axis=1)
    feature_file_read = open("predict.txt", "r")

    # copy so that appending the label does not mutate feature_column_names
    result_column_names = feature_column_names[:]
    result_column_names.append(label_name)

    line_no = 0
    with buffered_db_writer(conn.driver,
                            conn,
                            result_table,
                            result_column_names,
                            100,
                            hdfs_namenode_addr=hdfs_namenode_addr,
                            hive_location=hive_location,
                            hdfs_user=hdfs_user,
                            hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [
                i.split(":")[1]
                for i in line.replace("\n", "").split("\t")[1:]
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
    print("Done predicting. Predict table : %s" % result_table)
def write_result_metrics(result_metrics, metric_name_list, result_table,
                         driver, conn, hdfs_namenode_addr, hive_location,
                         hdfs_user, hdfs_pass):
    # NOTE: assume that the result table is already created with columns:
    # loss | metric_names ...
    column_names = metric_name_list
    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        row = [result_metrics[key] for key in metric_name_list]
        w.write(row)
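# A hedged usage sketch for write_result_metrics: the metrics dict is keyed
# exactly by metric_name_list, and the result table (the name below is
# hypothetical) must already exist with one column per metric.
def _example_write_metrics(driver, conn):
    metrics = {"accuracy": 0.95, "auc": 0.97}
    write_result_metrics(metrics, ["accuracy", "auc"], "my_eval_result",
                         driver, conn, "", "", "", "")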
def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                             feature_column_names, label_name, is_pai, conn,
                             result_table, hdfs_namenode_addr, hive_location,
                             hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)
    # TODO(yancey1989): should save train_params and model_params not only on PAI submitter
    # TODO(yancey1989): output the original result for various objective functions.
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # use the original prediction result of the predict API by default
            pass
    else:
        # The prediction output of a multi-class job has two dimensions. This
        # is a temporary workaround; this else branch can be removed once we
        # can load the model meta not only on the PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    # copy so that appending the label does not mutate the caller's list
    result_column_names = feature_column_names[:]
    result_column_names.append(label_name)

    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [i.split(":")[1] for i in line.replace("\n", "").split("\t")]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  result_col_name, datasource, select, hdfs_namenode_addr,
                  hive_location, hdfs_user, hdfs_pass):
    classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]
    conn = db.connect_with_data_source(datasource)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))

        if is_pai:
            pai_table_parts = pai_table.split(".")
            formatted_pai_table = "odps://%s/tables/%s" % (
                pai_table_parts[0], pai_table_parts[1])
            gen = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                 feature_column_names, None,
                                                 feature_metas)
        else:
            gen = db.db_generator(conn.driver, conn, select,
                                  feature_column_names, None, feature_metas)
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    # features and predict results to insert into the result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict on one batch to initialize parameters, see:
    # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    column_names = feature_column_names[:]
    column_names.append(result_col_name)
    with db.buffered_db_writer(conn.driver, conn, result_table, column_names,
                               100, hdfs_namenode_addr, hive_location,
                               hdfs_user, hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            result = classifier_pkg.prepare_prediction_column(result[0])
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # the output tensor as a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse features arrive as
                # (indices, values, shape), use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX models have dnn_feature_columns and
                # linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:
                        pass
                if idx == -1:
                    raise ValueError(
                        "can not find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, (tuple, list)):
                    # sparse feature: (indices, values, shape), use indices only
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
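# A minimal standalone sketch of the tf.train.Example construction that
# add_to_example performs above, stripped of the feature metadata plumbing.
# The feature names and values here are hypothetical.
def _example_build_tf_example():
    example = tf.train.Example()
    # dense float feature: float_list takes an iterable of floats
    example.features.feature["sepal_length"].float_list.value.extend((5.1, ))
    # integer feature backed by a non-numeric feature column: int64_list
    example.features.feature["species"].int64_list.value.extend((0, ))
    serialized = example.SerializeToString()
    # the exported SavedModel's "predict" signature consumes a batch of
    # serialized examples, e.g.:
    #   imported.signatures["predict"](examples=tf.constant([serialized]))
    return serialized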
def evaluate_and_store_result(bst, dpred, feature_file_id, validation_metrics,
                              model_params, feature_column_names, label_meta,
                              is_pai, conn, result_table, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)
    # FIXME(typhoonzero): copied from predict.py
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # use the original prediction result of the predict API by default
            pass
    else:
        # The prediction output of a multi-class job has two dimensions. This
        # is a temporary workaround; this else branch can be removed once we
        # can load the model meta not only on the PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt", "r")
    else:
        feature_file_read = open("predict.txt_%d" % feature_file_id, "r")

    y_test_list = []
    for line in feature_file_read:
        row = [i for i in line.strip().split("\t")]
        # DMatrix stores the label in the first column
        if label_meta["dtype"] == "float32":
            label = float(row[0])
        elif label_meta["dtype"] == "int64" or label_meta["dtype"] == "int32":
            label = int(row[0])
        else:
            raise ValueError("unsupported label dtype: %s" %
                             label_meta["dtype"])
        y_test_list.append(label)
    y_test = np.array(y_test_list)

    evaluate_results = dict()
    for metric_name in validation_metrics:
        metric_func = eval(metric_name)
        metric_value = metric_func(y_test, preds)
        evaluate_results[metric_name] = metric_value

    # write the evaluation result to the result table
    if is_pai:
        driver = "pai_maxcompute"
    else:
        driver = conn.driver
    result_columns = ["loss"] + validation_metrics
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_columns,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        row = ["0.0"]
        for mn in validation_metrics:
            row.append(str(evaluate_results[mn]))
        w.write(row)
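# A brief sketch of the eval(metric_name) dispatch used above: each entry in
# validation_metrics must resolve to a callable visible in the evaluating
# scope. Assuming (hypothetically) the metric functions come from
# sklearn.metrics:
def _example_metric_dispatch():
    from sklearn.metrics import accuracy_score  # assumed source of metric callables
    validation_metrics = ["accuracy_score"]
    y_test = np.array([0, 1, 1])
    preds = np.array([0, 1, 0])
    for metric_name in validation_metrics:
        metric_func = eval(metric_name)  # resolves the name to accuracy_score
        print(metric_name, metric_func(y_test, preds))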
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        predict_generator = pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        predict_generator = db_generator(conn.driver, conn, select,
                                         feature_column_names, None,
                                         feature_metas)()

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if save.startswith("oss://"):
        # the exported path is relative to the OSS location given in `save`
        parts = save.split("?")
        export_path = parts[0] + export_path
    if TF_VERSION_2:
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse features arrive as
                # (indices, values, shape), use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                # FIXME(typhoonzero): figure out why int64 features need to be
                # converted to float
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with buffered_db_writer(driver, conn, result_table, column_names, 100,
                            hdfs_namenode_addr, hive_location, hdfs_user,
                            hdfs_pass) as w:
        for features in predict_generator:
            result = predict(features)
            row = []
            for idx, _ in enumerate(feature_column_names):
                per_feature = features[0][idx]
                if isinstance(per_feature, (tuple, list)):
                    # sparse feature: (indices, values, shape), use indices only
                    val = per_feature[0]
                elif isinstance(per_feature, np.ndarray):
                    val = per_feature
                row.append(str(val))
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
def predict_and_store_result(bst, dpred, feature_file_id, model_params,
                             selected_cols, label_name, is_pai, conn,
                             result_table, hdfs_namenode_addr, hive_location,
                             hdfs_user, hdfs_pass):
    preds = bst.predict(dpred)
    # TODO(yancey1989): should save train_params and model_params not only on PAI submitter
    # TODO(yancey1989): output the original result for various objective functions.
    if model_params:
        obj = model_params["objective"]
        if obj.startswith("binary:"):
            preds = (preds > 0.5).astype(int)
        elif obj.startswith("multi:"):
            preds = np.argmax(np.array(preds), axis=1)
        else:
            # use the original prediction result of the predict API by default
            pass
    else:
        # The prediction output of a multi-class job has two dimensions. This
        # is a temporary workaround; this else branch can be removed once we
        # can load the model meta not only on the PAI submitter.
        if len(preds.shape) == 2:
            preds = np.argmax(np.array(preds), axis=1)

    if is_pai:
        feature_file_read = open("predict.txt.raw", "r")
    else:
        feature_file_read = open(
            "predict.raw.dir/predict.txt_%d" % feature_file_id, "r")

    # copy so that appending the label does not mutate selected_cols
    result_column_names = selected_cols[:]
    # Users may use "SELECT ..., label ... TO PREDICT new_table.new_label" to
    # write both the actual label and the prediction into the result table for
    # comparison. So if new_label == label, we should write the result table
    # with "INSERT INTO new_table (..., label) VALUES ...", and if
    # new_label != label, with
    # "INSERT INTO new_table (..., label, new_label) VALUES ...".
    # "new_label == label" is equivalent to "label_name in selected_cols".
    label_index = selected_cols.index(
        label_name) if label_name in selected_cols else None
    if label_index is None:
        result_column_names.append(label_name)

    line_no = 0
    if is_pai:
        driver = "pai_maxcompute"
    else:
        driver = conn.driver
    with db.buffered_db_writer(driver,
                               conn,
                               result_table,
                               result_column_names,
                               100,
                               hdfs_namenode_addr=hdfs_namenode_addr,
                               hive_location=hive_location,
                               hdfs_user=hdfs_user,
                               hdfs_pass=hdfs_pass) as w:
        while True:
            line = feature_file_read.readline()
            if not line:
                break
            row = [
                item.split(":")[1]
                for i, item in enumerate(line.strip().split("\t"))
                if i != label_index
            ]
            row.append(str(preds[line_no]))
            w.write(row)
            line_no += 1
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)
    if not is_keras_model:
        model_params['model_dir'] = save
        classifier = estimator(**model_params)
    else:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
    classifier_pkg = sys.modules[estimator.__module__]

    if is_keras_model:

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select,
                               feature_column_names,
                               label_meta["feature_name"], feature_metas)
            dataset = tf.data.Dataset.from_generator(
                gen,
                (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into the result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict on one batch to initialize parameters, see:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset
    else:

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                dataset = tf.data.Dataset.from_generator(
                    generator,
                    (tuple(feature_types),
                     eval("tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper).batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn

        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select,
                                feature_column_names,
                                label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                # materialize the prediction once; repeatedly calling
                # list() on the result would exhaust it if it is a generator
                result = list(fast_predictor.predict(features))
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in result[0]:
                    row.append(str(result[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(result[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)
def pred(is_keras_model,
         datasource,
         estimator,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_metas={},
         label_meta={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         is_pai=False,
         pai_table=""):
    global FLAGS
    define_tf_flags()
    if not is_pai:
        conn = connect_with_data_source(datasource)
    model_params.update(feature_columns)

    if is_keras_model:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        classifier = estimator(**model_params)
        classifier_pkg = sys.modules[estimator.__module__]

        def eval_input_fn(batch_size, cache=False):
            feature_types = []
            for name in feature_column_names:
                # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            gen = db_generator(conn.driver, conn, select,
                               feature_column_names,
                               label_meta["feature_name"], feature_metas)
            dataset = tf.data.Dataset.from_generator(
                gen,
                (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])))
            ds_mapper = functools.partial(
                parse_sparse_feature,
                feature_column_names=feature_column_names,
                feature_metas=feature_metas)
            dataset = dataset.map(ds_mapper).batch(batch_size)
            if cache:
                dataset = dataset.cache()
            return dataset

        # NOTE: always use batch_size=1 when predicting to get the pairs of
        # features and predict results to insert into the result table.
        pred_dataset = eval_input_fn(1)
        one_batch = next(iter(pred_dataset))
        # NOTE: must run predict on one batch to initialize parameters, see:
        # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
        classifier.predict_on_batch(one_batch[0])
        classifier.load_weights(save)
        del pred_dataset
        pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = pred_dataset.get_next()
                except tf.errors.OutOfRangeError:
                    break
                result = classifier.predict_on_batch(features[0])
                result = classifier_pkg.prepare_prediction_column(result[0])
                row = []
                for idx, name in enumerate(feature_column_names):
                    val = features[0][name].numpy()[0]
                    row.append(str(val))
                row.append(str(result))
                w.write(row)
        del pred_dataset
    else:
        if is_pai:
            model_params["model_dir"] = FLAGS.checkpointDir
        else:
            model_params['model_dir'] = save
        classifier = estimator(**model_params)

        # FIXME(typhoonzero): copied from train.py
        def pai_maxcompute_input_fn():
            table_parts = pai_table.split(".")
            if len(table_parts) == 2:
                database, table_name = table_parts
            elif len(table_parts) == 1:
                table_name = pai_table
                driver, dsn = datasource.split("://")
                database = parseMaxComputeDSN(dsn)[-1]
            else:
                raise ValueError("error database.table format: %s" %
                                 pai_table)

            tables = ["odps://%s/tables/%s" % (database, table_name)]
            record_defaults = []
            for name in feature_column_names:
                dtype = get_dtype(feature_metas[name]["dtype"])
                record_defaults.append(
                    tf.constant(0,
                                dtype=dtype,
                                shape=feature_metas[name]["shape"]))
            dataset = tf.data.TableRecordDataset(
                tables,
                record_defaults=record_defaults,
                selected_cols=",".join(feature_column_names))

            def tensor_to_dict(*args):
                num_features = len(feature_column_names)
                features_dict = dict()
                for idx in range(num_features):
                    name = feature_column_names[idx]
                    features_dict[name] = tf.reshape(args[idx], [-1])
                return features_dict

            return dataset.map(tensor_to_dict)

        def fast_input_fn(generator):
            feature_types = []
            for name in feature_column_names:
                if feature_metas[name]["is_sparse"]:
                    feature_types.append((tf.int64, tf.int32, tf.int64))
                else:
                    feature_types.append(
                        get_dtype(feature_metas[name]["dtype"]))

            def _inner_input_fn():
                if is_pai:
                    dataset = pai_maxcompute_input_fn()
                else:
                    dataset = tf.data.Dataset.from_generator(
                        generator,
                        (tuple(feature_types),
                         eval("tf.%s" % label_meta["dtype"])))
                ds_mapper = functools.partial(
                    parse_sparse_feature,
                    feature_column_names=feature_column_names,
                    feature_metas=feature_metas)
                dataset = dataset.map(ds_mapper)
                dataset = dataset.batch(1).cache()
                iterator = dataset.make_one_shot_iterator()
                features = iterator.get_next()
                return features

            return _inner_input_fn

        column_names = feature_column_names[:]
        column_names.append(label_meta["feature_name"])
        pred_gen = db_generator(conn.driver, conn, select,
                                feature_column_names,
                                label_meta["feature_name"], feature_metas)()
        fast_predictor = FastPredict(classifier, fast_input_fn)

        with buffered_db_writer(conn.driver, conn, result_table, column_names,
                                100, hdfs_namenode_addr, hive_location,
                                hdfs_user, hdfs_pass) as w:
            while True:
                try:
                    features = next(pred_gen)
                except StopIteration:
                    break
                # materialize the prediction once; repeatedly calling
                # list() on the result would exhaust it if it is a generator
                result = list(fast_predictor.predict(features))
                row = []
                for idx, _ in enumerate(feature_column_names):
                    val = features[0][idx]
                    row.append(str(val))
                if "class_ids" in result[0]:
                    row.append(str(result[0]["class_ids"][0]))
                else:
                    # regression predictions
                    row.append(str(result[0]["predictions"][0]))
                w.write(row)
        fast_predictor.close()

    print("Done predicting. Predict table : %s" % result_table)