def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    label_column_name = label_meta['feature_name'] if label_meta else None
    gen = db.pai_maxcompute_db_generator(pai_table,
                                         label_column_name,
                                         slice_id=slice_id,
                                         slice_count=slice_count)()
    selected_cols = db.pai_selected_cols(pai_table)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
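# Illustrative sketch, not part of the original module: one way the worker
# above could be fanned out over table slices with the standard library's
# multiprocessing. The slice count, temporary directory and metadata
# arguments are placeholders supplied by a hypothetical caller.
def _example_download_all_slices(dname, feature_metas, feature_column_names,
                                 label_meta, pai_table, feature_column_code,
                                 raw_data_dir, slice_count=4):
    from multiprocessing import Process
    workers = []
    for slice_id in range(slice_count):
        # each process downloads one slice into "{dname}/{slice_id}.txt"
        p = Process(target=pai_download_table_data_worker,
                    args=(dname, feature_metas, feature_column_names,
                          label_meta, pai_table, slice_id, slice_count,
                          feature_column_code, raw_data_dir))
        p.start()
        workers.append(p)
    for p in workers:
        p.join()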
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    label_column_name = label_meta["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                label_column_name)
        selected_cols = db.pai_selected_cols(formatted_pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
        selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []
    sizes = []
    offsets = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_metas)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column names according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features, the result
        # column names would be "c-0", "c-1" and "c-2".
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}-{}'.format(
                            feature_names[j], k))
            xs = pd.DataFrame(columns=column_names)

        xs.loc[i] = flatten_features
        i += 1

    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names = ["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
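# Illustrative sketch, not part of the original module: a self-contained
# demonstration of the two per-row transformations xgb_shap_dataset applies,
# densifying a (col_indices, values, dense_shape) sparse feature with scipy
# and expanding a multi-value column into "name-k" column names. All values
# below are made up.
def _example_sparse_to_dense_and_names():
    import numpy as np
    import scipy.sparse

    col_indices = np.array([1, 3])
    values = np.array([0.5, 2.0])
    dense_shape = [4]
    size = int(np.prod(dense_shape))
    # a single dense row, so all row indices are 0
    row_indices = np.zeros(col_indices.size, dtype=np.int64)
    dense = scipy.sparse.csr_matrix((values, (row_indices, col_indices)),
                                    shape=[1, size]).toarray()
    # dense == [[0. , 0.5, 0. , 2. ]]

    feature_names = ["a", "c"]
    sizes = [1, dense.size]
    offsets = np.cumsum([0] + sizes)
    column_names = []
    for j in range(len(offsets) - 1):
        start, end = offsets[j], offsets[j + 1]
        if end - start == 1:
            column_names.append(feature_names[j])
        else:
            column_names.extend('{}-{}'.format(feature_names[j], k)
                                for k in range(start, end))
    # column_names == ['a', 'c-1', 'c-2', 'c-3', 'c-4']
    return dense, column_names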
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, datasource, select,
                      hdfs_namenode_addr, hive_location, hdfs_user,
                      hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        selected_cols = db.pai_selected_cols(formatted_pai_table)
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table)()
    else:
        driver = conn.driver
        # bypass all selected cols to the prediction result table
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)()

    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse features arrive as
                # (indices, values, shape) here; use the indices only.
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX models have dnn_feature_columns and
                # linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except Exception:
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except Exception:
                        pass
                if idx == -1:
                    raise ValueError(
                        "can not find feature %s in any feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
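# Illustrative sketch, not part of the original module: a minimal, standalone
# example of the tf.train.Example construction that add_to_example performs,
# with dense numeric values going into float_list or int64_list and string
# values into bytes_list. The feature names and values are hypothetical.
def _example_build_tf_example():
    import tensorflow as tf

    example = tf.train.Example()
    example.features.feature["age"].int64_list.value.extend((42, ))
    example.features.feature["price"].float_list.value.extend((3.14, ))
    example.features.feature["city"].bytes_list.value.extend((b"hangzhou", ))
    # the serialized proto is what gets fed to the "predict" signature
    return example.SerializeToString()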
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  train_label_name, result_col_name, datasource, select,
                  hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):
    classifier = init_model_with_feature_column(estimator, model_params)
    classifier_pkg = sys.modules[estimator.__module__]

    conn = None
    if is_pai:
        driver = "pai_maxcompute"
    else:
        conn = db.connect_with_data_source(datasource)
        driver = conn.driver

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table)
        selected_cols = feature_column_names
    else:
        gen = db.db_generator(conn, select)
        selected_cols = db.selected_cols(conn, select)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(gen, selected_cols, feature_column_names,
                              feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    # features and predict results to insert into the result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict on one batch to initialize parameters. See:
    # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine whether the predict result is a
            # classification by adding the prediction values together to see
            # if the sum is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                prob_sum = 0
                for i in result[0]:
                    prob_sum += i
                if np.isclose(prob_sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # the output tensor as a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
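# Illustrative sketch, not part of the original module: demonstrates, on
# made-up arrays, the heuristic keras_predict uses to tell classification
# output from regression output. A row whose values sum to ~1.0 is treated
# as class probabilities and argmax'ed; anything else is written out as-is.
def _example_output_heuristic():
    import numpy as np

    softmax_row = np.array([0.1, 0.7, 0.2])  # sums to 1.0 -> class 1
    regression_row = np.array([12.5, 3.3])   # does not -> raw values
    for row in (softmax_row, regression_row):
        if np.isclose(row.sum(), 1.0):
            print("class:", row.argmax(axis=-1))
        else:
            print("values:", ",".join(str(v) for v in row))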
def _predict(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_column_names_map,
             train_label_name,
             result_col_name,
             feature_metas={},
             model_params={},
             save="",
             batch_size=1,
             pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table, True,
                      pai_table, feature_column_names, feature_metas,
                      train_label_name, result_col_name, datasource, select,
                      hdfs_namenode_addr="",
                      hive_location="",
                      hdfs_user="",
                      hdfs_pass="")
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, train_label_name,
                          result_col_name, datasource, select,
                          hdfs_namenode_addr="",
                          hive_location="",
                          hdfs_user="",
                          hdfs_pass="",
                          is_pai=True,
                          pai_table=pai_table)

    print("Done predicting. Predict table : %s" % result_table)
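# Illustrative sketch, not part of the original module: how _predict might be
# invoked for a PAI prediction job. Every literal below (table names, column
# names, estimator string, metadata dicts) is a placeholder; real callers
# pass the metadata generated by the surrounding toolchain.
def _example_run_predict():
    _predict(datasource="",
             estimator_string="tf.estimator.DNNClassifier",
             select="",
             result_table="prediction_result",
             feature_columns={"feature_columns": []},
             feature_column_names=["sepal_length"],
             feature_column_names_map={},
             train_label_name="class",
             result_col_name="class",
             feature_metas={},
             model_params={"hidden_units": [10, 10], "n_classes": 3},
             save="model_save",
             pai_table="my_project.iris_test")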