def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         pred_params={},
         save="",
         batch_size=1,
         pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
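# A minimal, hedged sketch of how pred() above parses
# pred_params["extra_outputs"]: the comma separated string is split and blank
# entries are dropped. The sample value is an illustrative assumption.
def _example_parse_extra_result_cols(extra_outputs="prob, logits,, "):
    # e.g. "prob, logits,, " -> ['prob', 'logits']
    return [c.strip() for c in extra_outputs.split(",") if c.strip()]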
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    if isinstance(feature_column_code, dict):
        # NOTE(typhoonzero): feature_column_code is a dict of
        # runtime.feature.column in refactored step code.
        feature_column_transformers = compile_ir_feature_columns(
            feature_column_code, EstimatorType.XGBOOST)
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names,
                *feature_column_transformers["feature_columns"])
    else:
        feature_column_transformers = eval('[{}]'.format(feature_column_code))
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         train_label_meta,
         pred_label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    # NOTE: default to use external memory
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("Start predicting XGBoost model...")

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols,
                                 train_label_name, pred_label_name,
                                 feature_column_names, feature_metas, is_pai,
                                 conn, result_table, hdfs_namenode_addr,
                                 hive_location, hdfs_user, hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)
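# A minimal, hedged sketch of the Booster load-and-predict pattern that
# pred() above relies on. "my_model" mirrors the hard-coded model file name
# above; the data path is an illustrative assumption and should point to a
# file dumped by dump_dmatrix/xgb_dataset.
def _example_booster_predict(model_path="my_model", data_path="predict.txt_0"):
    import xgboost as xgb

    bst = xgb.Booster({'nthread': 4})  # init an empty model
    bst.load_model(model_path)         # load the trained model from disk
    dmatrix = xgb.DMatrix(data_path)   # build a DMatrix from a dumped file
    return bst.predict(dmatrix)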
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            if feature_metas[name]["delimiter_kv"]:
                # extract two features from generator data.
                feature_types.append(
                    (get_dtype(feature_metas[name]["dtype"]),
                     get_dtype(feature_metas[name]["dtype_weight"]),
                     tf.int64))
                shapes.append((None, None, None))
            else:
                feature_types.append((tf.int64, tf.int32, tf.int64))
                shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn, select, label_meta)
        selected_cols = db.selected_cols(conn, select)
        gen = tf_generator(gen, selected_cols, feature_column_names,
                           feature_metas)

    # Clustering models do not have a label
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
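# A minimal, hedged sketch of the Dataset.from_generator pattern that
# input_fn() uses: feature tensors come out of a Python generator, and their
# dtypes and shapes are declared up front, features first and the label
# second. The toy generator below is an illustrative assumption.
def _example_dataset_from_generator():
    import tensorflow as tf

    def gen():
        # one tuple of dense feature tensors per element, plus a label
        yield ((1.0, ), (2.0, )), 0
        yield ((3.0, ), (4.0, )), 1

    return tf.data.Dataset.from_generator(
        gen,
        ((tf.float32, tf.float32), tf.int64),  # feature dtypes, label dtype
        (([1], [1]), []))                      # feature shapes, label shape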
def pred_imp(datasource,
             select,
             feature_metas,
             feature_column_names,
             train_label_meta,
             pred_label_meta,
             result_table,
             is_pai=False,
             pai_table="",
             model_params=None,
             train_params=None,
             transform_fn=None,
             feature_column_code="",
             rank=0,
             nworkers=1):
    print("rank={} nworkers={}".format(rank, nworkers))
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    # NOTE: default to use external memory
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        rank=rank,
        nworkers=nworkers,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("{} Start predicting XGBoost model...".format(datetime.now()))

    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols,
                                 train_label_name, pred_label_name,
                                 feature_column_names, feature_metas, is_pai,
                                 conn, result_table, rank)
        feature_file_id += 1
    print("{} Done predicting. Predict table: {}".format(
        datetime.now(), result_table))
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = db.connect_with_data_source(datasource)
    driver = conn.driver
    predict_generator = db.db_generator(conn, select)
    selected_cols = db.selected_cols(conn, select)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, driver, conn, predict_generator,
                      selected_cols, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, train_label_name,
                          result_col_name, driver, conn, predict_generator,
                          selected_cols, hdfs_namenode_addr, hive_location,
                          hdfs_user, hdfs_pass)

    print("Done predicting. Predict table : %s" % result_table)
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
def _predict(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_column_names_map,
             train_label_name,
             result_col_name,
             feature_metas={},
             model_params={},
             save="",
             batch_size=1,
             pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = PaiIOConnection.from_table(pai_table)
    selected_cols = db.selected_cols(conn, None)
    predict_generator = db.db_generator(conn, None)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    if is_pai:
        # (TODO: lhw) we may specify pai_explain_table in datasource
        # and discard the condition statement here
        conn = PaiIOConnection.from_table(pai_explain_table)
        stream = db.db_generator(conn, None, label_meta)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
    selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []
    sizes = []
    offsets = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row,
                                              selected_cols,
                                              feature_column_names,
                                              feature_metas,
                                              is_xgboost=True)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column names according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features,
        # the result column names would be "c_0", "c_1" and "c_2"
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}_{}'.format(
                            feature_names[j], k))

            xs = pd.DataFrame(columns=column_names)

        xs.loc[i] = flatten_features
        i += 1

    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
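# A minimal, hedged sketch of the column-naming scheme documented inside
# xgb_shap_dataset(): a single-element feature keeps its column name, while a
# multi-element feature is expanded to one column per element, suffixed with
# its global offset. The feature names and sizes are illustrative assumptions.
def _example_expand_column_names():
    import numpy as np

    feature_names = ["age", "embedding"]
    sizes = [1, 3]                     # "age" has 1 element, "embedding" has 3
    offsets = np.cumsum([0] + sizes)   # [0, 1, 4]

    column_names = []
    for j in range(len(offsets) - 1):
        start, end = offsets[j], offsets[j + 1]
        if end - start == 1:
            column_names.append(feature_names[j])
        else:
            for k in range(start, end):
                column_names.append('{}_{}'.format(feature_names[j], k))
    # ['age', 'embedding_1', 'embedding_2', 'embedding_3']
    return column_names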
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_metas,
                feature_column_names,
                label_meta,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1,
                transform_fn=None,
                feature_column_code="",
                raw_data_dir=None):
    if raw_data_dir:
        # raw_data_dir is needed when predicting, because we should write the
        # raw data from the source db into the dest db instead of the
        # transformed data produced by `transform_fn(features)`. If
        # raw_data_dir is not None, the raw data from the source db is
        # written into a separate file.
        if os.path.exists(raw_data_dir):
            shutil.rmtree(raw_data_dir, ignore_errors=True)

        os.mkdir(raw_data_dir)

    if is_pai:
        for dmatrix in pai_dataset(fn,
                                   feature_metas,
                                   feature_column_names,
                                   label_meta,
                                   pai_table,
                                   pai_single_file,
                                   cache,
                                   rank,
                                   nworkers,
                                   batch_size=batch_size,
                                   feature_column_code=feature_column_code,
                                   raw_data_dir=raw_data_dir):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    gen = db.db_generator(conn, dataset_sql, label_meta)()
    selected_cols = db.selected_cols(conn, dataset_sql)

    for _ in six.moves.range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name,
                                    gen,
                                    feature_column_names,
                                    feature_metas,
                                    label_meta,
                                    selected_cols,
                                    transform_fn=transform_fn,
                                    raw_data_dir=raw_data_dir)

        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)

            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name,
                                        gen,
                                        feature_column_names,
                                        feature_metas,
                                        label_meta,
                                        selected_cols,
                                        transform_fn=transform_fn,
                                        raw_data_dir=raw_data_dir)
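# A minimal, hedged sketch of the external-memory URI that xgb_dataset()
# builds when cache=True: appending "#<file>.cache" to the dumped text file
# tells XGBoost to page the data through an on-disk cache rather than keeping
# it all in memory. "predict.txt_0" mirrors the per-step file names above.
def _example_external_memory_uri(step_file_name="predict.txt_0", cache=True):
    # e.g. "predict.txt_0#predict.txt_0.cache", which is what load_dmatrix
    # (and ultimately xgb.DMatrix) receives.
    return '{0}#{0}.cache'.format(step_file_name) if cache else step_file_name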
def explain(datasource, select, explainer, model_params, result_table, model):
    """
    Explain a trained TensorFlow model.

    Args:
        datasource (str): the database connection string.
        select (str): the input data to explain.
        explainer (str): the explainer to explain the model.
            Not used in TensorFlow models.
        model_params (dict): the parameters for explanation.
        result_table (str): the output data table.
        model (Model|str): the model object or where to load the model.

    Returns:
        None.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    plot_type = model_params.get("summary.plot_type", "bar")

    train_attributes = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    label_name = model_params.get("label_col", train_label_desc.name)
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    if result_table:
        conn = db.connect_with_data_source(datasource)
        if estimator_string.startswith("BoostedTrees"):
            column_defs = [
                "feature %s" %
                DataType.to_db_field_type(conn.driver, DataType.STRING),
                "dfc %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
                "gain %s" %
                DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
            ]
        else:
            selected_cols = db.selected_cols(conn, select)
            if label_name in selected_cols:
                selected_cols.remove(label_name)

            name_to_shape = dict([(fd.name, fd.shape) for fd in field_descs])
            column_defs = []
            float_field_type = DataType.to_db_field_type(
                conn.driver, DataType.FLOAT32)
            for name in selected_cols:
                shape = name_to_shape.get(name, None)
                if shape is None:
                    raise ValueError("cannot find column %s" % name)

                size = int(np.prod(shape))
                if size == 1:
                    column_def = "%s %s" % (name, float_field_type)
                    column_defs.append(column_def)
                else:
                    for i in six.moves.range(size):
                        column_def = "%s_%d %s" % (name, i, float_field_type)
                        column_defs.append(column_def)

        drop_sql = "DROP TABLE IF EXISTS %s;" % result_table
        create_sql = "CREATE TABLE %s (%s);" % (result_table,
                                                ",".join(column_defs))
        conn.execute(drop_sql)
        conn.execute(create_sql)
        conn.close()

    _explain(datasource=datasource,
             estimator_string=estimator_string,
             select=select,
             feature_columns=feature_columns,
             feature_column_names=feature_column_names,
             feature_metas=feature_metas,
             label_meta=label_meta,
             model_params=train_attributes,
             save=save,
             plot_type=plot_type,
             result_table=result_table)

    with open('summary.png', 'rb') as f:
        img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
        % img
    print(img)
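# A minimal, hedged sketch of the result-table DDL that explain() issues for
# a non-BoostedTrees model: one float column per scalar feature, and
# "<name>_<i>" columns for multi-element features. The table name, column
# names and the FLOAT field type are illustrative assumptions; the real field
# type comes from DataType.to_db_field_type(conn.driver, DataType.FLOAT32).
def _example_explain_result_table_ddl():
    result_table = "iris.explain_result"
    column_defs = ["sepal_length FLOAT", "sepal_width FLOAT",
                   "petal_length FLOAT", "petal_width FLOAT"]
    drop_sql = "DROP TABLE IF EXISTS %s;" % result_table
    create_sql = "CREATE TABLE %s (%s);" % (result_table,
                                            ",".join(column_defs))
    return drop_sql, create_sql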
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  train_label_name, result_col_name, datasource, select,
                  hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):
    classifier = init_model_with_feature_column(estimator, model_params)
    classifier_pkg = sys.modules[estimator.__module__]

    conn = None
    if is_pai:
        driver = "pai_maxcompute"
    else:
        conn = db.connect_with_data_source(datasource)
        driver = conn.driver

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table)
        selected_cols = feature_column_names
    else:
        gen = db.db_generator(conn, select)
        selected_cols = db.selected_cols(conn, select)

    def eval_input_fn(batch_size, cache=False):
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(gen, selected_cols, feature_column_names,
                              feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    # features and predict results to insert into the result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict on one batch to initialize parameters, see:
    # https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()

    column_names = selected_cols[:]
    train_label_index = selected_cols.index(train_label_name)
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)

    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine whether the predict result is a
            # classification result by adding the predictions together to see
            # if the sum is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # the output tensor as a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
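# A minimal, hedged sketch of why keras_predict() runs predict_on_batch()
# once before load_weights(): a subclassed Keras model only creates its
# variables after it has seen a batch, so the checkpoint can only be loaded
# afterwards. The toy model and weights path are illustrative assumptions.
def _example_build_then_load_weights(weights_path="weights.h5"):
    import numpy as np
    import tensorflow as tf

    class TinyModel(tf.keras.Model):
        def __init__(self):
            super(TinyModel, self).__init__()
            self.dense = tf.keras.layers.Dense(1)

        def call(self, inputs):
            return self.dense(inputs)

    model = TinyModel()
    one_batch = np.zeros((1, 4), dtype=np.float32)
    model.predict_on_batch(one_batch)  # builds the variables
    model.save_weights(weights_path)   # stand-in for a previously saved file
    model.load_weights(weights_path)   # now succeeds, as in keras_predict()
    return model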
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, datasource, select,
                      hdfs_namenode_addr, hive_location, hdfs_user,
                      hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        selected_cols = db.pai_selected_cols(formatted_pai_table)
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table)()
    else:
        driver = conn.driver
        # bypass all selected cols to the prediction result table
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)()

    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse features come as
                # (indices, values, shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[
                    feature_name].float_list.value.extend(list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[
                    feature_name].int64_list.value.extend(list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX models have dnn_feature_columns and
                # linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:  # noqa: E722
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:  # noqa: E722
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[
                    feature_name].float_list.value.extend(
                        (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[
                    feature_name].bytes_list.value.extend(x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
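# A minimal, hedged sketch of the tf.train.Example construction performed by
# add_to_example()/predict() above: feature values are appended to the
# proto's float_list/int64_list/bytes_list, and the serialized proto is what
# the SavedModel's "predict" signature consumes. Feature names and values are
# illustrative assumptions.
def _example_build_tf_example():
    import tensorflow as tf

    example = tf.train.Example()
    example.features.feature["age"].int64_list.value.extend((32, ))
    example.features.feature["income"].float_list.value.extend((1200.5, ))
    example.features.feature["city"].bytes_list.value.extend((b"hangzhou", ))
    return example.SerializeToString()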
def load_db_data_to_data_frame(datasource,
                               select=None,
                               odps_table=None,
                               load_schema_only=False):
    if odps_table is None:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        if load_schema_only:
            return pd.DataFrame(columns=selected_cols)

        generator = db.db_generator(conn, select)
    else:
        project, table = odps_table.split('.')
        conn = db.connect_with_data_source(datasource)
        schema = conn.get_table(table).schema
        selected_cols = [column.name for column in schema]
        if load_schema_only:
            return pd.DataFrame(columns=selected_cols)

        select_sql = "SELECT * FROM {}".format(table)
        instance = conn.execute_sql(select_sql)
        if not instance.is_successful():
            raise ValueError('cannot get data from table {}.{}'.format(
                project, table))

        def generator_func():
            from odps import tunnel
            compress = tunnel.CompressOption.CompressAlgorithm.ODPS_ZLIB
            with instance.open_reader(tunnel=False,
                                      compress=compress) as reader:
                for record in reader:
                    row_value = [
                        record[i]
                        for i in six.moves.range(len(selected_cols))
                    ]
                    yield row_value, None

        generator = generator_func

    dtypes = [None] * len(selected_cols)
    values = [[] for _ in six.moves.range(len(selected_cols))]
    for row_value, _ in generator():
        for i, item in enumerate(row_value):
            if dtypes[i] == np.str:
                values[i].append(item)
                continue

            float_value = None
            try:
                float_value = float(item)
            except:  # noqa: E722
                pass

            if float_value is None:  # cannot convert to float value
                dtypes[i] = np.str
            else:
                item = float_value
                int_value = long(item) if six.PY2 else int(item)
                if int_value != item:
                    dtypes[i] = np.float64

            values[i].append(item)

    numpy_dict = collections.OrderedDict()
    for col_name, dtype, value in six.moves.zip(selected_cols, dtypes,
                                                values):
        if dtype is None:
            dtype = np.int64

        numpy_dict[col_name] = np.array(value, dtype=dtype)

    df = pd.DataFrame(data=numpy_dict)
    return df
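# A minimal, hedged sketch of the per-column dtype inference used by
# load_db_data_to_data_frame(): a column defaults to int64, becomes float64
# once a non-integral value appears, and is treated as a string column once a
# value cannot be parsed as a float. The sample values are illustrative
# assumptions; the original marks string columns with np.str.
def _example_infer_column_dtype(items=("1", "2.5", "7")):
    import numpy as np

    dtype = None
    for item in items:
        try:
            float_value = float(item)
        except ValueError:
            return str  # unparsable value: the whole column becomes a string
        if int(float_value) != float_value:
            dtype = np.float64
    return np.int64 if dtype is None else dtype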