def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas=None,
         model_params=None,
         pred_params=None,
         save="",
         batch_size=1,
         pai_table=""):
    """Predict with a trained TensorFlow model and write rows to result_table.

    Args:
        datasource: connection string of the source database.
        estimator_string: import path of the model class.
        select: SQL statement selecting the prediction input rows.
        result_table: table the prediction results are written into.
        feature_columns: dict of feature column objects, merged into
            model_params before the model is constructed.
        feature_column_names: ordered list of feature column names.
        feature_column_names_map: mapping of constructor param name to
            feature column names (kept for interface compatibility).
        train_label_name: name of the label column used at training time.
        result_col_name: column name that stores the prediction result.
        feature_metas: per-column feature metadata (dtype, shape, ...).
        model_params: extra keyword arguments for the model constructor;
            mutated in place (update/pop/model_dir) as before.
        pred_params: prediction options; key "extra_outputs" is a
            comma-separated list of extra result columns.
        save: directory the trained model was saved to.
        batch_size: kept for interface compatibility.
        pai_table: when non-empty, read input from this PAI/MaxCompute
            table instead of running `select` against `datasource`.
    """
    # FIX: the original used mutable default arguments ({}), and
    # model_params is mutated below — a shared default dict would leak
    # state across calls. None-defaults are backward compatible.
    if feature_metas is None:
        feature_metas = {}
    if model_params is None:
        model_params = {}

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    """Download one slice of a PAI/MaxCompute table as DMatrix text.

    Writes slice `slice_id` (out of `slice_count`) of `pai_table` to
    "<dname>/<slice_id>.txt" via dump_dmatrix.

    Args:
        dname: output directory for the per-slice data file.
        feature_metas: per-column feature metadata dict.
        feature_column_names: ordered feature column names.
        label_meta: label column metadata (may be None).
        pai_table: PAI/MaxCompute table to read from.
        slice_id: index of the slice handled by this worker.
        slice_count: total number of slices.
        feature_column_code: either a dict of runtime.feature.column
            objects (refactored step code) or a Python expression string
            of feature column transformers.
        raw_data_dir: when set, dump_dmatrix also dumps the raw,
            untransformed rows there.
    """
    import runtime.xgboost as xgboost_extended
    if isinstance(feature_column_code, dict):
        # NOTE(typhoonzero): feature_column_code is a dict of
        # runtime.feature.column in refactored step code.
        feature_column_transformers = compile_ir_feature_columns(
            feature_column_code, EstimatorType.XGBOOST)
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names,
                *feature_column_transformers["feature_columns"])
    else:
        # SECURITY NOTE(review): eval of generated transformer code — the
        # string must come from trusted SQLFlow codegen only.
        feature_column_transformers = eval('[{}]'.format(feature_column_code))
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
def test_generate_fetch_size(self):
    """Fetching in batches of 4 must still yield all 10 selected rows."""
    label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
    reader = db_generator(testing.get_singleton_db_connection(),
                          'SELECT * FROM iris.train limit 10',
                          label_meta,
                          fetch_size=4)
    row_count = sum(1 for _ in reader())
    self.assertEqual(row_count, 10)
def test_generate_fetch_size(self):
    """db_generator must honor fetch_size: reading 10 rows in batches of
    4 still yields all 10 rows one by one (MySQL only)."""
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=port)
        # FIX: removed the unused `column_name_to_type` local — this test
        # never calls read_features_from_row, so the dict was dead code.
        label_meta = {
            "feature_name": "label",
            "shape": [],
            "delimiter": ""
        }
        gen = db_generator(conn,
                           'SELECT * FROM iris.train limit 10',
                           label_meta,
                           fetch_size=4)
        self.assertEqual(len([g for g in gen()]), 10)
def test_generator(self):
    """db_generator must yield (row, label) pairs in insertion order."""
    conn = connect(testing.get_datasource())
    # Recreate the fixture table so the test is self-contained.
    for stmt in (self.drop_statement, self.create_statement,
                 self.insert_statement):
        conn.execute(stmt)
    meta = {
        "features": {
            "feature_name": "features",
            "delimiter": "",
            "dtype": "float32",
            "is_sparse": False,
            "shape": []
        }
    }
    label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
    reader = db_generator(conn, "SELECT * FROM test_table_float_fea",
                          label_meta)
    expected = [(((1.0, ), ), 0), (((2.0, ), ), 1)]
    idx = 0
    for row, label in reader():
        decoded = read_features_from_row(row, ["features"], ["features"],
                                         meta)
        if idx < len(expected):
            self.assertEqual((decoded, label), expected[idx])
        idx += 1
    self.assertEqual(idx, 2)
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    """Build the tf.data.Dataset that feeds training/evaluation.

    Args:
        select: SQL statement selecting the input rows (unused on PAI).
        datasource: database connection string (unused on PAI).
        feature_column_names: ordered feature column names.
        feature_metas: per-column metadata (dtype, shape, is_sparse,
            delimiter_kv, ...).
        label_meta: label metadata dict; falsy or an empty feature_name
            means there is no label column (e.g. clustering models).
        is_pai: read from a PAI/MaxCompute table instead of `datasource`.
        pai_table: "project.table" name, used when is_pai is True.
        num_workers: total worker count (PAI slice count).
        worker_id: this worker's PAI slice index.

    Returns:
        A tf.data.Dataset yielding (features,) tuples, or
        (features, label) pairs when a label is configured.
    """
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            if feature_metas[name]["delimiter_kv"]:
                # extract two features from generator data: the key dtype
                # and the weight dtype, plus an int64 third component
                # (presumably the dense shape — matches the sparse
                # handling elsewhere in this codebase; confirm).
                feature_types.append(
                    (get_dtype(feature_metas[name]["dtype"]),
                     get_dtype(feature_metas[name]["dtype_weight"]),
                     tf.int64))
                shapes.append((None, None, None))
            else:
                feature_types.append((tf.int64, tf.int32, tf.int64))
                shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn, select, label_meta)
        selected_cols = db.selected_cols(conn, select)
        gen = tf_generator(gen, selected_cols, feature_column_names,
                           feature_metas)

    # Clustering model do not have label
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen,
                                                 (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        # NOTE(review): eval("tf.%s" % dtype) resolves the label dtype
        # dynamically; the dtype string comes from model metadata.
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas=None,
         model_params=None,
         save="",
         batch_size=1,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass=""):
    """Predict with a trained TensorFlow model and write rows to result_table.

    Args:
        datasource: connection string of the source database.
        estimator_string: import path of the model class.
        select: SQL statement selecting the prediction input rows.
        result_table: table the prediction results are written into.
        feature_columns: dict of feature column objects, merged into
            model_params before the model is constructed.
        feature_column_names: ordered list of feature column names.
        feature_column_names_map: param-name -> feature column names map,
            forwarded to estimator_predict.
        train_label_name: name of the label column used at training time.
        result_col_name: column name that stores the prediction result.
        feature_metas: per-column feature metadata (dtype, shape, ...).
        model_params: extra keyword arguments for the model constructor;
            mutated in place (update/model_dir) as before.
        save: directory the trained model was saved to.
        batch_size: kept for interface compatibility.
        hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass: Hive/HDFS
            write options forwarded to the result writer.
    """
    # FIX: the original used mutable default arguments ({}); model_params
    # is mutated below, so a shared default dict would leak state across
    # calls. None-defaults are backward compatible.
    if feature_metas is None:
        feature_metas = {}
    if model_params is None:
        model_params = {}

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = db.connect_with_data_source(datasource)
    driver = conn.driver
    predict_generator = db.db_generator(conn, select)
    selected_cols = db.selected_cols(conn, select)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, driver, conn, predict_generator,
                      selected_cols, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(estimator, model_params, save, result_table,
                          feature_column_names, feature_column_names_map,
                          feature_columns, feature_metas, train_label_name,
                          result_col_name, driver, conn, predict_generator,
                          selected_cols, hdfs_namenode_addr, hive_location,
                          hdfs_user, hdfs_pass)

    print("Done predicting. Predict table : %s" % result_table)
def test_generator(self):
    """db_generator must yield (row, label) pairs in insertion order
    (MySQL only)."""
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        # FIX: removed the dead `database = "iris"` assignment — it was
        # immediately overwritten by testing_mysql_cfg() below.
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=int(port))
        # prepare test data
        execute(driver, conn, self.drop_statement)
        execute(driver, conn, self.create_statement)
        execute(driver, conn, self.insert_statement)
        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_meta = {
            "feature_name": "label",
            "shape": [],
            "delimiter": ""
        }
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"],
                                              ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    """Dump one slice of a PAI table to "<dname>/<slice_id>.txt".

    Reads slice `slice_id` of `slice_count` from `pai_table`, applies the
    feature column transformers compiled from `feature_column_code`, and
    writes the rows in DMatrix text format via dump_dmatrix.
    """
    import runtime.xgboost as xgboost_extended

    # SECURITY NOTE(review): feature_column_code is eval'ed; it must be
    # trusted, SQLFlow-generated code only.
    transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *transformers)

    slice_conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    row_gen = db.db_generator(slice_conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(slice_conn, None)

    out_path = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(out_path,
                 row_gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
def _predict(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_column_names_map,
             train_label_name,
             result_col_name,
             feature_metas=None,
             model_params=None,
             save="",
             batch_size=1,
             pai_table=""):
    """Predict on a PAI/MaxCompute table and write rows to result_table.

    Args:
        datasource: kept for interface compatibility (input always comes
            from `pai_table` here).
        estimator_string: import path of the model class.
        select: kept for interface compatibility.
        result_table: table the prediction results are written into.
        feature_columns: dict of feature column objects, merged into
            model_params before the model is constructed.
        feature_column_names: ordered list of feature column names.
        feature_column_names_map: kept for interface compatibility.
        train_label_name: name of the label column used at training time.
        result_col_name: column name that stores the prediction result.
        feature_metas: per-column feature metadata (dtype, shape, ...).
        model_params: extra keyword arguments for the model constructor;
            mutated in place (update/pop/model_dir) as before.
        save: directory the trained model was saved to.
        batch_size: kept for interface compatibility.
        pai_table: PAI/MaxCompute table to read the input rows from.
    """
    # FIX: the original used mutable default arguments ({}); model_params
    # is mutated below, so a shared default dict would leak state across
    # calls. None-defaults are backward compatible.
    if feature_metas is None:
        feature_metas = {}
    if model_params is None:
        model_params = {}

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = PaiIOConnection.from_table(pai_table)
    selected_cols = db.selected_cols(conn, None)
    predict_generator = db.db_generator(conn, None)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
def fetch_samples(conn, query, n=1):
    """Fetch at most `n` sample rows for `query`.

    Args:
        conn: the connection object.
        query (str): the select SQL statement.
        n (int): the maximum sample number to query. Query all samples
            if n < 0.

    Returns:
        A generator which yields each row of the data, or None when the
        query returns no rows. The returned callable carries
        `field_names` and `field_types` attributes.
    """
    limited_query = db.limit_select(query, n)
    gen = db.db_generator(conn, limited_query)
    # gen.field_names / gen.field_types are only populated once iteration
    # begins, so pull the first row eagerly before exposing the reader.
    gen_iter = iter(gen())
    first = next(gen_iter, None)
    if first is None:
        # No fetched data, just return None
        return None

    def reader():
        # Each element is (row_data, label_data); label_data is None here.
        yield first[0]
        for item in gen_iter:
            yield item[0]

    reader.field_names = gen.field_names
    reader.field_types = gen.field_types
    return reader
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    """Build a pandas DataFrame of flattened features for SHAP explain.

    Each multi-valued feature column "c" is expanded into columns
    "c_0", "c_1", ...; single-valued columns keep their original name.
    Sparse (indices, values, dense_shape) features are densified first.

    Args:
        datasource: database connection string (unused on PAI).
        select: SQL statement selecting the explain input rows.
        feature_column_names: ordered feature column names.
        label_meta: label metadata passed to the row generator.
        feature_metas: per-column feature metadata dict.
        is_pai: read from `pai_explain_table` instead of `datasource`.
        pai_explain_table: PAI table used when is_pai is True.
        transform_fn: optional feature transformer; when set, its output
            column names replace feature_column_names.
        feature_column_code: unused here, kept for interface
            compatibility.

    Returns:
        A pandas DataFrame with one row per input row and per-column
        dtypes matching the inferred feature dtypes.
    """
    if is_pai:
        # (TODO: lhw) we may specify pai_explain_table in datasoure
        # and discard the condition statement here
        conn = PaiIOConnection.from_table(pai_explain_table)
        stream = db.db_generator(conn, None, label_meta)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
    selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []   # per source column: inferred dtype (first row only)
    sizes = []    # per source column: flattened value count
    offsets = []  # cumulative sizes -> column ranges in the DataFrame

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row,
                                             selected_cols,
                                             feature_column_names,
                                             feature_metas,
                                             is_xgboost=True)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column name according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features,
        # the result column name would be "c_0", "c_1" and "c_2"
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}_{}'.format(
                            feature_names[j], k))

            xs = pd.DataFrame(columns=column_names)

        # NOTE(review): appending row-by-row with .loc[i] is O(n^2) for
        # large inputs; consider collecting rows and building the
        # DataFrame once if this becomes a bottleneck.
        xs.loc[i] = flatten_features

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    """Run the prediction step: load a saved model, create the result
    table, and write one prediction per input row.

    Args:
        datasource: database connection string.
        select: SQL statement selecting the prediction input rows
            (rebuilt from pai_table when running on PAI).
        result_table: table the prediction results are written into.
        label_name: column name that stores the prediction result.
        model: a Model instance, or the name of a model to load from the
            database.
        pai_table: when set, read the input from this PAI/MaxCompute
            table instead of `datasource`.
    """
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    # Recover the training-time configuration from the model metadata.
    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    # The result table must be created through the regular DB connection
    # even on PAI; the data connection is swapped afterwards.
    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        # Re-insert the training label name at its original position so
        # the selected columns match the input row layout.
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)

    conn.close()
def xgb_dataset(datasource,
                fn,
                dataset_sql,
                feature_metas,
                feature_column_names,
                label_meta,
                is_pai=False,
                pai_table="",
                pai_single_file=False,
                cache=False,
                batch_size=None,
                epoch=1,
                rank=0,
                nworkers=1,
                transform_fn=None,
                feature_column_code="",
                raw_data_dir=None):
    """Yield XGBoost DMatrix objects for the given data source.

    Rows are dumped to text files named "<fn>_<step>" and each file is
    loaded and yielded as one DMatrix, repeating for `epoch` passes.

    Args:
        datasource: database connection string (unused on PAI).
        fn: base filename for the dumped DMatrix text files.
        dataset_sql: SQL statement selecting the input rows.
        feature_metas: per-column feature metadata dict.
        feature_column_names: ordered feature column names.
        label_meta: label column metadata.
        is_pai: read from a PAI table instead of `datasource`.
        pai_table: PAI table name, used when is_pai is True.
        pai_single_file: forwarded to pai_dataset.
        cache: use xgboost external-memory cache files ("#<fn>.cache").
        batch_size: forwarded to pai_dataset only (not used in the
            non-PAI path here).
        epoch: number of passes over the data.
        rank: this worker's id for distributed PAI downloads.
        nworkers: total worker count for distributed PAI downloads.
        transform_fn: optional feature transformer applied before dump.
        feature_column_code: transformer source code for PAI workers.
        raw_data_dir: when set, also dump untransformed rows there.

    Yields:
        xgboost DMatrix objects, one per dumped batch file.
    """
    if raw_data_dir:
        # raw_data_dir is needed when predicting. Because we
        # should write the raw data from the source db into
        # the dest db, instead of the transformed data after
        # `transform_fn(features)` . If raw_data_dir is not
        # None, the raw data from the source db would be written
        # into another file.
        if os.path.exists(raw_data_dir):
            shutil.rmtree(raw_data_dir, ignore_errors=True)
        os.mkdir(raw_data_dir)

    if is_pai:
        for dmatrix in pai_dataset(fn,
                                   feature_metas,
                                   feature_column_names,
                                   label_meta,
                                   pai_table,
                                   pai_single_file,
                                   cache,
                                   rank,
                                   nworkers,
                                   batch_size=batch_size,
                                   feature_column_code=feature_column_code,
                                   raw_data_dir=raw_data_dir):
            yield dmatrix
        return

    conn = db.connect_with_data_source(datasource)
    # NOTE(review): the row generator is instantiated once; with epoch > 1
    # the second pass may find it exhausted (dump writes 0 rows) — confirm
    # db.db_generator(...)() is restartable before relying on epoch > 1.
    gen = db.db_generator(conn, dataset_sql, label_meta)()

    selected_cols = db.selected_cols(conn, dataset_sql)
    for _ in six.moves.range(epoch):
        step = 0
        # the filename per batch is [filename]_[step]
        step_file_name = "%s_%d" % (fn, step)
        written_rows = dump_dmatrix(step_file_name,
                                    gen,
                                    feature_column_names,
                                    feature_metas,
                                    label_meta,
                                    selected_cols,
                                    transform_fn=transform_fn,
                                    raw_data_dir=raw_data_dir)

        while written_rows > 0:
            yield load_dmatrix('{0}#{0}.cache'.format(step_file_name)
                               if cache else step_file_name)
            os.remove(step_file_name)

            step += 1
            step_file_name = "%s_%d" % (fn, step)
            written_rows = dump_dmatrix(step_file_name,
                                        gen,
                                        feature_column_names,
                                        feature_metas,
                                        label_meta,
                                        selected_cols,
                                        transform_fn=transform_fn,
                                        raw_data_dir=raw_data_dir)
def test_generator(self):
    """read_features_from_row must decode dense, string, kv-sparse and
    delimited-vector columns, and substitute XGBOOST_NULL_MAGIC for
    unparseable values only when is_xgboost=True."""
    conn = connect(testing.get_datasource())
    # prepare test data
    conn.execute(self.drop_statement)
    conn.execute(self.create_statement)
    conn.execute(self.insert_statement)
    column_name_to_type = {
        "f1": {
            "feature_name": "f1",
            "delimiter": "",
            "dtype": "float32",
            "is_sparse": False,
            "shape": []
        },
        "f2": {
            "feature_name": "f2",
            "delimiter": "",
            "dtype": "int64",
            "is_sparse": False,
            "shape": []
        },
        "f3str": {
            "feature_name": "f3str",
            "delimiter": "",
            "dtype": "string",
            "is_sparse": False,
            "shape": []
        },
        "f4sparse": {
            "feature_name": "f4sparse",
            "delimiter": "",
            "dtype": "float32",
            "is_sparse": True,
            "shape": [],
            "format": "kv"
        },
        "f5dense": {
            "feature_name": "f5dense",
            "delimiter": ",",
            "dtype": "int64",
            "is_sparse": False,
            "shape": [3]
        }
    }
    label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
    gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                       label_meta)
    idx = 0
    for row, label in gen():
        if idx == 0:
            # First fixture row: every column is populated.
            features = read_features_from_row(
                row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                column_name_to_type)
            self.assertEqual(1.0, features[0][0])
            self.assertEqual(1, features[1][0])
            self.assertEqual('a', features[2][0])
            # kv-format sparse column decodes to (indices, values).
            self.assertTrue(
                np.array_equal(np.array([[1], [2]]), features[3][0]))
            self.assertTrue(
                np.array_equal(np.array([1., 2.], dtype=np.float32),
                               features[3][1]))
            self.assertTrue(
                np.array_equal(np.array([1, 2, 3]), features[4][0]))
            self.assertEqual(0, label)
        elif idx == 1:
            # Second fixture row presumably contains NULL/empty values
            # (see self.insert_statement): decoding without is_xgboost
            # must raise ValueError.
            # NOTE(review): if no exception is raised the try block
            # passes silently; consider self.fail() after the call.
            try:
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type)
            except Exception as e:
                self.assertTrue(isinstance(e, ValueError))

            # With is_xgboost=True, missing values become the XGBoost
            # NULL magic number (or empty string/arrays).
            features = read_features_from_row(
                row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                column_name_to_type,
                is_xgboost=True)
            self.assertEqual(XGBOOST_NULL_MAGIC, features[0][0])
            self.assertEqual(int(XGBOOST_NULL_MAGIC), features[1][0])
            self.assertEqual("", features[2][0])
            self.assertTrue(np.array_equal(np.array([]), features[3][0]))
            self.assertTrue(np.array_equal(np.array([]), features[3][1]))
            self.assertTrue(
                np.array_equal(np.array([1, 2, 3]), features[4][0]))
            self.assertEqual(1, label)
        idx += 1
    self.assertEqual(idx, 2)
def keras_predict(estimator, model_params, save, result_table, is_pai,
                  pai_table, feature_column_names, feature_metas,
                  train_label_name, result_col_name, datasource, select,
                  hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass):
    """Predict with a Keras model and write one result row per input row.

    Args:
        estimator: the Keras model class.
        model_params: keyword arguments for the model constructor.
        save: path of the saved model weights to load.
        result_table: table the prediction results are written into.
        is_pai: read input from `pai_table` instead of `datasource`.
        pai_table: "project.table" name used when is_pai is True.
        feature_column_names: ordered feature column names.
        feature_metas: per-column feature metadata dict.
        train_label_name: label column name used at training time; it is
            dropped from the result columns.
        result_col_name: column name that stores the prediction result.
        datasource: database connection string (unused on PAI).
        select: SQL statement selecting the input rows (unused on PAI).
        hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass: Hive/HDFS
            write options forwarded to the result writer.
    """
    classifier = init_model_with_feature_column(estimator, model_params)
    # NOTE(review): classifier_pkg appears unused in this function —
    # confirm before removing.
    classifier_pkg = sys.modules[estimator.__module__]
    conn = None
    if is_pai:
        driver = "pai_maxcompute"
    else:
        conn = db.connect_with_data_source(datasource)
        driver = conn.driver

    if is_pai:
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        gen = db.pai_maxcompute_db_generator(formatted_pai_table)
        selected_cols = feature_column_names
    else:
        gen = db.db_generator(conn, select)
        selected_cols = db.selected_cols(conn, select)

    def eval_input_fn(batch_size, cache=False):
        # Build a tf.data.Dataset over the row generator; sparse columns
        # are represented as (int64, int32, int64) triples.
        feature_types = []
        for name in feature_column_names:
            # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
            if feature_metas[name]["is_sparse"]:
                feature_types.append((tf.int64, tf.int32, tf.int64))
            else:
                feature_types.append(get_dtype(feature_metas[name]["dtype"]))
        tf_gen = tf_generator(gen, selected_cols, feature_column_names,
                              feature_metas)
        dataset = tf.data.Dataset.from_generator(tf_gen,
                                                 (tuple(feature_types), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
        dataset = dataset.map(ds_mapper).batch(batch_size)
        if cache:
            dataset = dataset.cache()
        return dataset

    # NOTE: always use batch_size=1 when predicting to get the pairs of
    # features and predict results to insert into result table.
    pred_dataset = eval_input_fn(1)
    one_batch = next(iter(pred_dataset))
    # NOTE: must run predict one batch to initialize parameters
    # see: https://www.tensorflow.org/alpha/guide/keras/saving_and_serializing#saving_subclassed_models
    classifier.predict_on_batch(one_batch)
    classifier.load_weights(save)
    pred_dataset = eval_input_fn(1, cache=True).make_one_shot_iterator()
    column_names = selected_cols[:]
    # NOTE(review): list.index raises ValueError when the label is absent,
    # so train_label_index can never actually be -1 here — confirm the
    # label column is always part of selected_cols.
    train_label_index = selected_cols.index(train_label_name)
    if train_label_index != -1:
        del column_names[train_label_index]
    column_names.append(result_col_name)
    with db.buffered_db_writer(driver, conn, result_table, column_names, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for features in pred_dataset:
            result = classifier.predict_on_batch(features)
            # FIXME(typhoonzero): determine the predict result is
            # classification by adding the prediction result together
            # to see if it is close to 1.0.
            if len(result[0]) == 1:  # regression result
                result = result[0][0]
            else:
                # NOTE(review): `sum` shadows the builtin within this loop.
                sum = 0
                for i in result[0]:
                    sum += i
                if np.isclose(sum, 1.0):  # classification result
                    result = result[0].argmax(axis=-1)
                else:
                    result = result[0]  # multiple regression result
            row = []
            for idx, name in enumerate(feature_column_names):
                val = features[name].numpy()[0][0]
                row.append(str(val))
            if isinstance(result, np.ndarray):
                if len(result) > 1:
                    # NOTE(typhoonzero): if the output dimension > 1, format
                    # output tensor using a comma separated string. Only
                    # available for keras models.
                    row.append(",".join([str(i) for i in result]))
                else:
                    row.append(str(result[0]))
            else:
                row.append(str(result))
            w.write(row)
    del pred_dataset
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, datasource, select,
                      hdfs_namenode_addr, hive_location, hdfs_user,
                      hdfs_pass, is_pai, pai_table):
    """Predict with an exported TF estimator SavedModel and write results.

    Loads the SavedModel path from the local "exported_path" file,
    serializes each input row into a tf.train.Example, runs the
    "predict" signature and appends the class id (or regression value)
    to the row written into `result_table`.

    Args:
        estimator: kept for interface compatibility.
        model_params: kept for interface compatibility.
        save: kept for interface compatibility.
        result_table: table the prediction results are written into.
        feature_column_names: ordered feature column names.
        feature_column_names_map: param-name -> feature column names, used
            to locate the column for DNNLinearCombined-style models.
        feature_columns: dict of feature column objects.
        feature_metas: per-column feature metadata dict.
        train_label_name: label column name dropped from the result rows.
        result_col_name: column name that stores the prediction result.
        datasource: database connection string (unused on PAI).
        select: SQL statement selecting the input rows (unused on PAI).
        hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass: Hive/HDFS
            write options forwarded to the result writer.
        is_pai: read input from `pai_table` instead of `datasource`.
        pai_table: "project.table" name used when is_pai is True.
    """
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        selected_cols = db.pai_selected_cols(formatted_pai_table)
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table)()
    else:
        driver = conn.driver
        # bypass all selected cols to the prediction result table
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)()

    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        # Serialize feature i of the row tuple x into the Example proto.
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and
                # linear_feature_columns param.
                idx = -1
                # FIX: the bare `except:` clauses are narrowed to the
                # exceptions these lookups can raise (missing map key or
                # feature name not present in the list).
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except (KeyError, ValueError):
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except (KeyError, ValueError):
                        pass
                if idx == -1:
                    # FIX: the original message left the %s placeholder
                    # unfilled; interpolate the missing feature name.
                    raise ValueError(
                        "can not found feature %s in all feature columns" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        # Run the SavedModel "predict" signature on one serialized Example.
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
def load_db_data_to_data_frame(datasource,
                               select=None,
                               odps_table=None,
                               load_schema_only=False):
    """Load query/table data from a database into a pandas DataFrame.

    Column dtypes are inferred while scanning rows: a column becomes str
    as soon as one value cannot be parsed as a number, float64 when any
    numeric value is non-integral, and int64 otherwise.

    Args:
        datasource: database connection string.
        select: SQL select statement, used when odps_table is None.
        odps_table: "project.table"; when set, read the table through the
            ODPS tunnel instead of running `select`.
        load_schema_only: when True, return an empty DataFrame holding
            only the column names.

    Returns:
        A pandas DataFrame with the fetched rows.

    Raises:
        ValueError: when the ODPS select statement fails.
    """
    if odps_table is None:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        if load_schema_only:
            return pd.DataFrame(columns=selected_cols)
        generator = db.db_generator(conn, select)
    else:
        project, table = odps_table.split('.')
        conn = db.connect_with_data_source(datasource)
        schema = conn.get_table(table).schema
        selected_cols = [column.name for column in schema]
        if load_schema_only:
            return pd.DataFrame(columns=selected_cols)

        select_sql = "SELECT * FROM {}".format(table)
        instance = conn.execute_sql(select_sql)
        if not instance.is_successful():
            raise ValueError('cannot get data from table {}.{}'.format(
                project, table))

        def generator_func():
            from odps import tunnel
            compress = tunnel.CompressOption.CompressAlgorithm.ODPS_ZLIB
            with instance.open_reader(tunnel=False,
                                      compress=compress) as reader:
                for record in reader:
                    row_value = [
                        record[i]
                        for i in six.moves.range(len(selected_cols))
                    ]
                    yield row_value, None

        generator = generator_func

    dtypes = [None] * len(selected_cols)
    values = [[] for _ in six.moves.range(len(selected_cols))]
    for row_value, _ in generator():
        for i, item in enumerate(row_value):
            # FIX: use the builtin `str` instead of `np.str`, a deprecated
            # alias for `str` that was removed in NumPy >= 1.24.
            if dtypes[i] == str:
                # Column is already known to be textual; keep raw values.
                values[i].append(item)
                continue

            float_value = None
            try:
                float_value = float(item)
            except (TypeError, ValueError):
                # FIX: narrowed from a bare `except:` — only conversion
                # failures mean "not a number".
                pass

            if float_value is None:  # cannot convert to float value
                dtypes[i] = str
            else:
                item = float_value
                int_value = long(item) if six.PY2 else int(item)
                if int_value != item:
                    # A non-integral value promotes the column to float64.
                    dtypes[i] = np.float64

            values[i].append(item)

    numpy_dict = collections.OrderedDict()
    for col_name, dtype, value in six.moves.zip(selected_cols, dtypes,
                                                values):
        if dtype is None:
            # All values parsed as integral numbers.
            dtype = np.int64
        numpy_dict[col_name] = np.array(value, dtype=dtype)

    df = pd.DataFrame(data=numpy_dict)
    return df