def test_generator(self):
    conn = connect(testing.get_datasource())
    # prepare test data
    conn.execute(self.drop_statement)
    conn.execute(self.create_statement)
    conn.execute(self.insert_statement)

    column_name_to_type = {
        "features": {
            "feature_name": "features",
            "delimiter": "",
            "dtype": "float32",
            "is_sparse": False,
            "shape": []
        }
    }
    label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
    gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                       label_meta)
    idx = 0
    for row, label in gen():
        features = read_features_from_row(row, ["features"], ["features"],
                                          column_name_to_type)
        d = (features, label)
        if idx == 0:
            self.assertEqual(d, (((1.0, ), ), 0))
        elif idx == 1:
            self.assertEqual(d, (((2.0, ), ), 1))
        idx += 1
    self.assertEqual(idx, 2)
def reader():
    for row, label in gen():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_metas)
        if label is None:
            yield (features, )
        else:
            yield (features, label)
def reader():
    for row, label in gen():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_metas)
        features = list(features)
        for i, f in enumerate(features):
            # unwrap dense columns: a one-element tuple holding an
            # ndarray becomes the bare ndarray
            if len(f) == 1 and isinstance(f[0], np.ndarray):
                features[i] = f[0]
        features = tuple(features)
        if label is None:
            yield (features, )
        else:
            yield (features, label)
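# --- Illustrative sketch (not part of the original code) of the unwrapping
# the reader above performs. Each dense feature arrives from
# db.read_features_from_row as a one-element tuple holding an ndarray; the
# reader strips that wrapper so downstream consumers see the bare array.
# The sample values below are assumed for demonstration.
import numpy as np

features = [(np.array([1.0, 2.0, 3.0]), ), (np.array([7], dtype=np.int64), )]
for i, f in enumerate(features):
    if len(f) == 1 and isinstance(f[0], np.ndarray):
        features[i] = f[0]
print(tuple(features))  # (array([1., 2., 3.]), array([7]))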
def test_generator(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=int(port))
        # prepare test data
        execute(driver, conn, self.drop_statement)
        execute(driver, conn, self.create_statement)
        execute(driver, conn, self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_meta = {
            "feature_name": "label",
            "shape": [],
            "delimiter": ""
        }
        gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                           label_meta)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"],
                                              ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    if is_pai:
        # (TODO: lhw) we may specify pai_explain_table in datasource
        # and discard the condition statement here
        conn = PaiIOConnection.from_table(pai_explain_table)
        stream = db.db_generator(conn, None, label_meta)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
    selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []
    sizes = []
    offsets = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row,
                                             selected_cols,
                                             feature_column_names,
                                             feature_metas,
                                             is_xgboost=True)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column names according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features,
        # the result column names would be "c_0", "c_1" and "c_2".
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}_{}'.format(
                            feature_names[j], k))
            xs = pd.DataFrame(columns=column_names)

        xs.loc[i] = flatten_features
        i += 1

    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    #   import pandas as pd
    #   feature_column_names = ["a", "b"]
    #   xs = pd.DataFrame(columns=feature_column_names)
    #   for i in range(10):
    #       xs.loc[i] = [int(j) for j in range(2)]
    #   print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
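# --- Standalone demo (toy values, not from the original source) of the
# sparse-to-dense conversion inside xgb_shap_dataset: a
# (col_indices, values, dense_shape) triple becomes a one-row CSR matrix,
# then a dense array SHAP can consume.
import numpy as np
import scipy.sparse

col_indices = np.array([0, 3, 4])
values = np.array([1.0, 2.5, -1.0], dtype=np.float32)
dense_shape = [8]

size = int(np.prod(dense_shape))
row_indices = np.zeros(col_indices.size, dtype=np.int64)  # single-row matrix
dense = scipy.sparse.csr_matrix((values, (row_indices, col_indices)),
                                shape=[1, size]).toarray()
print(dense)  # [[ 1.   0.   0.   2.5 -1.   0.   0.   0. ]]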
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, train_label_name,
                      result_col_name, driver, conn, predict_generator,
                      selected_cols, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[
                    feature_name].float_list.value.extend(list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[
                    feature_name].int64_list.value.extend(list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX have dnn_feature_columns and
                # linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except:  # noqa: E722
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except:  # noqa: E722
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in any feature column" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[
                    feature_name].float_list.value.extend(
                        (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[
                    feature_name].bytes_list.value.extend(x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
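# --- Toy illustration (assumed column names) of the write_cols bookkeeping
# at the top of estimator_predict: the training label column is dropped from
# the output columns and the prediction column is appended.
selected_cols = ["sepal_length", "petal_width", "class"]
train_label_name = "class"
write_cols = selected_cols[:]
try:
    train_label_index = selected_cols.index(train_label_name)
except ValueError:
    train_label_index = -1
if train_label_index != -1:
    del write_cols[train_label_index]
write_cols.append("predicted_class")
print(write_cols)  # ['sepal_length', 'petal_width', 'predicted_class']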
def estimator_predict(result_table, feature_column_names, feature_metas,
                      train_label_name, result_col_name, conn,
                      predict_generator, selected_cols):
    write_cols = selected_cols[:]
    try:
        train_label_index = selected_cols.index(train_label_name)
    except ValueError:
        train_label_index = -1
    if train_label_index != -1:
        del write_cols[train_label_index]
    write_cols.append(result_col_name)

    # load from the exported model
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["delimiter_kv"] != "":
                keys = x[0][i][0].flatten()
                weights = x[0][i][1].flatten()
                weight_dtype_str = feature_metas[feature_name]["dtype_weight"]
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    raise ValueError(
                        "key-value features with float keys are not "
                        "supported")
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(keys))
                elif dtype_str == "string" or dtype_str == DataType.STRING:
                    example.features.feature[
                        feature_name].bytes_list.value.extend(list(keys))
                if (weight_dtype_str == "float32"
                        or weight_dtype_str == "float64"
                        or weight_dtype_str == DataType.FLOAT32):
                    example.features.feature["_".join(
                        [feature_name,
                         "weight"])].float_list.value.extend(list(weights))
                else:
                    raise ValueError(
                        "unsupported key-value column weight data type: %s" %
                        weight_dtype_str)
            else:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only
                values = x[0][i][0].flatten()
                if (dtype_str == "float32" or dtype_str == "float64"
                        or dtype_str == DataType.FLOAT32):
                    example.features.feature[
                        feature_name].float_list.value.extend(list(values))
                elif (dtype_str == "int32" or dtype_str == "int64"
                      or dtype_str == DataType.INT64):
                    example.features.feature[
                        feature_name].int64_list.value.extend(list(values))
        else:
            if (dtype_str == "float32" or dtype_str == "float64"
                    or dtype_str == DataType.FLOAT32):
                # need to pass a tuple(float, )
                example.features.feature[
                    feature_name].float_list.value.extend(
                        (float(x[0][i][0]), ))
            elif (dtype_str == "int32" or dtype_str == "int64"
                  or dtype_str == DataType.INT64):
                example.features.feature[
                    feature_name].int64_list.value.extend(
                        (int(x[0][i][0]), ))
            elif dtype_str == "string" or dtype_str == DataType.STRING:
                example.features.feature[
                    feature_name].bytes_list.value.extend(x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(conn, result_table, write_cols, 100) as w:
        for row, _ in predict_generator():
            features = db.read_features_from_row(row,
                                                 selected_cols,
                                                 feature_column_names,
                                                 feature_metas,
                                                 is_xgboost=False)
            result = predict((features, ))
            if train_label_index != -1 and len(row) > train_label_index:
                del row[train_label_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
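# --- A minimal sketch (assumed feature names/values) of the tf.train.Example
# encoding that add_to_example above builds, plus the serialized form passed
# to the SavedModel's "predict" signature.
import tensorflow as tf

example = tf.train.Example()
example.features.feature["sepal_length"].float_list.value.extend((5.0, ))
example.features.feature["petal_count"].int64_list.value.extend((3, ))
serialized = example.SerializeToString()

# The exported estimator would then be invoked roughly as:
#   imported.signatures["predict"](examples=tf.constant([serialized]))
parsed = tf.train.Example.FromString(serialized)  # round-trip sanity check
print(parsed.features.feature["sepal_length"].float_list.value)  # [5.0]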
def dump_dmatrix(filename,
                 generator,
                 feature_column_names,
                 feature_metas,
                 has_label,
                 selected_cols,
                 batch_size=None,
                 transform_fn=None,
                 raw_data_dir=None):
    # TODO(yancey1989): generate group and weight text file if necessary
    row_id = 0

    if raw_data_dir:
        index = filename.rindex('/') + 1 if '/' in filename else 0
        raw_data_fid = open(os.path.join(raw_data_dir, filename[index:]), 'a')
    else:
        raw_data_fid = None

    with open(filename, 'a') as f:
        for row, label in generator:
            features = db.read_features_from_row(row,
                                                 selected_cols,
                                                 feature_column_names,
                                                 feature_metas,
                                                 is_xgboost=True)

            if raw_data_fid is not None:
                raw_data_fid.write(
                    DMATRIX_FILE_SEP.join([str(r) for r in row]) + "\n")

            if transform_fn:
                features = transform_fn(features)

            row_data = []
            offset = 0
            for i, v in enumerate(features):
                if len(v) == 1:  # dense feature
                    value = v[0]
                    if isinstance(value, np.ndarray):
                        value = value.reshape((-1, ))
                        row_data.extend([
                            "{}:{}".format(k + offset, item)
                            for k, item in enumerate(value)
                        ])
                        offset += value.size
                    else:
                        row_data.append("{}:{}".format(offset, value))
                        offset += 1
                else:  # sparse feature
                    indices = v[0]
                    value = v[1].reshape((-1, ))
                    dense_size = np.prod(v[2])
                    row_data.extend([
                        "{}:{}".format(idx + offset, item)
                        for idx, item in six.moves.zip(indices, value)
                    ])
                    offset += dense_size

            if has_label:
                row_data = [str(label)] + row_data
            f.write(DMATRIX_FILE_SEP.join(row_data) + "\n")
            row_id += 1
            # batch_size == None means use all data in generator
            if batch_size is None:
                continue
            if row_id >= batch_size:
                break

    if raw_data_fid is not None:
        raw_data_fid.close()

    # return the number of rows written
    return row_id
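# --- Sketch of consuming the file dump_dmatrix writes (assumed path, and
# assuming DMATRIX_FILE_SEP is whitespace the parser accepts): the
# "label index:value ..." lines are the LibSVM text format, which
# xgboost.DMatrix loads directly.
import xgboost as xgb

# Suppose dump_dmatrix produced "train.txt" with lines like:
#   1 0:5.1 1:3.5 2:1.4
# Newer xgboost versions want the format spelled out in the URI.
dtrain = xgb.DMatrix("train.txt?format=libsvm")
print(dtrain.num_row(), dtrain.num_col())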
def test_generator(self):
    conn = connect(testing.get_datasource())
    # prepare test data
    conn.execute(self.drop_statement)
    conn.execute(self.create_statement)
    conn.execute(self.insert_statement)

    column_name_to_type = {
        "f1": {
            "feature_name": "f1",
            "delimiter": "",
            "dtype": "float32",
            "is_sparse": False,
            "shape": []
        },
        "f2": {
            "feature_name": "f2",
            "delimiter": "",
            "dtype": "int64",
            "is_sparse": False,
            "shape": []
        },
        "f3str": {
            "feature_name": "f3str",
            "delimiter": "",
            "dtype": "string",
            "is_sparse": False,
            "shape": []
        },
        "f4sparse": {
            "feature_name": "f4sparse",
            "delimiter": "",
            "dtype": "float32",
            "is_sparse": True,
            "shape": [],
            "format": "kv"
        },
        "f5dense": {
            "feature_name": "f5dense",
            "delimiter": ",",
            "dtype": "int64",
            "is_sparse": False,
            "shape": [3]
        }
    }
    label_meta = {"feature_name": "label", "shape": [], "delimiter": ""}
    gen = db_generator(conn, "SELECT * FROM test_table_float_fea",
                       label_meta)
    idx = 0
    for row, label in gen():
        if idx == 0:
            features = read_features_from_row(
                row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                column_name_to_type)
            self.assertEqual(1.0, features[0][0])
            self.assertEqual(1, features[1][0])
            self.assertEqual('a', features[2][0])
            self.assertTrue(
                np.array_equal(np.array([[1], [2]]), features[3][0]))
            self.assertTrue(
                np.array_equal(np.array([1., 2.], dtype=np.float32),
                               features[3][1]))
            self.assertTrue(
                np.array_equal(np.array([1, 2, 3]), features[4][0]))
            self.assertEqual(0, label)
        elif idx == 1:
            # this row contains missing values: reading it without
            # is_xgboost=True should raise ValueError
            try:
                features = read_features_from_row(
                    row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                    column_name_to_type)
            except Exception as e:
                self.assertTrue(isinstance(e, ValueError))

            # with is_xgboost=True, missing values are filled with
            # XGBOOST_NULL_MAGIC instead
            features = read_features_from_row(
                row, ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                ["f1", "f2", "f3str", "f4sparse", "f5dense"],
                column_name_to_type,
                is_xgboost=True)
            self.assertEqual(XGBOOST_NULL_MAGIC, features[0][0])
            self.assertEqual(int(XGBOOST_NULL_MAGIC), features[1][0])
            self.assertEqual("", features[2][0])
            self.assertTrue(np.array_equal(np.array([]), features[3][0]))
            self.assertTrue(np.array_equal(np.array([]), features[3][1]))
            self.assertTrue(
                np.array_equal(np.array([1, 2, 3]), features[4][0]))
            self.assertEqual(1, label)
        idx += 1
    self.assertEqual(idx, 2)