def reader():
    for row, label in gen():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_metas)
        if label is None:
            yield (features, )
        else:
            yield (features, label)
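# A reader in this shape is typically wrapped with
# tf.data.Dataset.from_generator. The sketch below is a minimal, hypothetical
# illustration assuming a single dense float32 feature column and an int64
# label, matching the (((value, ), ), label) tuples the generator yields;
# the real output_types depend on feature_metas.
import tensorflow as tf

dataset = tf.data.Dataset.from_generator(
    reader, output_types=(((tf.float32, ), ), tf.int64))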
def test_generator(self):
    driver = os.environ.get('SQLFLOW_TEST_DB')
    if driver == "mysql":
        user, password, host, port, database = testing_mysql_cfg()
        conn = connect(driver,
                       database,
                       user=user,
                       password=password,
                       host=host,
                       port=int(port))
        # prepare test data
        execute(driver, conn, self.drop_statement)
        execute(driver, conn, self.create_statement)
        execute(driver, conn, self.insert_statement)

        column_name_to_type = {
            "features": {
                "feature_name": "features",
                "delimiter": "",
                "dtype": "float32",
                "is_sparse": False,
                "shape": []
            }
        }
        label_spec = {"feature_name": "label", "shape": [], "delimiter": ""}
        gen = db_generator(driver, conn, "SELECT * FROM test_table_float_fea",
                           ["features"], label_spec, column_name_to_type)
        idx = 0
        for row, label in gen():
            features = read_features_from_row(row, ["features"], ["features"],
                                              column_name_to_type)
            d = (features, label)
            if idx == 0:
                self.assertEqual(d, (((1.0, ), ), 0))
            elif idx == 1:
                self.assertEqual(d, (((2.0, ), ), 1))
            idx += 1
        self.assertEqual(idx, 2)
def xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                     feature_specs, is_pai, pai_explain_table):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = feature_column_names[:]
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    xs = pd.DataFrame(columns=feature_column_names)
    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        xs.loc[i] = [item[0] for item in features]
        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use the code below to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for fname in feature_column_names:
        dtype = feature_specs[fname]["dtype"]
        xs[fname] = xs[fname].astype(dtype)
    return xs
def dump_dmatrix(filename,
                 generator,
                 feature_column_names,
                 feature_specs,
                 has_label,
                 selected_cols,
                 batch_size=None):
    # TODO(yancey1989): generate group and weight text file if necessary
    row_id = 0
    with open(filename, 'a') as f:
        for row, label in generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_specs)
            row_data = []
            for i, v in enumerate(features):
                fname = feature_column_names[i]
                dtype = feature_specs[fname]["dtype"]
                if dtype == "int32" or dtype == "int64":
                    row_data.append("%d:%d" % (i, v[0] or 0))
                elif dtype == "float32" or dtype == "float64":
                    row_data.append("%d:%f" % (i, v[0] or 0))
                else:
                    raise ValueError(
                        "unsupported column dtype %s for xgboost" % dtype)
            if has_label:
                row_data = [str(label)] + row_data
            f.write("\t".join(row_data) + "\n")
            row_id += 1
            # batch_size is None means using all data in the generator
            if batch_size is None:
                continue
            if row_id >= batch_size:
                break
    # return the number of rows written
    return row_id
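# dump_dmatrix writes one LibSVM-style line per row ("label\tindex:value..."),
# which XGBoost can parse directly. A minimal usage sketch; the file name and
# training parameters here are assumptions:
import xgboost as xgb

dtrain = xgb.DMatrix("train.txt")  # newer releases may need "train.txt?format=libsvm"
booster = xgb.train({"objective": "reg:squarederror"}, dtrain,
                    num_boost_round=10)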
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_spec,
                     feature_specs,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    label_column_name = label_spec["feature_name"]
    if is_pai:
        pai_table_parts = pai_explain_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        stream = db.pai_maxcompute_db_generator(formatted_pai_table,
                                                feature_column_names,
                                                label_column_name,
                                                feature_specs)
        selected_cols = db.pai_selected_cols(formatted_pai_table)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn.driver, conn, select,
                                 feature_column_names, label_spec,
                                 feature_specs)
        selected_cols = db.selected_cols(conn.driver, conn, select)

    if transform_fn:
        column_names = transform_fn.get_column_names()
    else:
        column_names = feature_column_names

    # NOTE(sneaxiy): pandas.DataFrame does not support Tensor whose rank is
    # larger than 2. But `INDICATOR` would generate a one-hot vector for each
    # element, and pandas.DataFrame would not accept `INDICATOR` results as
    # its input. In short, we do not support `TO EXPLAIN` when using
    # `INDICATOR`.
    xs = pd.DataFrame(columns=column_names)

    dtypes = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row, selected_cols,
                                             feature_column_names,
                                             feature_specs)
        if transform_fn:
            features = transform_fn(features)

        # TODO(sneaxiy): support sparse features in `TO EXPLAIN`
        features = [item[0] for item in features]
        xs.loc[i] = features

        if i == 0:
            for f in features:
                if isinstance(f, np.ndarray):
                    if f.dtype == np.float32 or f.dtype == np.float64:
                        dtypes.append('float32')
                    elif f.dtype == np.int32 or f.dtype == np.int64:
                        dtypes.append('int64')
                    else:
                        raise ValueError(
                            'Unsupported data type {}'.format(f.dtype))
                elif isinstance(f, (np.float32, np.float64, float)):
                    dtypes.append('float32')
                elif isinstance(f, (np.int32, np.int64, six.integer_types)):
                    dtypes.append('int64')
                else:
                    raise ValueError(
                        'Unsupported data type {}'.format(type(f)))

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use the code below to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    for dtype, name in zip(dtypes, column_names):
        xs[name] = xs[name].astype(dtype)

    return xs
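# The DataFrame built by xgb_shap_dataset feeds SHAP's tree explainer. A
# minimal sketch, assuming `booster` is a trained XGBoost model and `xs` is
# the return value of xgb_shap_dataset:
import shap

explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(xs)  # one attribution per sample/feature
shap.summary_plot(shap_values, xs)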
def estimator_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_column_names_map,
                      feature_columns, feature_metas, result_col_name,
                      datasource, select, hdfs_namenode_addr, hive_location,
                      hdfs_user, hdfs_pass, is_pai, pai_table):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)

    column_names = feature_column_names[:]
    column_names.append(result_col_name)

    if is_pai:
        driver = "pai_maxcompute"
        conn = None
        pai_table_parts = pai_table.split(".")
        formatted_pai_table = "odps://%s/tables/%s" % (pai_table_parts[0],
                                                       pai_table_parts[1])
        selected_cols = db.pai_selected_cols(formatted_pai_table)
        predict_generator = db.pai_maxcompute_db_generator(
            formatted_pai_table, feature_column_names, None, feature_metas)()
    else:
        driver = conn.driver
        # pass all selected columns through to the prediction result table
        selected_cols = db.selected_cols(conn.driver, conn, select)
        predict_generator = db.db_generator(conn.driver, conn, select,
                                            feature_column_names, None,
                                            feature_metas)()

    write_cols, target_col_index = write_cols_from_selected(
        result_col_name, selected_cols)

    # load the exported model from the path recorded at train time
    with open("exported_path", "r") as fn:
        export_path = fn.read()
    if tf_is_version2():
        imported = tf.saved_model.load(export_path)
    else:
        imported = tf.saved_model.load_v2(export_path)

    def add_to_example(example, x, i):
        feature_name = feature_column_names[i]
        dtype_str = feature_metas[feature_name]["dtype"]
        if feature_metas[feature_name]["delimiter"] != "":
            if feature_metas[feature_name]["is_sparse"]:
                # NOTE(typhoonzero): sparse feature will get
                # (indices, values, shape) here, use indices only
                values = x[0][i][0].flatten()
            else:
                values = x[0][i].flatten()
            if dtype_str == "float32" or dtype_str == "float64":
                example.features.feature[feature_name].float_list.value.extend(
                    list(values))
            elif dtype_str == "int32" or dtype_str == "int64":
                example.features.feature[feature_name].int64_list.value.extend(
                    list(values))
        else:
            if "feature_columns" in feature_columns:
                idx = feature_column_names.index(feature_name)
                fc = feature_columns["feature_columns"][idx]
            else:
                # DNNLinearCombinedXXX models have dnn_feature_columns and
                # linear_feature_columns params.
                idx = -1
                try:
                    idx = feature_column_names_map[
                        "dnn_feature_columns"].index(feature_name)
                    fc = feature_columns["dnn_feature_columns"][idx]
                except (KeyError, ValueError):
                    try:
                        idx = feature_column_names_map[
                            "linear_feature_columns"].index(feature_name)
                        fc = feature_columns["linear_feature_columns"][idx]
                    except (KeyError, ValueError):
                        pass
                if idx == -1:
                    raise ValueError(
                        "cannot find feature %s in any feature column" %
                        feature_name)
            if dtype_str == "float32" or dtype_str == "float64":
                # need to pass a tuple(float, )
                example.features.feature[feature_name].float_list.value.extend(
                    (float(x[0][i][0]), ))
            elif dtype_str == "int32" or dtype_str == "int64":
                numeric_type = type(tf.feature_column.numeric_column("tmp"))
                if type(fc) == numeric_type:
                    example.features.feature[
                        feature_name].float_list.value.extend(
                            (float(x[0][i][0]), ))
                else:
                    example.features.feature[
                        feature_name].int64_list.value.extend(
                            (int(x[0][i][0]), ))
            elif dtype_str == "string":
                example.features.feature[feature_name].bytes_list.value.extend(
                    x[0][i])

    def predict(x):
        example = tf.train.Example()
        for i in range(len(feature_column_names)):
            add_to_example(example, x, i)
        return imported.signatures["predict"](
            examples=tf.constant([example.SerializeToString()]))

    with db.buffered_db_writer(driver, conn, result_table, write_cols, 100,
                               hdfs_namenode_addr, hive_location, hdfs_user,
                               hdfs_pass) as w:
        for row, _ in predict_generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_metas)
            result = predict((features, ))
            if target_col_index != -1:
                del row[target_col_index]
            if "class_ids" in result:
                row.append(str(result["class_ids"].numpy()[0][0]))
            else:
                # regression predictions
                row.append(str(result["predictions"].numpy()[0][0]))
            w.write(row)
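# For reference, add_to_example fills a tf.train.Example proto column by
# column. A minimal hand-built Example with one float and one int64 feature
# (the feature names here are hypothetical):
import tensorflow as tf

example = tf.train.Example()
example.features.feature["sepal_length"].float_list.value.extend([5.1])
example.features.feature["petal_count"].int64_list.value.extend([4])
serialized = example.SerializeToString()  # what the "predict" signature consumes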
def dump_dmatrix(filename,
                 generator,
                 feature_column_names,
                 feature_specs,
                 has_label,
                 selected_cols,
                 batch_size=None,
                 transform_fn=None,
                 raw_data_dir=None):
    # TODO(yancey1989): generate group and weight text file if necessary
    row_id = 0
    if raw_data_dir:
        # also dump the untransformed rows alongside, under raw_data_dir
        index = filename.rindex('/') + 1 if '/' in filename else 0
        raw_data_fid = open(os.path.join(raw_data_dir, filename[index:]), 'a')
    else:
        raw_data_fid = None

    with open(filename, 'a') as f:
        for row, label in generator:
            features = db.read_features_from_row(row, selected_cols,
                                                 feature_column_names,
                                                 feature_specs)

            if raw_data_fid is not None:
                row_data = ["{}:{}".format(i, r) for i, r in enumerate(row)]
                raw_data_fid.write("\t".join(row_data) + "\n")

            if transform_fn:
                features = transform_fn(features)

            row_data = []
            offset = 0
            for v in features:
                if len(v) == 1:  # dense feature
                    value = v[0]
                    if isinstance(value, np.ndarray):
                        value = value.reshape((-1, ))
                        row_data.extend([
                            "{}:{}".format(i + offset, item)
                            for i, item in enumerate(value)
                        ])
                        offset += value.size
                    else:
                        row_data.append("{}:{}".format(offset, value))
                        offset += 1
                else:  # sparse feature
                    indices = v[0]
                    value = v[1].reshape((-1))
                    dense_size = np.prod(v[2])
                    row_data.extend([
                        "{}:{}".format(i + offset, item)
                        for i, item in six.moves.zip(indices, value)
                    ])
                    offset += dense_size

            if has_label:
                row_data = [str(label)] + row_data
            f.write("\t".join(row_data) + "\n")
            row_id += 1
            # batch_size is None means using all data in the generator
            if batch_size is None:
                continue
            if row_id >= batch_size:
                break
    if raw_data_fid is not None:
        raw_data_fid.close()
    # return the number of rows written
    return row_id
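# In the sparse branch above, a feature arrives as (indices, values, shape)
# and each (index, value) pair becomes an "index:value" entry shifted by the
# running offset. A worked example with made-up values:
import numpy as np

v = (np.array([1, 4]), np.array([0.5, 0.25]), np.array([8]))
offset = 3
entries = ["{}:{}".format(i + offset, item) for i, item in zip(v[0], v[1])]
assert entries == ["4:0.5", "7:0.25"]
assert int(np.prod(v[2])) == 8  # offset then advances by the dense size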