def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
                                      result_table, label_column):
    """Get a command to submit a PAI RandomForest explain task

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: name of the result table, PAI will automatically
            create this table
        label_column: name of the label column

    Returns:
        A string which is a PAI cmd
    """
    # NOTE(typhoonzero): for PAI random forests prediction, we cannot load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; an error will
    # be reported by the PAI job if the columns do not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    schema = db.get_table_schema(conn, data_table)
    # the label column is passed separately via -DlabelColName, so exclude
    # it from the feature column list
    columns = [f[0] for f in schema if f[0] != label_column]
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    return (
        """pai -name feature_importance -project algo_public """
        """-DmodelName="%s" -DinputTableName="%s" -DoutputTableName="%s" """
        """-DlabelColName="%s" -DfeatureColNames="%s" """
    ) % (model_name, data_table, result_table, label_column,
         ",".join(columns))
def get_explain_random_forests_cmd(datasource, model_name, data_table,
                                   result_table, label_column):
    """Get PAI random forest explanation command

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: result table name
        label_column: name of the label column

    Returns:
        a PAI cmd to explain the data using the given model
    """
    # NOTE(typhoonzero): for PAI random forests prediction, we cannot load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; an error will
    # be reported by the PAI job if the columns do not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    # drop the result table if it exists
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    schema = db.get_table_schema(conn, data_table)
    fields = [f[0] for f in schema if f[0] != label_column]
    return ('''pai -name feature_importance -project algo_public '''
            '''-DmodelName="%s" -DinputTableName="%s" '''
            '''-DoutputTableName="%s" -DlabelColName="%s" '''
            '''-DfeatureColNames="%s" ''') % (model_name, data_table,
                                              result_table, label_column,
                                              ",".join(fields))
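def _example_explain_cmd():
    # A minimal usage sketch for the command builder above. The datasource
    # URI, table names and model name are hypothetical placeholders, and
    # this helper is for illustration only, not part of the module's API.
    datasource = "maxcompute://AK:SK@service.example.com/api?curr_project=demo"
    return get_explain_random_forests_cmd(datasource, "my_rf_model",
                                          "demo.iris_test",
                                          "demo.rf_explain_result", "class")
    # The returned string is a submittable PAI command such as:
    #   pai -name feature_importance -project algo_public
    #   -DmodelName="my_rf_model" -DinputTableName="demo.iris_test"
    #   -DoutputTableName="demo.rf_explain_result" -DlabelColName="class"
    #   -DfeatureColNames="sepal_length,sepal_width,petal_length,petal_width"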
def get_pai_predict_cmd(datasource, project, oss_model_path, model_name,
                        predict_table, result_table, model_type, model_params,
                        job_file, params_file, cwd):
    """Get the command to submit a PAI prediction task

    Args:
        datasource: current datasource
        project: current project
        oss_model_path: the place to load the model from
        model_name: model used to do the prediction
        predict_table: where the tmp prediction data set is stored
        result_table: table to store the prediction result
        model_type: type of the model, see also get_oss_saved_model_type
        model_params: parameters specified by the WITH clause
        job_file: tar file including code and libs to execute on PAI
        params_file: extra params file
        cwd: current working dir

    Returns:
        The command to submit the PAI prediction task
    """
    # NOTE(typhoonzero): for PAI machine learning toolkit prediction, we
    # cannot load the TrainStmt since the model saving is fully done by PAI.
    # We directly use the columns in the SELECT statement for prediction; an
    # error will be reported by the PAI job if the columns do not match.
    conf = cluster_conf.get_cluster_config(model_params)
    conn = db.connect_with_data_source(datasource)
    if model_type == EstimatorType.PAIML:
        schema = db.get_table_schema(conn, predict_table)
        result_fields = [col[0] for col in schema]
        return ('''pai -name prediction -DmodelName="%s" '''
                '''-DinputTableName="%s" -DoutputTableName="%s" '''
                '''-DfeatureColNames="%s" -DappendColNames="%s"''') % (
                    model_name, predict_table, result_table,
                    ",".join(result_fields), ",".join(result_fields))
    else:
        # For TensorFlow and XGBoost, we build a pai-tf cmd to submit the
        # task
        return get_pai_tf_cmd(conf, job_file, params_file, ENTRY_FILE,
                              model_name, oss_model_path, predict_table, "",
                              result_table, project)
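def _example_predict_cmd_dispatch():
    # A usage sketch of the dispatch above (all values hypothetical). For a
    # PAIML-type model the result is a plain "pai -name prediction ..."
    # string; for TensorFlow/XGBoost models the pai-tf command produced by
    # get_pai_tf_cmd is returned instead.
    datasource = "maxcompute://AK:SK@service.example.com/api?curr_project=demo"
    return get_pai_predict_cmd(datasource, "demo",
                               "oss://bucket/models/my_model/", "my_model",
                               "demo.predict_input", "demo.predict_result",
                               EstimatorType.PAIML, {}, "job.tar.gz",
                               "params.txt", "/tmp")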
def test_verify_column_name_and_type(self):
    conn = testing.get_singleton_db_connection()
    train_table = "iris.train"
    test_table = "iris.test"
    train_select = [
        "petal_length", "petal_width", "sepal_length", "sepal_width", "class"
    ]
    # the test set selects the same columns as the train set, so
    # verification passes
    test_select = train_select
    verify_column_name_and_type(
        conn, self.generate_select(train_table, train_select),
        self.generate_select(test_table, test_select), "class")

    # the label column may be absent from the test set
    test_select = [
        "petal_length", "petal_width", "sepal_length", "sepal_width"
    ]
    verify_column_name_and_type(
        conn, self.generate_select(train_table, train_select),
        self.generate_select(test_table, test_select), "class")

    # a missing feature column raises ValueError
    test_select = ["petal_length", "petal_width", "sepal_length"]
    with self.assertRaises(ValueError):
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")

    cursor = conn.cursor()
    name_and_type = dict(db.get_table_schema(conn, test_table))
    new_table_name = "iris.verifier_test_table"
    name_and_type["petal_length"] = "VARCHAR(255)"  # change the data type
    create_column_str = ",".join(
        ["%s %s" % (n, t) for n, t in name_and_type.items()])
    drop_sql = "DROP TABLE IF EXISTS %s" % new_table_name
    create_sql = "CREATE TABLE %s(%s)" % (new_table_name, create_column_str)
    cursor.execute(drop_sql)
    cursor.execute(create_sql)
    # a mismatched column type also raises ValueError
    with self.assertRaises(ValueError):
        test_select = train_select
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(new_table_name, test_select), "class")
    cursor.execute(drop_sql)
    cursor.close()
def get_create_shap_result_sql(conn, data_table, result_table, label_column):
    """Get a SQL statement which creates a result table for SHAP

    Args:
        conn: a database connection
        data_table: table name to read data from
        result_table: result table name
        label_column: column name of the label

    Returns:
        a SQL statement to create the SHAP result table
    """
    schema = db.get_table_schema(conn, data_table)
    fields = ["%s STRING" % f[0] for f in schema if f[0] != label_column]
    return "CREATE TABLE IF NOT EXISTS %s (%s)" % (result_table,
                                                   ",".join(fields))
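def _example_shap_result_sql():
    # Sketch of the SQL the builder above produces (hypothetical datasource
    # and table names). Every feature column is typed STRING, matching the
    # builder; the label column is dropped from the result schema.
    conn = db.connect_with_data_source(
        "maxcompute://AK:SK@service.example.com/api?curr_project=demo")
    return get_create_shap_result_sql(conn, "demo.iris_test",
                                      "demo.shap_result", "class")
    # For an iris-like data_table the generated statement would be:
    #   CREATE TABLE IF NOT EXISTS demo.shap_result (sepal_length STRING,
    #   sepal_width STRING,petal_length STRING,petal_width STRING)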
def create_predict_result_table(datasource, select, result_table,
                                label_column, train_label_column, model_type):
    """Create the predict result table with the given name and label column

    Args:
        datasource: current datasource
        select: SQL statement to get the prediction data set
        result_table: the table name to save the result
        label_column: name of the label column; if it does not exist in the
            select result, we will add an INT column to the result table
        train_label_column: name of the label column used when training
        model_type: type of the model, defined in runtime.model.oss
    """
    conn = db.connect_with_data_source(datasource)
    conn.execute("DROP TABLE IF EXISTS %s" % result_table)
    # PAI ML will create the result table itself
    if model_type == EstimatorType.PAIML:
        return

    create_table_sql = "CREATE TABLE %s AS SELECT * FROM %s LIMIT 0" % (
        result_table, select)
    conn.execute(create_table_sql)

    # if the label is not in the data table, add an INT column for it
    schema = db.get_table_schema(conn, result_table)
    col_type = "INT"
    for (name, ctype) in schema:
        if name == train_label_column or name == label_column:
            col_type = ctype
            break
    col_names = [col[0] for col in schema]
    if label_column not in col_names:
        conn.execute("ALTER TABLE %s ADD %s %s" %
                     (result_table, label_column, col_type))
    if train_label_column != label_column and train_label_column in col_names:
        conn.execute("ALTER TABLE %s DROP COLUMN %s" %
                     (result_table, train_label_column))
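def _example_create_predict_result_table():
    # Usage sketch for the function above (hypothetical datasource and table
    # names). EstimatorType.TENSORFLOW is assumed here as the non-PAIML
    # member name; for EstimatorType.PAIML the function returns early
    # because PAI ML creates the result table itself.
    create_predict_result_table(
        "maxcompute://AK:SK@service.example.com/api?curr_project=demo",
        "SELECT * FROM demo.iris_test", "demo.predict_result", "class",
        "class", EstimatorType.TENSORFLOW)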
def test_get_table_schema(self):
    driver = testing.get_driver()
    conn = testing.get_singleton_db_connection()
    if driver == "mysql":
        schema = get_table_schema(conn, "iris.train")
        expect = (
            ('sepal_length', 'FLOAT'),
            ('sepal_width', 'FLOAT'),
            ('petal_length', 'FLOAT'),
            ('petal_width', 'FLOAT'),
            ('class', 'INT(11)'),
        )
        self.assertTrue(np.array_equal(expect, schema))

        schema = selected_columns_and_types(
            conn,
            "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
            "class FROM iris.train")
        expect = [
            ("sepal_length", "FLOAT"),
            ("new_petal_width", "DOUBLE"),
            ("class", "INT"),
        ]
        self.assertTrue(np.array_equal(expect, schema))
    elif driver == "hive":
        schema = get_table_schema(conn, "iris.train")
        expect = (
            ('sepal_length', 'FLOAT'),
            ('sepal_width', 'FLOAT'),
            ('petal_length', 'FLOAT'),
            ('petal_width', 'FLOAT'),
            ('class', 'INT'),
        )
        self.assertTrue(np.array_equal(expect, schema))

        schema = selected_columns_and_types(
            conn,
            "SELECT sepal_length, petal_width * 2.3 AS new_petal_width, "
            "class FROM iris.train")
        expect = [
            ("sepal_length", "FLOAT"),
            ("new_petal_width", "FLOAT"),
            ("class", "INT"),
        ]
        self.assertTrue(np.array_equal(expect, schema))
    elif driver == "maxcompute":
        case_db = os.getenv("SQLFLOW_TEST_DB_MAXCOMPUTE_PROJECT")
        table = "%s.sqlflow_test_iris_train" % case_db
        schema = get_table_schema(conn, table)
        expect = [
            ('sepal_length', 'DOUBLE'),
            ('sepal_width', 'DOUBLE'),
            ('petal_length', 'DOUBLE'),
            ('petal_width', 'DOUBLE'),
            ('class', 'BIGINT'),
        ]
        self.assertTrue(np.array_equal(expect, schema))

        schema = selected_columns_and_types(
            conn,
            "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
            "class FROM %s" % table)
        expect = [
            ("sepal_length", "DOUBLE"),
            ("new_petal_width", "DOUBLE"),
            ("class", "BIGINT"),
        ]
        self.assertTrue(np.array_equal(expect, schema))