Example #1
def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
                                      result_table, label_column):
    """Get a command to submit a PAI RandomForest explain task

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: name of the result table, PAI will automatically
            create this table
        label_column: name of the label column

    Returns:
        A string which is a PAI cmd
    """
    # NOTE(typhoonzero): for PAI random forest prediction, we cannot load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; the PAI job
    # will report an error if the columns do not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    schema = db.get_table_schema(conn, data_table)
    # exclude the label column from the feature columns
    columns = [f[0] for f in schema if f[0] != label_column]
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    return (
        """pai -name feature_importance -project algo_public """
        """-DmodelName="%s" -DinputTableName="%s"  -DoutputTableName="%s" """
        """-DlabelColName="%s" -DfeatureColNames="%s" """
    ) % (model_name, data_table, result_table, label_column, ",".join(columns))
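For reference, a minimal sketch of the command string this function builds, using hand-written column and table names (every identifier below is illustrative, not from the source):

# all names here are made up for illustration
columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
cmd = ('pai -name feature_importance -project algo_public '
       '-DmodelName="my_rf_model" -DinputTableName="iris.test" '
       '-DoutputTableName="iris.rf_explain" -DlabelColName="class" '
       '-DfeatureColNames="%s" ' % ",".join(columns))
print(cmd)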
Example #2
def get_explain_random_forests_cmd(datasource, model_name, data_table,
                                   result_table, label_column):
    """Get PAI random forest explanation command

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: result table name
        label_column: name of the label column

    Returns:
        a PAI cmd to explain the data using given model
    """
    # NOTE(typhoonzero): for PAI random forest prediction, we cannot load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; the PAI job
    # will report an error if the columns do not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    # drop result table if exists
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    schema = db.get_table_schema(conn, data_table)
    fields = [f[0] for f in schema if f[0] != label_column]
    return ('''pai -name feature_importance -project algo_public '''
            '''-DmodelName="%s" -DinputTableName="%s"  '''
            '''-DoutputTableName="%s" -DlabelColName="%s" '''
            '''-DfeatureColNames="%s" ''') % (model_name, data_table,
                                              result_table, label_column,
                                              ",".join(fields))
Example #3
def get_pai_predict_cmd(datasource, project, oss_model_path, model_name,
                        predict_table, result_table, model_type, model_params,
                        job_file, params_file, cwd):
    """Get predict command for PAI task

    Args:
        datasource: current datasource
        project: current project
        oss_model_path: the place to load model
        model_name: model used to do prediction
        predict_table: where to store the temporary prediction data set
        result_table: table to save the prediction result
        model_type: type of the model, see also get_oss_saved_model_type
        model_params: parameters specified by the WITH clause
        job_file: tar file including the code and libs to execute on PAI
        params_file: extra params file
        cwd: current working dir

    Returns:
        The command to submit PAI prediction task
    """
    # NOTE(typhoonzero): for PAI machine learning toolkit prediction, we
    # cannot load the TrainStmt since the model saving is fully done by PAI.
    # We directly use the columns in the SELECT statement for prediction;
    # the PAI job will report an error if the columns do not match.
    conf = cluster_conf.get_cluster_config(model_params)
    conn = db.connect_with_data_source(datasource)
    if model_type == EstimatorType.PAIML:
        schema = db.get_table_schema(conn, predict_table)
        result_fields = [col[0] for col in schema]
        return ('''pai -name prediction -DmodelName="%s"  '''
                '''-DinputTableName="%s"  -DoutputTableName="%s"  '''
                '''-DfeatureColNames="%s"  -DappendColNames="%s"''') % (
                    model_name, predict_table, result_table,
                    ",".join(result_fields), ",".join(result_fields))
    else:
        # For TensorFlow and XGBoost, we build a pai-tf cmd to submit the task
        return get_pai_tf_cmd(conf, job_file, params_file, ENTRY_FILE,
                              model_name, oss_model_path, predict_table, "",
                              result_table, project)
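The PAIML branch appends every input column to the prediction output. A standalone sketch of the command it assembles, with hand-written names (no database needed):

# illustrative column, model, and table names
result_fields = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
cmd = ('pai -name prediction -DmodelName="my_model"  '
       '-DinputTableName="iris.test"  -DoutputTableName="iris.predict"  '
       '-DfeatureColNames="%s"  -DappendColNames="%s"' %
       (",".join(result_fields), ",".join(result_fields)))
print(cmd)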
Example #4
    def test_verify_column_name_and_type(self):
        conn = testing.get_singleton_db_connection()

        train_table = "iris.train"
        test_table = "iris.test"

        train_select = [
            "petal_length", "petal_width", "sepal_length", "sepal_width",
            "class"
        ]
        test_select = train_select
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")

        test_select = [
            "petal_length", "petal_width", "sepal_length", "sepal_width"
        ]
        verify_column_name_and_type(
            conn, self.generate_select(train_table, train_select),
            self.generate_select(test_table, test_select), "class")

        test_select = ["petal_length", "petal_width", "sepal_length"]
        with self.assertRaises(ValueError):
            verify_column_name_and_type(
                conn, self.generate_select(train_table, train_select),
                self.generate_select(test_table, test_select), "class")

        cursor = conn.cursor()
        name_and_type = dict(db.get_table_schema(conn, test_table))
        new_table_name = "iris.verifier_test_table"

        name_and_type["petal_length"] = "VARCHAR(255)"  # change the data type
        create_column_str = ",".join(
            ["%s %s" % (n, t) for n, t in name_and_type.items()])

        drop_sql = "DROP TABLE IF EXISTS %s" % new_table_name
        create_sql = "CREATE TABLE %s(%s)" % (new_table_name,
                                              create_column_str)
        cursor.execute(drop_sql)
        cursor.execute(create_sql)
        with self.assertRaises(ValueError):
            test_select = train_select
            verify_column_name_and_type(
                conn, self.generate_select(train_table, train_select),
                self.generate_select(new_table_name, test_select), "class")
        cursor.execute(drop_sql)
        cursor.close()
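This test relies on a generate_select helper that is not part of the snippet; a plausible minimal implementation would simply join the requested columns into a SELECT statement:

def generate_select(self, table, columns):
    # hypothetical helper: builds e.g. "SELECT petal_length, class FROM iris.train"
    return "SELECT %s FROM %s" % (", ".join(columns), table)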
Example #5
def get_create_shap_result_sql(conn, data_table, result_table, label_column):
    """Get a sql statement which create a result table for SHAP

    Args:
        conn: a database connection
        data_table: table name to read data from
        result_table: result table name
        label_column: column name of label

    Returns:
        a SQL statement to create the SHAP result table
    """
    schema = db.get_table_schema(conn, data_table)
    fields = ["%s STRING" % f[0] for f in schema if f[0] != label_column]
    return "CREATE TABLE IF NOT EXISTS %s (%s)" % (result_table,
                                                   ",".join(fields))
Example #6
def create_predict_result_table(datasource, select, result_table, label_column,
                                train_label_column, model_type):
    """Create predict result table with given name and label column

    Args:
        datasource: current datasource
        select: sql statement to get prediction data set
        result_table: the table name to save result
        label_column: name of the label column; if it does not exist in the
            select result, we will add an INT column to the result table
        train_label_column: name of the label column when training
        model_type: type of model defined in runtime.model.oss
    """
    conn = db.connect_with_data_source(datasource)
    conn.execute("DROP TABLE IF EXISTS %s" % result_table)
    # PAI ML will create the result table itself
    if model_type == EstimatorType.PAIML:
        return

    # wrap the SELECT statement in a subquery so it can serve as the data
    # source; LIMIT 0 copies the column layout without copying any rows
    create_table_sql = "CREATE TABLE %s AS SELECT * FROM (%s) AS t LIMIT 0" % (
        result_table, select)
    conn.execute(create_table_sql)

    # if the label is not in the data table, add an INT column for it
    schema = db.get_table_schema(conn, result_table)
    col_type = "INT"
    for (name, ctype) in schema:
        if name == train_label_column or name == label_column:
            col_type = ctype
            break
    col_names = [col[0] for col in schema]
    if label_column not in col_names:
        conn.execute("ALTER TABLE %s ADD %s %s" %
                     (result_table, label_column, col_type))
    if train_label_column != label_column and train_label_column in col_names:
        conn.execute("ALTER TABLE %s DROP COLUMN %s" %
                     (result_table, train_label_column))
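The CREATE TABLE ... AS SELECT ... LIMIT 0 pattern copies the column names and types of the prediction query without materializing any rows; for example (names illustrative):

select = "SELECT * FROM iris.test"   # made-up prediction query
result_table = "iris.predict"        # made-up result table
print("CREATE TABLE %s AS SELECT * FROM (%s) AS t LIMIT 0" %
      (result_table, select))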
Example #7
    def test_get_table_schema(self):
        driver = testing.get_driver()
        conn = testing.get_singleton_db_connection()
        if driver == "mysql":
            schema = get_table_schema(conn, "iris.train")
            expect = (
                ('sepal_length', 'FLOAT'),
                ('sepal_width', 'FLOAT'),
                ('petal_length', 'FLOAT'),
                ('petal_width', 'FLOAT'),
                ('class', 'INT(11)'),
            )
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
                "class FROM iris.train")
            expect = [
                ("sepal_length", "FLOAT"),
                ("new_petal_width", "DOUBLE"),
                ("class", "INT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
        elif driver == "hive":
            schema = get_table_schema(conn, "iris.train")
            expect = (
                ('sepal_length', 'FLOAT'),
                ('sepal_width', 'FLOAT'),
                ('petal_length', 'FLOAT'),
                ('petal_width', 'FLOAT'),
                ('class', 'INT'),
            )
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 AS new_petal_width, "
                "class FROM iris.train")
            expect = [
                ("sepal_length", "FLOAT"),
                ("new_petal_width", "FLOAT"),
                ("class", "INT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
        elif driver == "maxcompute":
            case_db = os.getenv("SQLFLOW_TEST_DB_MAXCOMPUTE_PROJECT")
            table = "%s.sqlflow_test_iris_train" % case_db
            schema = get_table_schema(conn, table)
            expect = [
                ('sepal_length', 'DOUBLE'),
                ('sepal_width', 'DOUBLE'),
                ('petal_length', 'DOUBLE'),
                ('petal_width', 'DOUBLE'),
                ('class', 'BIGINT'),
            ]
            self.assertTrue(np.array_equal(expect, schema))

            schema = selected_columns_and_types(
                conn,
                "SELECT sepal_length, petal_width * 2.3 new_petal_width, "
                "class FROM %s" % table)
            expect = [
                ("sepal_length", "DOUBLE"),
                ("new_petal_width", "DOUBLE"),
                ("class", "BIGINT"),
            ]
            self.assertTrue(np.array_equal(expect, schema))
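np.array_equal compares the (name, type) pairs element-wise, which is why the expected schema can be written interchangeably as a tuple of tuples or a list of tuples:

import numpy as np

# both spellings compare equal element-wise
assert np.array_equal((("a", "INT"),), [("a", "INT")])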