Example #1
0
def get_explain_random_forests_cmd(datasource, model_name, data_table,
                                   result_table, label_column):
    """Build the PAI command that explains a random forest model.

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: result table name
        label_column: name of the label column

    Returns:
        a PAI cmd to explain the data using given model

    Raises:
        SQLFlowDiagnostic: if label_column is not given.
    """
    # NOTE(typhoonzero): for PAI random forests predicting, we can not load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in SELECT statement for prediction, error will be
    # reported by PAI job if the columns not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    connection = db.connect_with_data_source(datasource)
    # Make sure the output table does not exist before the PAI job runs.
    db.execute(connection, "DROP TABLE IF EXISTS %s;" % result_table)
    table_schema = db.get_table_schema(connection, data_table)
    # Every column except the label is treated as a feature column.
    feature_names = [
        column[0] for column in table_schema if column[0] != label_column
    ]
    cmd_template = ('''pai -name feature_importance -project algo_public '''
                    '''-DmodelName="%s" -DinputTableName="%s"  '''
                    '''-DoutputTableName="%s" -DlabelColName="%s" '''
                    '''-DfeatureColNames="%s" ''')
    return cmd_template % (model_name, data_table, result_table, label_column,
                           ",".join(feature_names))
Example #2
0
def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
                                      result_table, label_column):
    """Get a command to submit a PAI RandomForest explain task

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: name of the result table, PAI will automatically
            create this table
        label_column: name of the label column

    Returns:
        A string which is a PAI cmd

    Raises:
        SQLFlowDiagnostic: if label_column is not specified.
    """
    # NOTE(typhoonzero): for PAI random forests predicting, we can not load
    # the TrainStmt since the model saving is fully done by PAI. We directly
    # use the columns in SELECT statement for prediction, error will be
    # reported by PAI job if the columns not match.
    if not label_column:
        # BUGFIX: this used to `return` the message, which handed the error
        # text back to the caller as if it were a valid PAI command. Raise
        # instead, matching get_explain_random_forests_cmd.
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    schema = db.get_table_schema(conn, data_table)
    columns = [f[0] for f in schema]
    # Drop any stale result table; PAI will recreate it.
    db.execute(conn, "DROP TABLE IF EXISTS %s;" % result_table)
    return (
        """pai -name feature_importance -project algo_public """
        """-DmodelName="%s" -DinputTableName="%s"  -DoutputTableName="%s" """
        """-DlabelColName="%s" -DfeatureColNames="%s" """
    ) % (model_name, data_table, result_table, label_column, ",".join(columns))
Example #3
0
def drop_tables(tables, datasource):
    """Drop given tables in datasource, best-effort.

    Args:
        tables: list of table names to drop; empty names are skipped
        datasource: the datasource to connect to
    """
    conn = db.connect_with_data_source(datasource)
    try:
        for table in tables:
            if table != "":
                drop_sql = "DROP TABLE IF EXISTS %s" % table
                db.execute(conn, drop_sql)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # not swallowed. This stays best-effort: odps will clear the tables
        # itself, so even failing here we do not need to raise an error.
        print("Encounter error on drop tmp table")
Example #4
0
def create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column):
    """Create explain result table from given datasource

    Args:
        datasource: current datasource
        data_table: input data table name
        result_table: table name to store the result
        model_type: type of the model to use
        estimator: estimator class if the model is TensorFlow estimator
        label_column: column name of the predict label

    Raises:
        SQLFlowDiagnostic: if label_column is required but missing, if
            model_type is unsupported, or if the table creation fails.
    """
    conn = db.connect_with_data_source(datasource)
    # Always start from a clean slate.
    drop_stmt = "DROP TABLE IF EXISTS %s" % result_table
    db.execute(conn, drop_stmt)

    create_stmt = ""
    if model_type == EstimatorType.PAIML:
        # PAI ML jobs create their own result table.
        return
    elif model_type == EstimatorType.TENSORFLOW:
        if estimator.startswith("BoostedTrees"):
            # BoostedTrees explanations report per-feature dfc/gain values.
            column_def = ""
            if conn.driver == "mysql":
                column_def = "(feature VARCHAR(255), dfc FLOAT, gain FLOAT)"
            else:
                # Hive & MaxCompute
                column_def = "(feature STRING, dfc STRING, gain STRING)"
            create_stmt = "CREATE TABLE IF NOT EXISTS %s %s;" % (result_table,
                                                                 column_def)
        else:
            if not label_column:
                # BUGFIX: corrected "lable_col_name" typo in the message.
                raise SQLFlowDiagnostic(
                    "need to specify WITH label_col=label_col_name "
                    "when explaining deep models")
            create_stmt = get_create_shap_result_sql(conn, data_table,
                                                     result_table,
                                                     label_column)
    elif model_type == EstimatorType.XGBOOST:
        if not label_column:
            # BUGFIX: corrected "lable_col_name" typo in the message.
            raise SQLFlowDiagnostic(
                "need to specify WITH label_col=label_col_name "
                "when explaining xgboost models")
        create_stmt = get_create_shap_result_sql(conn, data_table,
                                                 result_table, label_column)
    else:
        raise SQLFlowDiagnostic(
            "not supported modelType %d for creating Explain result table" %
            model_type)

    if not db.execute(conn, create_stmt):
        raise SQLFlowDiagnostic("Can't create explain result table")
Example #5
0
def create_evaluate_result_table(datasource, result_table, metrics):
    """Create a table to hold the evaluation result

    Args:
        datasource: current datasource
        result_table: the table name to save result
        metrics: list of evaluation metrics names
    """
    drop_tables([result_table], datasource)
    # "loss" is always reported in addition to the requested metrics.
    all_metrics = ["loss"]
    if isinstance(metrics, list):
        all_metrics = all_metrics + metrics
    column_defs = ",".join("%s STRING" % name for name in all_metrics)
    create_sql = "CREATE TABLE IF NOT EXISTS %s (%s);" % (result_table,
                                                          column_defs)
    conn = db.connect_with_data_source(datasource)
    db.execute(conn, create_sql)
Example #6
0
def get_train_kmeans_pai_cmd(datasource, model_name, data_table, model_attrs,
                             feature_column_names):
    """Get a command to submit a KMeans training task to PAI

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        model_attrs: model attributes for KMeans
        feature_column_names: names of feature columns

    Returns:
        A string which is a PAI cmd

    Raises:
        SQLFlowDiagnostic: if idx_table_name is not set in the WITH clause.
    """
    # Fill in defaults for attributes the user did not specify. A plain
    # loop replaces the original list comprehension that was used only for
    # its side effects.
    for key, value in default_attrs.items():
        model_attrs.setdefault(key, value)
    center_count = model_attrs["center_count"]
    idx_table_name = model_attrs["idx_table_name"]
    if not idx_table_name:
        raise SQLFlowDiagnostic("Need to set idx_table_name in WITH clause")
    exclude_columns = model_attrs["excluded_columns"].split(",")

    # selectedCols indicates feature columns used to clustering
    selected_cols = [
        fc for fc in feature_column_names if fc not in exclude_columns
    ]

    conn = db.connect_with_data_source(datasource)
    db.execute(conn, "DROP TABLE IF EXISTS %s" % idx_table_name)

    # BUGFIX: "-DmodelName %s" was missing the "=": PAI expects every
    # parameter in -Dname=value form, as used elsewhere in this file.
    return (
        """pai -name kmeans -project algo_public """
        """-DinputTableName=%s -DcenterCount=%d -DmodelName=%s """
        """-DidxTableName=%s -DselectedColNames="%s" -DappendColNames="%s" """
    ) % (data_table, center_count, model_name, idx_table_name,
         ",".join(selected_cols), ",".join(feature_column_names))
Example #7
0
def create_tmp_table_from_select(select, datasource):
    """Create temp table for given select query

    Args:
        select: string, the selection statement
        datasource: string, the datasource to connect

    Returns:
        The full "project.table" name of the created temp table, or
        None when select is empty.

    Raises:
        SQLFlowDiagnostic: if the table creation fails.
    """
    if not select:
        return None
    conn = db.connect_with_data_source(datasource)
    project = get_project(datasource)
    tmp_tb_name = gen_rand_string()
    # LIFECYCLE lets MaxCompute reclaim the temp table automatically.
    create_sql = "CREATE TABLE %s LIFECYCLE %s AS %s" % (
        tmp_tb_name, LIFECYCLE_ON_TMP_TABLE, select)
    # (NOTE: lhw) maxcompute conn doesn't support close
    # we should unify db interface
    if not db.execute(conn, create_sql):
        # BUGFIX: corrected "crate" typo in the diagnostic message.
        raise SQLFlowDiagnostic("Can't create tmp table for %s" % select)
    return "%s.%s" % (project, tmp_tb_name)
Example #8
0
def create_predict_result_table(datasource, select, result_table, label_column,
                                train_label_column, model_type):
    """Create predict result table with given name and label column

    Args:
        datasource: current datasource
        select: sql statement to get prediction data set
        result_table: the table name to save result
        label_column: name of the label column, if not exist in select
            result, we will add a int column in the result table
        train_label_column: name of the label column when training
        model_type: type of model defined in runtime.oss
    """
    conn = db.connect_with_data_source(datasource)
    db.execute(conn, "DROP TABLE IF EXISTS %s" % result_table)
    # PAI ml will create result table itself
    if model_type == EstimatorType.PAIML:
        return

    # Clone the prediction data set's schema without copying any rows.
    db.execute(
        conn, "CREATE TABLE %s AS SELECT * FROM %s LIMIT 0" %
        (result_table, select))

    # Pick the type for the prediction label column: reuse the type of an
    # existing label column when present, otherwise default to INT.
    schema = db.get_table_schema(conn, result_table)
    label_type = "INT"
    for column_name, column_type in schema:
        if column_name in (train_label_column, label_column):
            label_type = column_type
            break
    existing_columns = [column[0] for column in schema]
    # if label is not in data table, add a int column for it
    if label_column not in existing_columns:
        db.execute(
            conn, "ALTER TABLE %s ADD %s %s" %
            (result_table, label_column, label_type))
    # Drop the training label column so it does not shadow the prediction.
    if train_label_column != label_column \
            and train_label_column in existing_columns:
        db.execute(
            conn, "ALTER TABLE %s DROP COLUMN %s" %
            (result_table, train_label_column))