Example #1
def shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension
        # here, and figure out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]
        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    explainer.plot_and_save(plot_func,
                            oss_dest=oss_dest,
                            oss_ak=oss_ak,
                            oss_sk=oss_sk,
                            oss_endpoint=oss_endpoint,
                            oss_bucket_name=oss_bucket_name,
                            filename='summary')
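
These snippets are extracted from the SQLFlow runtime and omit their imports. Below is a minimal, self-contained sketch of the same TreeExplainer-plus-summary-plot pattern, using toy data and assuming the xgboost and shap packages are installed:

import numpy as np
import pandas as pd
import shap
import xgboost as xgb

# Toy regression data standing in for the queried dataset.
X = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"])
y = X["a"] * 2 + X["b"]
booster = xgb.train({"objective": "reg:squarederror"},
                    xgb.DMatrix(X, label=y),
                    num_boost_round=10)

# Same pattern as the function above: TreeExplainer + summary plot.
tree_explainer = shap.TreeExplainer(booster)
shap_values = tree_explainer.shap_values(X)
shap.summary_plot(shap_values, X, show=False)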
Example #2
    def test_field_type(self):
        self.assertGreater(len(MYSQL_FIELD_TYPE_DICT), 0)

        addr = os.getenv("SQLFLOW_TEST_DB_MYSQL_ADDR", "localhost:3306")
        conn = connect_with_data_source(
            "mysql://*****:*****@tcp(%s)/?maxAllowedPacket=0" % addr)
        cursor = conn.cursor()

        table_name = "iris.test_mysql_field_type_table"
        drop_table_sql = "DROP TABLE IF EXISTS %s" % table_name
        create_table_sql = "CREATE TABLE IF NOT EXISTS " + \
                           table_name + "(a %s)"
        select_sql = "SELECT * FROM %s" % table_name

        for int_type, str_type in MYSQL_FIELD_TYPE_DICT.items():
            if str_type in ["VARCHAR", "CHAR"]:
                str_type += "(255)"

            cursor.execute(drop_table_sql)
            cursor.execute(create_table_sql % str_type)
            cursor.execute(select_sql)

            int_type_actual = cursor.description[0][1]
            cursor.execute(drop_table_sql)

            self.assertEqual(int_type_actual, int_type,
                             "%s not match" % str_type)
Example #3
def load_db_data_to_data_frame(datasource, select):
    """
    Load database data to a pandas.DataFrame.

    Args:
        datasource (str): the database connection URI.
        select (str): the select SQL statement.

    Returns:
        A pandas.DataFrame object which contains all queried data.
    """
    conn = db.connect_with_data_source(datasource)
    generator = verifier.fetch_samples(conn, select, n=-1)
    names = generator.field_names
    dtypes = []
    for dtype in generator.field_types:
        if dtype in ['VARCHAR', 'CHAR', 'TEXT', 'STRING']:
            dtypes.append(str)
        else:
            dtypes.append(np.float64)

    df = pd.DataFrame(columns=names)
    for i, rows in enumerate(generator()):
        df.loc[i] = rows

    for name, dtype in zip(names, dtypes):
        df[name] = df[name].astype(dtype)

    conn.close()
    return df
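
A possible way to call the function above, assuming a reachable MySQL instance; the connection URI and query below are placeholders:

# Hypothetical datasource URI and query; adjust to your environment.
datasource = "mysql://user:password@tcp(localhost:3306)/?maxAllowedPacket=0"
df = load_db_data_to_data_frame(datasource, "SELECT * FROM iris.train")
print(df.dtypes)
print(df.head())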
Example #4
def write_with_generator(datasource, table, gen):
    """Write data into a table, the written data
    comes from the input generator.

    Args:
        datasource: string
            The connection string to connectDBMS.
        table: string
            The table name written.
        gen: Generator
            The generator to generte the data to insert
            into table.
    """
    conn = connect_with_data_source(datasource)
    _drop_table_if_exists(conn, table)
    _create_table(conn, table)
    idx = 0

    with buffered_db_writer(conn, table, ["id", "block"]) as w:
        for d in gen():
            block = base64.b64encode(d)
            row = [idx, block]
            w.write(row)
            idx += 1

    conn.close()
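
A usage sketch for the writer above: the generator yields raw byte chunks, which the function base64-encodes into the block column. The datasource URI and table name are placeholders:

def gen():
    # Yield the payload in fixed-size binary chunks.
    payload = b"some serialized model bytes"
    for i in range(0, len(payload), 8):
        yield payload[i:i + 8]

write_with_generator(
    "mysql://user:password@tcp(localhost:3306)/?maxAllowedPacket=0",
    "sqlflow_models.my_blob_table", gen)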
Example #5
def save_solved_result_in_db(solved_result, data_frame, variables,
                             result_value_name, datasource, result_table):
    column_names = []
    for col in data_frame.columns:
        found = False
        for var in variables:
            if var.lower() == col.lower():
                found = True
                break

        if found:
            column_names.append(col)

    data_frame = data_frame[[*column_names]]

    if len(variables) == 1 and variables[0].lower() == result_value_name.lower(
    ):
        result_value_name += "_value"

    column_names.append(result_value_name)
    data_frame[result_value_name] = solved_result

    conn = db.connect_with_data_source(datasource)
    with db.buffered_db_writer(conn.driver, conn, result_table,
                               column_names) as w:
        for i in six.moves.range(len(data_frame)):
            rows = list(data_frame.loc[i])
            w.write(rows)

    print('Solved result is:')
    print(data_frame)
    print('Saved in {}.'.format(result_table))
Example #6
def submit_local_evaluate(datasource,
                          original_sql,
                          select,
                          label_name,
                          model,
                          model_params,
                          result_table,
                          user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        evaluate_func = xgboost_evaluate
        validation_metrics = model_params.get("validation.metrics",
                                              "accuracy_score")
    else:
        evaluate_func = tf_evaluate
        validation_metrics = model_params.get("validation.metrics", "Accuracy")

    conn = db.connect_with_data_source(datasource)
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]
    result_column_names = create_evaluate_table(conn, result_table,
                                                validation_metrics)
    conn.close()

    evaluate_func(datasource=datasource,
                  select=select,
                  result_table=result_table,
                  model=model,
                  label_name=label_name,
                  model_params=model_params,
                  result_column_names=result_column_names)
Example #7
def xgb_native_explain(booster, datasource, result_table):
    if not result_table:
        raise ValueError("XGBoostExplainer must be used with INTO to "
                         "output the result to a table.")

    gain_map = booster.get_score(importance_type="gain")
    fscore_map = booster.get_fscore()
    conn = db.connect_with_data_source(datasource)

    all_feature_keys = list(gain_map.keys())
    all_feature_keys.sort()
    columns = ["feature", "fscore", "gain"]
    dtypes = [
        DataType.to_db_field_type(conn.driver, DataType.STRING),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
        DataType.to_db_field_type(conn.driver, DataType.FLOAT32),
    ]
    _create_table(conn, result_table, columns, dtypes)

    with db.buffered_db_writer(conn, result_table, columns) as w:
        for fkey in all_feature_keys:
            row = [fkey, fscore_map[fkey], gain_map[fkey]]
            w.write(list(row))

    conn.close()
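
The two importance maps used above come straight from the XGBoost Booster API; a small sketch with a toy booster (hypothetical data and parameters) shows what they contain:

import numpy as np
import xgboost as xgb

# Toy binary classification data; feature names default to f0..f3.
X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)
booster = xgb.train({"objective": "binary:logistic"},
                    xgb.DMatrix(X, label=y),
                    num_boost_round=10)

# Both calls return {feature_name: value} dicts; features that never
# appear in a split are absent from the maps.
print(booster.get_score(importance_type="gain"))
print(booster.get_fscore())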
Example #8
def submit_local_explain(datasource,
                         original_sql,
                         select,
                         model,
                         model_params,
                         result_table,
                         explainer="TreeExplainer",
                         user=""):
    model = Model.load_from_db(datasource, model)
    if model.get_type() == EstimatorType.XGBOOST:
        explain_func = xgboost_explain
    else:
        explain_func = tf_explain

    if result_table:
        feature_columns = model.get_meta("features")
        estimator_string = model.get_meta("class_name")
        field_descs = get_ordered_field_descs(feature_columns)
        feature_column_names = [fd.name for fd in field_descs]
        with db.connect_with_data_source(datasource) as conn:
            create_explain_table(conn, model.get_type(), explainer,
                                 estimator_string, result_table,
                                 feature_column_names)

    explain_func(datasource=datasource,
                 select=select,
                 explainer=explainer,
                 model_params=model_params,
                 result_table=result_table,
                 model=model)
    if not result_table:
        print_image_as_base64_html("summary.png")
Example #9
    def test_hive(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "hive":
            host = "127.0.0.1"
            port = "10000"
            conn = connect(driver,
                           "iris",
                           user="******",
                           password="******",
                           host=host,
                           port=port)
            self._do_test(driver,
                          conn,
                          hdfs_namenode_addr="127.0.0.1:8020",
                          hive_location="/sqlflow")
            conn.close()

            conn = connect_with_data_source(
                "hive://*****:*****@127.0.0.1:10000/iris")
            self._do_test(driver, conn)
            self._do_test_hive_specified_db(
                driver,
                conn,
                hdfs_namenode_addr="127.0.0.1:8020",
                hive_location="/sqlflow")
            conn.close()
Example #10
def read_with_generator(datasource, table):
    """Read data from a table, this function returns
    a generator to yield the data.

    Args:
        datasource: string
            The connection string to connect DBMS.
        table: string
            The table name read.
    Returns: Generator
        the generator yield row data of the table.
    """
    conn = connect_with_data_source(datasource)
    sql = "SELECT id, block FROM {0} ORDER BY id".format(table)
    cursor = conn.cursor()
    cursor.execute(sql)
    fetch_size = 100

    def reader():
        while True:
            rows = cursor.fetchmany(size=fetch_size)
            if not rows:
                break
            for r in rows:
                yield base64.b64decode(r[1])
        conn.close()

    return reader
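
A usage sketch pairing this reader with the write_with_generator example earlier on this page; the datasource URI and table name are placeholders:

datasource = "mysql://user:password@tcp(localhost:3306)/?maxAllowedPacket=0"
reader = read_with_generator(datasource, "sqlflow_models.my_blob_table")
# Concatenate the decoded blocks back into the original byte stream.
payload = b"".join(block for block in reader())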
Example #11
def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
                                      result_table, label_column):
    """Get a command to submit a PAI RandomForest explain task

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: name of the result table, PAI will automatically
            create this table
        label_column: name of the label column

    Returns:
        A string which is a PAI cmd
    """
    # NOTE(typhoonzero): for PAI random forest prediction, we cannot load
    # the TrainStmt since model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; the PAI job
    # will report an error if the columns do not match.
    if not label_column:
        return ("must specify WITH label_column when using "
                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    schema = db.get_table_schema(conn, data_table)
    columns = [f[0] for f in schema]
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    return (
        """pai -name feature_importance -project algo_public """
        """-DmodelName="%s" -DinputTableName="%s"  -DoutputTableName="%s" """
        """-DlabelColName="%s" -DfeatureColNames="%s" """
    ) % (model_name, data_table, result_table, label_column, ",".join(columns))
Example #12
def get_explain_random_forests_cmd(datasource, model_name, data_table,
                                   result_table, label_column):
    """Get PAI random forest explanation command

    Args:
        datasource: current datasource
        model_name: model name on PAI
        data_table: input data table name
        result_table: result table name
        label_column: name of the label column

    Returns:
        a PAI cmd to explain the data using given model
    """
    # NOTE(typhoonzero): for PAI random forest prediction, we cannot load
    # the TrainStmt since model saving is fully done by PAI. We directly
    # use the columns in the SELECT statement for prediction; the PAI job
    # will report an error if the columns do not match.
    if not label_column:
        raise SQLFlowDiagnostic("must specify WITH label_column when using "
                                "pai random forest to explain models")

    conn = db.connect_with_data_source(datasource)
    # drop result table if exists
    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
    schema = db.get_table_schema(conn, data_table)
    fields = [f[0] for f in schema if f[0] != label_column]
    return ('''pai -name feature_importance -project algo_public '''
            '''-DmodelName="%s" -DinputTableName="%s"  '''
            '''-DoutputTableName="%s" -DlabelColName="%s" '''
            '''-DfeatureColNames="%s" ''') % (model_name, data_table,
                                              result_table, label_column,
                                              ",".join(fields))
Example #13
    def test_field_type(self):
        self.assertGreater(len(MYSQL_FIELD_TYPE_DICT), 0)

        conn = connect_with_data_source(testing.get_datasource())

        table_name = "iris.test_mysql_field_type_table"
        drop_table_sql = "DROP TABLE IF EXISTS %s" % table_name
        create_table_sql = "CREATE TABLE IF NOT EXISTS " + \
                           table_name + "(a %s)"
        select_sql = "SELECT * FROM %s" % table_name

        for int_type, str_type in MYSQL_FIELD_TYPE_DICT.items():
            if str_type in ["VARCHAR", "CHAR"]:
                str_type += "(255)"

            conn.execute(drop_table_sql)
            conn.execute(create_table_sql % str_type)
            # we intentionally use the low-level cursor here to check
            # the type value against the real value returned by MySQL
            cursor = conn.cursor()
            cursor.execute(select_sql)
            int_type_actual = cursor.description[0][1]
            cursor.close()
            conn.execute(drop_table_sql)

            self.assertEqual(int_type_actual, int_type,
                             "%s not match" % str_type)
Example #14
def _create_result_table(datasource, select, variables, result_value_name,
                         variable_type, result_table):
    if variable_type.endswith('Integers') or variable_type == "Binary":
        result_type = DataType.INT64
    elif variable_type.endswith('Reals'):
        result_type = DataType.FLOAT32
    else:
        raise ValueError("unsupported variable type %s" % variable_type)

    conn = db.connect_with_data_source(datasource)
    name_and_types = dict(db.selected_columns_and_types(conn, select))
    columns = []
    for var in variables:
        field_type = db.to_db_field_type(conn.driver, name_and_types.get(var))
        columns.append("%s %s" % (var, field_type))

    if len(variables) == 1 and variables[0].lower() == result_value_name.lower(
    ):
        result_value_name += "_value"

    columns.append("%s %s" %
                   (result_value_name,
                    DataType.to_db_field_type(conn.driver, result_type)))
    column_str = ",".join(columns)

    conn.execute("DROP TABLE IF EXISTS %s" % result_table)
    create_sql = "CREATE TABLE %s (%s)" % (result_table, column_str)
    conn.execute(create_sql)
    conn.close()
Example #15
def run_optimize(datasource, select, variables, result_value_name,
                 variable_type, objective, direction, constraints, solver,
                 result_table, submitter, user_id):
    if submitter == "local":
        _create_result_table(datasource, select, variables, result_value_name,
                             variable_type, result_table)
        return run_optimize_locally(datasource=datasource,
                                    select=select,
                                    variables=variables,
                                    variable_type=variable_type,
                                    result_value_name=result_value_name,
                                    objective=objective,
                                    direction=direction,
                                    constraints=constraints,
                                    solver=solver,
                                    result_table=result_table)
    else:
        with create_tmp_tables_guard(select, datasource) as train_table:
            with db.connect_with_data_source(datasource) as conn:
                schema = conn.get_table_schema(train_table)
                columns = [s[0] for s in schema]
                conn.execute("DROP TABLE IF EXISTS %s;" % result_table)

            return run_optimize_on_optflow(train_table=train_table,
                                           columns=columns,
                                           variables=variables,
                                           variable_type=variable_type,
                                           result_value_name=result_value_name,
                                           objective=objective,
                                           direction=direction,
                                           constraints=constraints,
                                           solver=solver,
                                           result_table=result_table,
                                           user_id=user_id)
Example #16
def shap_explain(booster, datasource, dataset, summary_params, result_table):

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension
        # here, and figure out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))

        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
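
A sketch of how this variant might be invoked; summary_params is forwarded to shap.summary_plot, so keys such as plot_type follow that API. The booster, dataset, datasource URI, and result table below are placeholders:

# booster: a trained xgboost.Booster; dataset: a pandas.DataFrame holding
# the feature rows to explain.
shap_explain(booster,
             "mysql://user:password@tcp(localhost:3306)/?maxAllowedPacket=0",
             dataset,
             summary_params={"plot_type": "bar"},
             result_table="iris.shap_explain_result")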
Example #17
def input_fn(select,
             datasource,
             feature_column_names,
             feature_metas,
             label_meta,
             is_pai=False,
             pai_table="",
             num_workers=1,
             worker_id=0):
    feature_types = []
    shapes = []
    for name in feature_column_names:
        # NOTE: vector columns like 23,21,3,2,0,0 should use shape None
        if feature_metas[name]["is_sparse"]:
            if feature_metas[name]["delimiter_kv"]:
                # extract two features from generator data.
                feature_types.append(
                    (get_dtype(feature_metas[name]["dtype"]),
                     get_dtype(feature_metas[name]["dtype_weight"]), tf.int64))
                shapes.append((None, None, None))
            else:
                feature_types.append((tf.int64, tf.int32, tf.int64))
                shapes.append((None, None, None))
        else:
            feature_types.append(get_dtype(feature_metas[name]["dtype"]))
            shapes.append(feature_metas[name]["shape"])
    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        return pai_dataset(pai_table,
                           feature_column_names,
                           label_meta,
                           feature_metas,
                           slice_id=worker_id,
                           slice_count=num_workers)
    else:
        conn = db.connect_with_data_source(datasource)
        gen = db.db_generator(conn, select, label_meta)
        selected_cols = db.selected_cols(conn, select)

    gen = tf_generator(gen, selected_cols, feature_column_names, feature_metas)

    # Clustering models do not have a label
    if not label_meta or label_meta["feature_name"] == "":
        dataset = tf.data.Dataset.from_generator(gen, (tuple(feature_types), ),
                                                 (tuple(shapes), ))
        ds_mapper = functools.partial(
            parse_sparse_feature_predict,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    else:
        dataset = tf.data.Dataset.from_generator(
            gen, (tuple(feature_types), eval("tf.%s" % label_meta["dtype"])),
            (tuple(shapes), label_meta["shape"]))
        ds_mapper = functools.partial(
            parse_sparse_feature,
            feature_column_names=feature_column_names,
            feature_metas=feature_metas)
    return dataset.map(ds_mapper)
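
For the dense (non-sparse) branch above, a minimal standalone tf.data.Dataset.from_generator sketch with an explicit output signature, assuming TensorFlow 2.x:

import numpy as np
import tensorflow as tf


def gen():
    # Yield (features, label) pairs matching the signature declared below.
    for _ in range(4):
        yield np.random.rand(3).astype("float32"), np.int64(1)


dataset = tf.data.Dataset.from_generator(
    gen,
    output_signature=(tf.TensorSpec(shape=(3, ), dtype=tf.float32),
                      tf.TensorSpec(shape=(), dtype=tf.int64)))
for features, label in dataset.batch(2):
    print(features.shape, label.shape)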
Example #18
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             pai_table=""):
    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    is_pai = bool(pai_table)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=is_pai,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)
    if is_estimator:
        with open("exported_path", "r") as fid:
            exported_path = str(fid.read())

        model_params["warm_start_from"] = exported_path
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             conn)
        conn.close()
Example #19
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         train_label_meta,
         pred_label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = None
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("Start predicting XGBoost model...")

    if is_pai:
        pai_table = "odps://{}/tables/{}".format(*pai_table.split("."))
        selected_cols = db.pai_selected_cols(pai_table)
    else:
        selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, train_label_name,
                                 pred_label_name, feature_column_names,
                                 feature_metas, is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)
Example #20
def shap_explain(datasource,
                 select,
                 feature_field_meta,
                 feature_column_names,
                 label_meta,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 pai_explain_table="",
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None,
                 transform_fn=None,
                 feature_column_code=""):
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)
    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; use the first dimension
        # here, and figure out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        write_shap_values(to_write, conn, result_table, feature_column_names)

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), oss_dest, oss_ak, oss_sk,
            oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), oss_dest,
            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #21
def pred_imp(datasource,
             select,
             feature_metas,
             feature_column_names,
             train_label_meta,
             pred_label_meta,
             result_table,
             is_pai=False,
             pai_table="",
             model_params=None,
             train_params=None,
             transform_fn=None,
             feature_column_code="",
             rank=0,
             nworkers=1):
    print("rank={} nworkers={}".format(rank, nworkers))
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        rank=rank,
        nworkers=nworkers,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load the trained model
    print("{} Start predicting XGBoost model...".format(datetime.now()))
    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, train_label_name,
                                 pred_label_name, feature_column_names,
                                 feature_metas, is_pai, conn, result_table,
                                 rank)
        feature_file_id += 1
    print("{} Done predicting. Predict table: {}".format(
        datetime.now(), result_table))
Example #22
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             hdfs_namenode_addr="",
             hive_location="",
             hdfs_user="",
             hdfs_pass=""):
    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=False,
                                  pai_table="",
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    if is_estimator:
        model_params["model_dir"] = save
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    # write result metrics to a table
    conn = connect_with_data_source(datasource)
    driver = conn.driver
    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics,
                             metric_name_list,
                             result_table,
                             driver,
                             conn,
                             hdfs_namenode_addr=hdfs_namenode_addr,
                             hive_location=hive_location,
                             hdfs_user=hdfs_user,
                             hdfs_pass=hdfs_pass)
Example #23
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            explainer="TreeExplainer",
            result_table="",
            is_pai=False,
            pai_explain_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    if explainer == "XGBoostExplainer":
        if result_table == "":
            raise ValueError("""XGBoostExplainer must use with INTO to output
result to a table.""")
        bst = xgb.Booster()
        bst.load_model("my_model")
        gain_map = bst.get_score(importance_type="gain")
        fscore_map = bst.get_fscore()
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)

        all_feature_keys = list(gain_map.keys())
        all_feature_keys.sort()
        with db.buffered_db_writer(conn, result_table,
                                   ["feature", "fscore", "gain"], 100) as w:
            for fkey in all_feature_keys:
                row = [fkey, fscore_map[fkey], gain_map[fkey]]
                w.write(list(row))
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(datasource,
                     select,
                     feature_field_meta,
                     feature_column_names,
                     label_meta,
                     summary_params,
                     result_table=result_table,
                     is_pai=is_pai,
                     pai_explain_table=pai_explain_table,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name,
                     transform_fn=transform_fn,
                     feature_column_code=feature_column_code)
Example #24
def evaluate_step(datasource,
                  select,
                  result_table,
                  model,
                  label_name,
                  model_params,
                  pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    if model_params is None:
        model_params = {}

    validation_metrics = model_params.get("validation.metrics", "Accuracy")
    validation_metrics = [m.strip() for m in validation_metrics.split(',')]
    validation_steps = model_params.get("validation.steps", None)
    batch_size = model_params.get("validation.batch_size", 1)
    verbose = model_params.get("validation.verbose", 0)

    conn = db.connect_with_data_source(datasource)
    create_evaluate_table(conn, result_table, validation_metrics)
    conn.close()

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    train_label_desc = model.get_meta("label").get_field_desc()[0]
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())
    train_label_desc.name = label_name
    label_meta = train_label_desc.to_dict(dtype_to_string=True)

    _evaluate(datasource=datasource,
              estimator_string=estimator_string,
              select=select,
              result_table=result_table,
              feature_columns=feature_columns,
              feature_column_names=feature_column_names,
              feature_metas=feature_metas,
              label_meta=label_meta,
              model_params=model_params,
              validation_metrics=validation_metrics,
              save=save,
              batch_size=batch_size,
              validation_steps=validation_steps,
              verbose=verbose,
              pai_table=pai_table)
Example #25
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         pred_params={},
         save="",
         batch_size=1,
         pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional models need the field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
Example #26
def drop_tables(tables, datasource):
    """Drop given tables in datasource"""
    conn = db.connect_with_data_source(datasource)
    try:
        for table in tables:
            if table != "":
                drop_sql = "DROP TABLE IF EXISTS %s" % table
                conn.execute(drop_sql)
    except:  # noqa: E722
        # ODPS will clear the table itself, so even if dropping fails
        # here we do not need to raise an error
        print("Encountered an error while dropping the temp table")
Example #27
def create_explain_result_table(datasource, data_table, result_table,
                                model_type, estimator, label_column):
    """Create explain result table from given datasource

    Args:
        datasource: current datasource
        data_table: input data table name
        result_table: table name to store the result
        model_type: type of the model to use
        estimator: estimator class if the model is TensorFlow estimator
        label_column: column name of the predict label
    """
    conn = db.connect_with_data_source(datasource)
    drop_stmt = "DROP TABLE IF EXISTS %s" % result_table
    conn.execute(drop_stmt)

    create_stmt = ""
    if model_type == EstimatorType.PAIML:
        return
    elif model_type == EstimatorType.TENSORFLOW:
        if estimator.startswith("BoostedTrees"):
            column_def = ""
            if conn.driver == "mysql":
                column_def = "(feature VARCHAR(255), dfc FLOAT, gain FLOAT)"
            else:
                # Hive & MaxCompute
                column_def = "(feature STRING, dfc STRING, gain STRING)"
            create_stmt = "CREATE TABLE IF NOT EXISTS %s %s;" % (result_table,
                                                                 column_def)
        else:
            if not label_column:
                raise SQLFlowDiagnostic(
                    "need to specify WITH label_col=lable_col_name "
                    "when explaining deep models")
            create_stmt = get_create_shap_result_sql(conn, data_table,
                                                     result_table,
                                                     label_column)
    elif model_type == EstimatorType.XGBOOST:
        if not label_column:
            raise SQLFlowDiagnostic(
                "need to specify WITH label_col=lable_col_name "
                "when explaining xgboost models")
        create_stmt = get_create_shap_result_sql(conn, data_table,
                                                 result_table, label_column)
    else:
        raise SQLFlowDiagnostic(
            "not supported modelType %d for creating Explain result table" %
            model_type)

    if not conn.execute(create_stmt):
        raise SQLFlowDiagnostic("Can't create explain result table")
Example #28
    def test_mysql(self):
        driver = os.environ.get('SQLFLOW_TEST_DB')
        if driver == "mysql":
            user, password, host, port, database = testing_mysql_cfg()
            conn = connect(driver,
                           database,
                           user=user,
                           password=password,
                           host=host,
                           port=port)
            self._do_test(driver, conn)

            conn = connect_with_data_source(testing_mysql_db_url())
            self._do_test(driver, conn)
Example #29
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    model_params['model_dir'] = save
    model_params.update(feature_columns)

    def _input_fn():
        dataset = input_fn(select, datasource, feature_column_names,
                           feature_metas, label_meta)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    conn = connect_with_data_source(datasource)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn.driver,
                              conn, hdfs_namenode_addr, hive_location,
                              hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn.driver, conn,
                     hdfs_namenode_addr, hive_location, hdfs_user, hdfs_pass,
                     oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #30
def read_with_generator_and_metadata(datasource,
                                     table,
                                     buff_size=256,
                                     meta_only=False):
    """Read data from a table, this function returns
    a generator to yield the data, and the metadata dict.

    Args:
        datasource: string
            The connection string to connect DBMS.
        table: string
            The table name read.
        buff_size: int
            The buffer size to read data.
        meta_only: bool
            Only read the metadata.

    Returns: tuple(Generator, dict)
        the generator yield row data of the table,
        and the model metadata dict.
    """
    conn = connect_with_data_source(datasource)
    r = SQLFSReader(conn, table, not meta_only)
    metadata = _read_metadata(r)

    if meta_only:
        r.close()
        return None, metadata

    def reader():
        try:
            while True:
                buffer = r.read(buff_size)
                if not buffer:
                    break

                yield buffer
        finally:
            reader.close()

    def close():
        if not reader.is_closed:
            r.close()
            conn.close()
            reader.is_closed = True

    reader.is_closed = False
    reader.close = close

    return reader, metadata
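
A usage sketch for reading a stored blob and its metadata back; the datasource URI and table name are placeholders, and passing meta_only=True would skip the data blocks entirely:

datasource = "mysql://user:password@tcp(localhost:3306)/?maxAllowedPacket=0"
reader, metadata = read_with_generator_and_metadata(datasource,
                                                    "sqlflow_models.my_model")
print(metadata)
blob = b"".join(chunk for chunk in reader())
reader.close()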