Example #1
def shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; only the first element is
        # written here, find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]
        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    explainer.plot_and_save(plot_func,
                            oss_dest=oss_dest,
                            oss_ak=oss_ak,
                            oss_sk=oss_sk,
                            oss_endpoint=oss_endpoint,
                            oss_bucket_name=oss_bucket_name,
                            filename='summary')
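
For the list-vs-array branch above, a minimal self-contained sketch (hypothetical NumPy arrays, not SQLFlow code) of what gets written when a multi-class tree explainer returns one SHAP array per class:

import numpy as np

# Hypothetical multi-class output: one (num_samples, num_features) array per class.
shap_values = [np.zeros((5, 3)), np.ones((5, 3)), np.full((5, 3), 2.0)]
to_write = shap_values[0] if isinstance(shap_values, list) else shap_values
print(to_write.shape)  # (5, 3): only the first class's values are written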
Example #2
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    if isinstance(feature_column_code, dict):
        # NOTE(typhoonzero): feature_column_code is a dict of
        # runtime.feature.column in refactored step code.
        feature_column_transformers = compile_ir_feature_columns(
            feature_column_code, EstimatorType.XGBOOST)
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names,
                *feature_column_transformers["feature_columns"])
    else:
        feature_column_transformers = eval('[{}]'.format(feature_column_code))
        transform_fn = \
            xgboost_extended.feature_column.ComposedColumnTransformer(
                feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
Example #3
def evaluate(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             validation_metrics=["Accuracy"],
             save="",
             batch_size=1,
             validation_steps=None,
             verbose=0,
             pai_table=""):
    FLAGS = define_tf_flags()
    set_oss_environs(FLAGS)

    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)

    is_pai = True if pai_table else False
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=is_pai,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)
    if is_estimator:
        with open("exported_path", "r") as fid:
            exported_path = str(fid.read())

        model_params["warm_start_from"] = exported_path
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             conn)
        conn.close()
Example #4
def shap_explain(datasource,
                 select,
                 feature_field_meta,
                 feature_column_names,
                 label_meta,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 pai_explain_table="",
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None,
                 transform_fn=None,
                 feature_column_code=""):
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)
    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a list of shape
        # [3, num_samples, num_features]; only the first element is
        # written here, find out when to use the other two. When
        # shap_values is not a list it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        write_shap_values(to_write, conn, result_table, feature_column_names)

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), oss_dest, oss_ak, oss_sk,
            oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), oss_dest,
            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #5
def pred_imp(datasource,
             select,
             feature_metas,
             feature_column_names,
             train_label_meta,
             pred_label_meta,
             result_table,
             is_pai=False,
             pai_table="",
             model_params=None,
             train_params=None,
             transform_fn=None,
             feature_column_code="",
             rank=0,
             nworkers=1):
    print("rank={} nworkers={}".format(rank, nworkers))
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        rank=rank,
        nworkers=nworkers,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load data
    print("{} Start predicting XGBoost model...".format(datetime.now()))
    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, train_label_name,
                                 pred_label_name, feature_column_names,
                                 feature_metas, is_pai, conn, result_table,
                                 rank)
        feature_file_id += 1
    print("{} Done predicting. Predict table: {}".format(
        datetime.now(), result_table))
Example #6
def pred(datasource,
         estimator_string,
         select,
         result_table,
         feature_columns,
         feature_column_names,
         feature_column_names_map,
         train_label_name,
         result_col_name,
         feature_metas={},
         model_params={},
         pred_params={},
         save="",
         batch_size=1,
         pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    if pai_table != "":
        conn = PaiIOConnection.from_table(pai_table)
        selected_cols = db.selected_cols(conn, None)
        predict_generator = db.db_generator(conn, None)
    else:
        conn = db.connect_with_data_source(datasource)
        selected_cols = db.selected_cols(conn, select)
        predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)

    if pred_params is None:
        extra_result_cols = []
    else:
        extra_result_cols = pred_params.get("extra_outputs", "")
        extra_result_cols = [
            c.strip() for c in extra_result_cols.split(",") if c.strip()
        ]

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols,
                      extra_result_cols)
    else:
        # TODO(sneaxiy): support extra_result_cols for estimator
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
Example #7
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            explainer="TreeExplainer",
            result_table="",
            is_pai=False,
            pai_explain_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    if explainer == "XGBoostExplainer":
        if result_table == "":
            raise ValueError("""XGBoostExplainer must use with INTO to output
result to a table.""")
        bst = xgb.Booster()
        bst.load_model("my_model")
        gain_map = bst.get_score(importance_type="gain")
        fscore_map = bst.get_fscore()
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)

        all_feature_keys = list(gain_map.keys())
        all_feature_keys.sort()
        with db.buffered_db_writer(conn, result_table,
                                   ["feature", "fscore", "gain"], 100) as w:
            for fkey in all_feature_keys:
                row = [fkey, fscore_map[fkey], gain_map[fkey]]
                w.write(list(row))
    else:
        # when explainer is "" or "TreeExplainer" use SHAP by default.
        shap_explain(datasource,
                     select,
                     feature_field_meta,
                     feature_column_names,
                     label_meta,
                     summary_params,
                     result_table=result_table,
                     is_pai=is_pai,
                     pai_explain_table=pai_explain_table,
                     oss_dest=oss_dest,
                     oss_ak=oss_ak,
                     oss_sk=oss_sk,
                     oss_endpoint=oss_endpoint,
                     oss_bucket_name=oss_bucket_name,
                     transform_fn=transform_fn,
                     feature_column_code=feature_column_code)
Example #8
def pred(datasource,
         select,
         feature_metas,
         feature_column_names,
         train_label_meta,
         pred_label_meta,
         result_table,
         is_pai=False,
         hdfs_namenode_addr="",
         hive_location="",
         hdfs_user="",
         hdfs_pass="",
         pai_table="",
         model_params=None,
         train_params=None,
         transform_fn=None,
         feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(
        datasource=datasource,
        fn='predict.txt',
        dataset_sql=select,
        feature_metas=feature_metas,
        feature_column_names=feature_column_names,
        label_meta=None,
        is_pai=is_pai,
        pai_table=pai_table,
        pai_single_file=True,
        cache=True,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
        transform_fn=transform_fn,
        feature_column_code=feature_column_code,
        raw_data_dir="predict.raw.dir")  # NOTE: default to use external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load data
    print("Start predicting XGBoost model...")

    selected_cols = db.selected_cols(conn, select)

    feature_file_id = 0
    train_label_name = train_label_meta["feature_name"]
    pred_label_name = pred_label_meta["feature_name"]
    for pred_dmatrix in dpred:
        predict_and_store_result(bst, pred_dmatrix, feature_file_id,
                                 model_params, selected_cols, train_label_name,
                                 pred_label_name, feature_column_names,
                                 feature_metas, is_pai, conn, result_table,
                                 hdfs_namenode_addr, hive_location, hdfs_user,
                                 hdfs_pass)
        feature_file_id += 1
    print("Done predicting. Predict table : %s" % result_table)
Example #9
def _explain(datasource,
             estimator_string,
             select,
             feature_columns,
             feature_column_names,
             feature_metas={},
             label_meta={},
             model_params={},
             save="",
             pai_table="",
             plot_type='bar',
             result_table="",
             oss_dest=None,
             oss_ak=None,
             oss_sk=None,
             oss_endpoint=None,
             oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    FLAGS = tf.app.flags.FLAGS
    model_params["model_dir"] = FLAGS.checkpointDir
    model_params.update(feature_columns)

    def _input_fn():
        dataset = input_fn("",
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=True,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    driver = "paiio"
    conn = PaiIOConnection.from_table(result_table) if result_table else None
    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, driver, conn,
                              "", "", "", "", oss_dest, oss_ak, oss_sk,
                              oss_endpoint, oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, driver, conn, "", "",
                     "", "", oss_dest, oss_ak, oss_sk, oss_endpoint,
                     oss_bucket_name)
Example #10
def get_pai_table_slice_count(table, nworkers, batch_size):
    if batch_size is None or batch_size <= 0:
        batch_size = 4096  # default batch_size

    row_cnt = PaiIOConnection.from_table(table).get_table_row_num()

    assert row_cnt >= nworkers, "Data number {} should not be " \
                                "less than worker number {}" \
        .format(row_cnt, nworkers)

    slice_num_per_worker = max(int(row_cnt / (nworkers * batch_size)), 1)
    slice_count = slice_num_per_worker * nworkers

    print('row_cnt = {}, slice_count = {}, nworkers = {}'.format(
        row_cnt, slice_count, nworkers))

    return slice_count
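
The slice-count arithmetic above, traced with hypothetical numbers (no PAI connection needed):

row_cnt, nworkers, batch_size = 100000, 4, 4096  # hypothetical table size
slice_num_per_worker = max(int(row_cnt / (nworkers * batch_size)), 1)
slice_count = slice_num_per_worker * nworkers
print(slice_count)  # 6 slices per worker * 4 workers = 24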
Example #11
def _evaluate(datasource,
              estimator_string,
              select,
              result_table,
              feature_columns,
              feature_column_names,
              feature_metas={},
              label_meta={},
              model_params={},
              validation_metrics=["Accuracy"],
              save="",
              batch_size=1,
              validation_steps=None,
              verbose=0,
              pai_table=""):
    estimator_cls = import_model(estimator_string)
    is_estimator = is_tf_estimator(estimator_cls)
    set_log_level(verbose, is_estimator)
    eval_dataset = get_dataset_fn(select,
                                  datasource,
                                  feature_column_names,
                                  feature_metas,
                                  label_meta,
                                  is_pai=True,
                                  pai_table=pai_table,
                                  batch_size=batch_size)

    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)
    if is_estimator:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        estimator = estimator_cls(**model_params)
        result_metrics = estimator_evaluate(estimator, eval_dataset,
                                            validation_metrics)
    else:
        keras_model = init_model_with_feature_column(estimator_cls,
                                                     model_params)
        keras_model_pkg = sys.modules[estimator_cls.__module__]
        result_metrics = keras_evaluate(keras_model, eval_dataset, save,
                                        keras_model_pkg, validation_metrics)

    if result_table:
        metric_name_list = ["loss"] + validation_metrics
        write_result_metrics(result_metrics, metric_name_list, result_table,
                             PaiIOConnection.from_table(result_table))
Example #12
def evaluate(datasource,
             select,
             feature_metas,
             feature_column_names,
             label_meta,
             result_table,
             validation_metrics=["accuracy_score"],
             is_pai=False,
             pai_table="",
             model_params=None,
             transform_fn=None,
             feature_column_code=""):
    if not is_pai:
        conn = db.connect_with_data_source(datasource)
    else:
        conn = PaiIOConnection.from_table(pai_table)
    dpred = xgb_dataset(datasource,
                        'predict.txt',
                        select,
                        feature_metas,
                        feature_column_names,
                        label_meta,
                        is_pai,
                        pai_table,
                        True,
                        True,
                        batch_size=DEFAULT_PREDICT_BATCH_SIZE,
                        transform_fn=transform_fn,
                        feature_column_code=feature_column_code
                        )  # NOTE: defaults to external memory
    bst = xgb.Booster({'nthread': 4})  # init model
    bst.load_model("my_model")  # load model
    if not model_params:
        model_params = load_metadata("model_meta.json")["attributes"]
    print("Start evaluating XGBoost model...")
    feature_file_id = 0
    for pred_dmatrix in dpred:
        evaluate_and_store_result(bst, pred_dmatrix, feature_file_id,
                                  validation_metrics, model_params,
                                  feature_column_names, label_meta, is_pai,
                                  conn, result_table)
        feature_file_id += 1
    print("Done evaluating. Result table : %s" % result_table)
Example #13
def pai_download_table_data_worker(dname, feature_metas, feature_column_names,
                                   label_meta, pai_table, slice_id,
                                   slice_count, feature_column_code,
                                   raw_data_dir):
    import runtime.xgboost as xgboost_extended
    feature_column_transformers = eval('[{}]'.format(feature_column_code))
    transform_fn = xgboost_extended.feature_column.ComposedColumnTransformer(
        feature_column_names, *feature_column_transformers)

    conn = PaiIOConnection.from_table(pai_table, slice_id, slice_count)
    gen = db.db_generator(conn, None, label_meta=label_meta)()
    selected_cols = db.selected_cols(conn, None)
    filename = "{}/{}.txt".format(dname, slice_id)
    dump_dmatrix(filename,
                 gen,
                 feature_column_names,
                 feature_metas,
                 label_meta,
                 selected_cols,
                 transform_fn=transform_fn,
                 raw_data_dir=raw_data_dir)
Example #14
def _predict(datasource,
             estimator_string,
             select,
             result_table,
             feature_columns,
             feature_column_names,
             feature_column_names_map,
             train_label_name,
             result_col_name,
             feature_metas={},
             model_params={},
             save="",
             batch_size=1,
             pai_table=""):
    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)

    conn = PaiIOConnection.from_table(pai_table)
    selected_cols = db.selected_cols(conn, None)
    predict_generator = db.db_generator(conn, None)

    pop_optimizer_and_loss(model_params)

    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      result_col_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, result_col_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
Example #15
def xgb_shap_dataset(datasource,
                     select,
                     feature_column_names,
                     label_meta,
                     feature_metas,
                     is_pai,
                     pai_explain_table,
                     transform_fn=None,
                     feature_column_code=""):
    if is_pai:
        # TODO(lhw): we may specify pai_explain_table in datasource
        # and discard the conditional here
        conn = PaiIOConnection.from_table(pai_explain_table)
        stream = db.db_generator(conn, None, label_meta)
    else:
        conn = db.connect_with_data_source(datasource)
        stream = db.db_generator(conn, select, label_meta)
    selected_cols = db.selected_cols(conn, select)

    if transform_fn:
        feature_names = transform_fn.get_feature_column_names()
    else:
        feature_names = feature_column_names

    xs = None
    dtypes = []
    sizes = []
    offsets = []

    i = 0
    for row, label in stream():
        features = db.read_features_from_row(row,
                                             selected_cols,
                                             feature_column_names,
                                             feature_metas,
                                             is_xgboost=True)
        if transform_fn:
            features = transform_fn(features)

        flatten_features = []
        for j, feature in enumerate(features):
            if len(feature) == 3:  # convert sparse to dense
                col_indices, values, dense_shape = feature
                size = int(np.prod(dense_shape))
                row_indices = np.zeros(shape=[col_indices.size])
                sparse_matrix = scipy.sparse.csr_matrix(
                    (values, (row_indices, col_indices)), shape=[1, size])
                values = sparse_matrix.toarray()
            else:
                values = feature[0]

            if isinstance(values, np.ndarray):
                flatten_features.extend(values.flatten().tolist())
                if i == 0:
                    sizes.append(values.size)
                    dtypes.append(infer_dtype(values))
            else:
                flatten_features.append(values)
                if i == 0:
                    sizes.append(1)
                    dtypes.append(infer_dtype(values))

        # Create the column name according to the feature number
        # of each column.
        #
        # If the column "c" contains only 1 feature, the result
        # column name would be "c" too.
        #
        # If the column "c" contains 3 features,
        # the result column name would be "c_0", "c_1" and "c_2"
        if i == 0:
            offsets = np.cumsum([0] + sizes)
            column_names = []
            for j in six.moves.range(len(offsets) - 1):
                start = offsets[j]
                end = offsets[j + 1]
                if end - start == 1:
                    column_names.append(feature_names[j])
                else:
                    for k in six.moves.range(start, end):
                        column_names.append('{}_{}'.format(
                            feature_names[j], k))

            xs = pd.DataFrame(columns=column_names)

        xs.loc[i] = flatten_features

        i += 1
    # NOTE(typhoonzero): set dtype to the feature's actual type, or the dtype
    # may be "object". Use below code to reproduce:
    # import pandas as pd
    # feature_column_names=["a", "b"]
    # xs = pd.DataFrame(columns=feature_column_names)
    # for i in range(10):
    #     xs.loc[i] = [int(j) for j in range(2)]
    # print(xs.dtypes)
    columns = xs.columns
    for i, dtype in enumerate(dtypes):
        for j in six.moves.range(offsets[i], offsets[i + 1]):
            xs[columns[j]] = xs[columns[j]].astype(dtype)

    return xs
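
A self-contained sketch (made-up sizes, standalone Python) of the column-naming rule above: single-value columns keep their names, while multi-value columns get suffixes indexed by the cumulative offsets:

import numpy as np
import six

feature_names = ["c", "b"]          # hypothetical columns
sizes = [3, 1]                      # "c" expands to 3 values, "b" to 1
offsets = np.cumsum([0] + sizes)    # [0, 3, 4]
column_names = []
for j in six.moves.range(len(offsets) - 1):
    start, end = offsets[j], offsets[j + 1]
    if end - start == 1:
        column_names.append(feature_names[j])
    else:
        for k in six.moves.range(start, end):
            column_names.append('{}_{}'.format(feature_names[j], k))
print(column_names)  # ['c_0', 'c_1', 'c_2', 'b']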
Example #16
def predict_step(datasource,
                 select,
                 result_table,
                 label_name,
                 model,
                 pai_table=None):
    if isinstance(model, six.string_types):
        model = Model.load_from_db(datasource, model)
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)

    model_params = model.get_meta("attributes")
    train_fc_map = model.get_meta("features")
    label_meta = model.get_meta("label")
    train_label_desc = label_meta.get_field_desc()[0] if label_meta else None
    train_label_name = train_label_desc.name if train_label_desc else None
    estimator_string = model.get_meta("class_name")
    save = "model_save"

    field_descs = get_ordered_field_descs(train_fc_map)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    feature_columns = compile_ir_feature_columns(train_fc_map,
                                                 model.get_type())

    is_pai = True if pai_table else False
    if is_pai:
        select = "SELECT * FROM %s" % pai_table

    conn = db.connect_with_data_source(datasource)
    result_column_names, train_label_idx = create_predict_table(
        conn, select, result_table, train_label_desc, label_name)

    if is_pai:
        conn.close()
        conn = PaiIOConnection.from_table(pai_table)
        select = None

    selected_cols = result_column_names[0:-1]
    if train_label_idx >= 0:
        selected_cols = selected_cols[0:train_label_idx] + [
            train_label_name
        ] + selected_cols[train_label_idx:]

    estimator = import_model(estimator_string)
    model_params.update(feature_columns)
    is_estimator = is_tf_estimator(estimator)
    predict_generator = db.db_generator(conn, select)

    pop_optimizer_and_loss(model_params)
    if not is_estimator:
        if not issubclass(estimator, tf.keras.Model):
            # functional model need field_metas parameter
            model_params["field_metas"] = feature_metas
        print("Start predicting using keras model...")
        keras_predict(estimator, model_params, save, result_table,
                      feature_column_names, feature_metas, train_label_name,
                      label_name, conn, predict_generator, selected_cols)
    else:
        model_params['model_dir'] = save
        print("Start predicting using estimator model...")
        estimator_predict(result_table, feature_column_names, feature_metas,
                          train_label_name, label_name, conn,
                          predict_generator, selected_cols)

    print("Done predicting. Predict table : %s" % result_table)
    conn.close()
Example #17
def predict(datasource,
            select,
            result_table,
            result_column_names,
            train_label_idx,
            model,
            extra_result_cols=[],
            pai_table=None):
    """TBD
    """
    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        # NOTE(typhoonzero): must run Model.load_from_db in a temp
        # directory, calling pyodps in current directory on PAI
        # workers will cause paiio fails.
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])

    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")
        raw_data_dir = os.path.join(tmp_dir_name, "predict_raw_dir")

        dpred = xgb_dataset(datasource=datasource,
                            fn=pred_fn,
                            dataset_sql=select,
                            feature_metas=feature_metas,
                            feature_column_names=feature_column_names,
                            label_meta=None,
                            cache=True,
                            batch_size=10000,
                            transform_fn=transform_fn,
                            raw_data_dir=raw_data_dir,
                            is_pai=is_pai,
                            pai_table=pai_table,
                            pai_single_file=True,
                            feature_column_code=fc_map_ir)

        print("Start predicting XGBoost model...")
        for idx, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = os.path.join(tmp_dir_name,
                                                 "predict.txt.raw")
            else:
                feature_file_name = os.path.join(
                    tmp_dir_name, "predict_raw_dir/predict.txt_%d" % idx)
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_predict_result(preds, result_table, result_column_names,
                                  train_label_idx, feature_file_name, conn)
        print("Done predicting. Predict table : %s" % result_table)

    conn.close()
Example #18
def explain(datasource,
            estimator_string,
            select,
            feature_columns,
            feature_column_names,
            feature_metas={},
            label_meta={},
            model_params={},
            save="",
            pai_table="",
            plot_type='bar',
            result_table="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    estimator_cls = import_model(estimator_string)
    is_pai = True if pai_table else False
    if is_pai:
        FLAGS = tf.app.flags.FLAGS
        model_params["model_dir"] = FLAGS.checkpointDir
        select = ""
    else:
        if is_tf_estimator(estimator_cls):
            model_params['model_dir'] = save
    model_params.update(feature_columns)
    pop_optimizer_and_loss(model_params)

    def _input_fn():
        dataset = input_fn(select,
                           datasource,
                           feature_column_names,
                           feature_metas,
                           label_meta,
                           is_pai=is_pai,
                           pai_table=pai_table)
        return dataset.batch(1).cache()

    estimator = init_model_with_feature_column(estimator_cls, model_params)
    if not is_tf_estimator(estimator_cls):
        load_keras_model_weights(estimator, save)

    if is_pai:
        conn = PaiIOConnection.from_table(
            result_table) if result_table else None
    else:
        conn = connect_with_data_source(datasource)

    if estimator_cls in (tf.estimator.BoostedTreesClassifier,
                         tf.estimator.BoostedTreesRegressor):
        explain_boosted_trees(datasource, estimator, _input_fn, plot_type,
                              result_table, feature_column_names, conn,
                              oss_dest, oss_ak, oss_sk, oss_endpoint,
                              oss_bucket_name)
    else:
        shap_dataset = pd.DataFrame(columns=feature_column_names)
        for i, (features, label) in enumerate(_input_fn()):
            shap_dataset.loc[i] = [
                item.numpy()[0][0] for item in features.values()
            ]
        explain_dnns(datasource, estimator, shap_dataset, plot_type,
                     result_table, feature_column_names, conn, oss_dest,
                     oss_ak, oss_sk, oss_endpoint, oss_bucket_name)

    if conn is not None:
        conn.close()
Example #19
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_meta,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None,
            transform_fn=None,
            feature_column_code=""):
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
            # TODO(typhoonzero): the shape of shap_values is
            # (3, num_samples, num_features); only the first
            # element is written here, find out how to use
            # the other two.
        else:
            conn = db.connect_with_data_source(datasource)

        write_shap_values(shap_values[0], conn, result_table,
                          feature_column_names)
        return

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), oss_dest, oss_ak, oss_sk,
            oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), oss_dest,
            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #20
def evaluate(datasource,
             select,
             result_table,
             model,
             label_name=None,
             model_params=None,
             result_column_names=[],
             pai_table=None):
    """TBD
    """
    if model_params is None:
        model_params = {}
    validation_metrics = model_params.get("validation.metrics",
                                          "accuracy_score")
    validation_metrics = [m.strip() for m in validation_metrics.split(",")]

    bst = xgb.Booster()
    if isinstance(model, six.string_types):
        with temp_file.TemporaryDirectory(as_cwd=True):
            model = Model.load_from_db(datasource, model)
            bst.load_model("my_model")
    else:
        assert isinstance(model,
                          Model), "not supported model type %s" % type(model)
        bst.load_model("my_model")

    model_params = model.get_meta("attributes")
    fc_map_ir = model.get_meta("features")
    train_label = model.get_meta("label")
    train_label_desc = train_label.get_field_desc()[0]

    if label_name:
        train_label_desc.name = label_name

    feature_columns = compile_ir_feature_columns(fc_map_ir,
                                                 EstimatorType.XGBOOST)
    field_descs = get_ordered_field_descs(fc_map_ir)
    feature_column_names = [fd.name for fd in field_descs]
    feature_metas = dict([(fd.name, fd.to_dict(dtype_to_string=True))
                          for fd in field_descs])
    transform_fn = ComposedColumnTransformer(
        feature_column_names, *feature_columns["feature_columns"])

    is_pai = True if pai_table else False
    if is_pai:
        conn = PaiIOConnection.from_table(pai_table)
    else:
        conn = db.connect_with_data_source(datasource)

    with temp_file.TemporaryDirectory() as tmp_dir_name:
        pred_fn = os.path.join(tmp_dir_name, "predict.txt")

        dpred = xgb_dataset(
            datasource=datasource,
            fn=pred_fn,
            dataset_sql=select,
            feature_metas=feature_metas,
            feature_column_names=feature_column_names,
            label_meta=train_label_desc.to_dict(dtype_to_string=True),
            cache=True,
            batch_size=10000,
            transform_fn=transform_fn,
            is_pai=is_pai,
            pai_table=pai_table,
            pai_single_file=True,
            feature_column_code=fc_map_ir)

        for i, pred_dmatrix in enumerate(dpred):
            if is_pai:
                feature_file_name = pred_fn
            else:
                feature_file_name = pred_fn + "_%d" % i
            preds = _calc_predict_result(bst, pred_dmatrix, model_params)
            _store_evaluate_result(preds, feature_file_name, train_label_desc,
                                   result_table, result_column_names,
                                   validation_metrics, conn)

    conn.close()