Example #1: explain_dnns using a SHAP KernelExplainer over a TensorFlow estimator (batch size 1, probability of the first class).
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)

        return np.array(
            [p['probabilities'][0] for p in estimator.predict(input_fn)])

    shap_values = shap.KernelExplainer(predict,
                                       shap_dataset).shap_values(shap_dataset)
    print(shap_values)
    for row in shap_values:
        print(list(row))
        print(len(list(row)))
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
Example #2: explain_dnns with a kmeans-summarized background set, batched prediction, and l1_reg="aic".
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        return np.array(
            [p['probabilities'][-1] for p in estimator.predict(input_fn)])

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
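The two DNN examples above follow the same KernelExplainer recipe: wrap the model's prediction in a function that returns one score per row, optionally summarize the background data with shap.kmeans, and hand both to shap.KernelExplainer. Below is a minimal self-contained sketch of that recipe with a scikit-learn classifier standing in for the TensorFlow estimator; the data, column names, and model are placeholders, not part of the original code.

import numpy as np
import pandas as pd
import shap
from sklearn.linear_model import LogisticRegression

# Placeholder data and model standing in for shap_dataset and the TF estimator.
X = pd.DataFrame(np.random.rand(200, 4), columns=["f0", "f1", "f2", "f3"])
y = (X["f0"] + X["f1"] > 1.0).astype(int)
model = LogisticRegression().fit(X, y)

def predict(d):
    # Like the nested predict(d) above: return one score per input row.
    return model.predict_proba(pd.DataFrame(d, columns=X.columns))[:, 1]

# Summarize the background set to 16 weighted samples, as Example #2 does.
background = shap.kmeans(X, 16)
shap_values = shap.KernelExplainer(predict, background).shap_values(X.iloc[:20])
shap.summary_plot(shap_values, X.iloc[:20], show=False)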
Example #3: explain for XGBoost models, producing only a SHAP summary plot.
def explain(datasource, select, feature_field_meta, label_name,
            summary_params):
    feature_column_names = [k["name"] for k in feature_field_meta]
    feature_specs = {k['name']: k for k in feature_field_meta}
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_name,
                         feature_specs)

    shap_values = xgb_shap_values(x)

    # save summary.png using the default backend
    explainer.plot_and_save(lambda: shap.summary_plot(
        shap_values, x, show=False, **summary_params))
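xgb_shap_values is a helper defined elsewhere in the module. The sketch below shows what such a helper plausibly does with shap.TreeExplainer; the training step is a placeholder standing in for the model trained earlier, not the original helper. Examples #4 and #8 additionally return explainer.shap_interaction_values(x) and explainer.expected_value.

import numpy as np
import pandas as pd
import shap
import xgboost as xgb

# Placeholder training data and booster; the real helper presumably loads the
# previously trained model instead of training one inline.
x = pd.DataFrame(np.random.rand(100, 3), columns=["a", "b", "c"])
y = (x["a"] > 0.5).astype(int)
bst = xgb.train({"objective": "binary:logistic"},
                xgb.DMatrix(x, label=y), num_boost_round=10)

def xgb_shap_values(x):
    # TreeExplainer computes SHAP values directly from the tree ensemble.
    return shap.TreeExplainer(bst).shap_values(x)

shap_values = xgb_shap_values(x)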
Example #4: explain for XGBoost models with result-table output, PAI/OSS support, and an optional decision plot.
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_spec,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_field_meta, is_pai, pai_explain_table)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            # TODO(typhoonzero): the shape of shap_values is (3, num_samples, num_features)
            # use the first dimension here, should find out how to use the other two.
            write_shap_values(shap_values[0], "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values[0], conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
        return

    if summary_params.get("plot_type") == "decision":
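        # feature_display_range=slice(None, -40, -1) below tells shap.decision_plot
        # to show only the most important features, in descending order of importance.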
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), is_pai, oss_dest, oss_ak,
            oss_sk, oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), is_pai,
            oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #5: explain_dnns that writes plots to OSS and adapts the prediction output to the plot type.
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                 oss_endpoint, oss_bucket_name):
    def predict(d):
        if len(d) == 1:
            # This is to make sure the progress bar of SHAP display properly:
            # 1. The newline makes the progress bar string captured in pipe
            # 2. The ASCII control code moves cursor up twice for alignment
            print("\033[A" * 2)

        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        if plot_type == 'bar':
            predictions = [
                p['logits'] if 'logits' in p else p['predictions']
                for p in estimator.predict(input_fn)
            ]
        else:
            predictions = [
                p['logits'][-1] if 'logits' in p else p['predictions'][-1]
                for p in estimator.predict(input_fn)
            ]
        return np.array(predictions)

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(
        lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type),
        is_pai, oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #6: minimal explain_dnns variant that only plots a SHAP summary.
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, hdfs_namenode_addr, hive_location,
                 hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d, columns=shap_dataset.columns))).batch(1)

        return np.array(
            [p['probabilities'][0] for p in estimator.predict(input_fn)])

    shap_values = shap.KernelExplainer(predict,
                                       shap_dataset).shap_values(shap_dataset)
    explainer.plot_and_save(lambda: shap.summary_plot(
        shap_values, shap_dataset, show=False, plot_type=plot_type))
Example #7: explain_boosted_trees, writing directional feature contributions (DFC) and gain importances to a result table.
def explain_boosted_trees(datasource, estimator, input_fn, plot_type,
                          result_table, feature_column_names,
                          hdfs_namenode_addr, hive_location, hdfs_user,
                          hdfs_pass):
    result = estimator.experimental_predict_with_explanations(input_fn)
    pred_dicts = list(result)
    df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
    dfc_mean = df_dfc.abs().mean()
    if result_table != "":
        conn = connect_with_data_source(datasource)
        gain = estimator.experimental_feature_importances(normalize=True)
        create_explain_result_table(conn, result_table)
        write_dfc_result(dfc_mean, gain, result_table, conn,
                         feature_column_names, hdfs_namenode_addr,
                         hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(lambda: eval(plot_type)(df_dfc))
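Here eval(plot_type) looks up a plotting function by name, so plot_type is expected to name a function in scope that accepts the per-example DFC frame. The sketch below shows one hypothetical such function and an eval-free alternative; the name bar and the exact plot are assumptions, not taken from the original module.

import pandas as pd
import matplotlib.pyplot as plt

def bar(df_dfc):
    # Plot the mean absolute directional feature contribution of each feature.
    df_dfc.abs().mean().sort_values().plot(kind="barh")
    plt.xlabel("mean |directional feature contribution|")

# Placeholder frame in place of the df_dfc built above.
bar(pd.DataFrame({"f0": [0.1, -0.2], "f1": [0.3, 0.0]}))

# With such a function in scope, eval("bar")(df_dfc) is equivalent to bar(df_dfc);
# an explicit mapping like {"bar": bar}[plot_type](df_dfc) avoids eval.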
Example #8: explain for XGBoost models, choosing between a decision plot and a summary plot.
def explain(datasource, select, feature_field_meta, label_spec,
            summary_params):
    feature_column_names = [k["name"] for k in feature_field_meta]
    feature_specs = {k['name']: k for k in feature_field_meta}
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_specs)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(lambda: shap.decision_plot(
            expected_value,
            shap_interaction_values,
            x,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1))
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, x, show=False, **summary_params))
Example #9: explain_boosted_trees with PAI and OSS support.
def explain_boosted_trees(datasource, estimator, input_fn, plot_type,
                          result_table, feature_column_names, is_pai,
                          pai_table, hdfs_namenode_addr, hive_location,
                          hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                          oss_endpoint, oss_bucket_name):
    result = estimator.experimental_predict_with_explanations(input_fn)
    pred_dicts = list(result)
    df_dfc = pd.DataFrame([pred['dfc'] for pred in pred_dicts])
    dfc_mean = df_dfc.abs().mean()
    gain = estimator.experimental_feature_importances(normalize=True)
    if result_table != "":
        if is_pai:
            write_dfc_result(dfc_mean, gain, result_table, "pai_maxcompute",
                             None, feature_column_names, hdfs_namenode_addr,
                             hive_location, hdfs_user, hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_dfc_result(dfc_mean, gain, result_table, conn.driver, conn,
                             feature_column_names, hdfs_namenode_addr,
                             hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(lambda: eval(plot_type)(df_dfc), is_pai, oss_dest,
                            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
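For context, here is a hedged sketch of where the estimator and input_fn arguments in Examples #7 and #9 plausibly come from, assuming a TensorFlow release that still ships tf.estimator; the feature names, data, and hyperparameters are placeholders.

import numpy as np
import pandas as pd
import tensorflow as tf

# Placeholder data; the real pipeline reads from the configured datasource.
df = pd.DataFrame({"f0": np.random.rand(256), "f1": np.random.rand(256)})
labels = (df["f0"] > 0.5).astype(int)

feature_columns = [tf.feature_column.numeric_column(name) for name in df.columns]

def train_input_fn():
    return tf.data.Dataset.from_tensor_slices((dict(df), labels)).batch(32)

def input_fn():
    return tf.data.Dataset.from_tensor_slices(dict(df)).batch(32)

estimator = tf.estimator.BoostedTreesClassifier(
    feature_columns, n_batches_per_layer=8, n_trees=10)
estimator.train(train_input_fn, max_steps=20)

# Each prediction dict carries per-example directional feature contributions
# under the 'dfc' key, which is what df_dfc collects above.
pred_dicts = list(estimator.experimental_predict_with_explanations(input_fn))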