Esempio n. 1
0
def display_shapley(X_train, y_train, cf=("dcor",), xlabels=None):
    """Plot a grouped bar chart of Shapley values, one bar group per feature.

    For every characteristic-function name in ``cf`` the Shapley values are
    computed with ``shapley.calc_shapley_values(X_train, y_train, name)`` and
    drawn as a bar series; each series is shifted by 0.1 along the x axis so
    the series sit side by side.

    Args:
        X_train: 2-D array-like exposing ``.shape``; ``shape[1]`` is used as
            the number of features (bars per series).
        y_train: Target values, passed through to ``calc_shapley_values``.
        cf: A single characteristic-function name or an iterable of names.
        xlabels: Tick labels for the features. Defaults to the feature
            indices ``0 .. d-1``.
    """
    # A bare string must become a one-element list; list("dcor") would
    # otherwise split it into single characters.
    if isinstance(cf, str):
        cf = [cf]
    else:
        cf = list(cf)

    d = X_train.shape[1]
    x_range = list(range(d))

    if xlabels is None:
        xlabels = x_range

    _, ax = plt.subplots()
    ax = nice_axes(ax)

    for _n, _cf in enumerate(cf):
        print(_cf)
        _shapley_values = shapley.calc_shapley_values(X_train, y_train, _cf)

        # Offset each series by one bar width so the series do not overlap.
        plt.bar(x_range + 0.1 * _n * numpy.ones(d),
                _shapley_values,
                alpha=0.5,
                label=_cf,
                width=0.1)

    ax.set_xticks(x_range)
    ax.set_xticklabels(xlabels, rotation=90)
    plt.title(r"Shapley decomposition of {0} on training data".format(cf))
    plt.legend()
    plt.draw()
Esempio n. 2
0
def display_shapley_vs_xgb(X_test, y_test, y_pred, cf="dcor"):
    """Overlay the Shapley values obtained with true and predicted targets.

    Two bar series are drawn at the same x positions with 50% transparency:
    red for ``shapley.calc_shapley_values(X_test, y_test, cf)`` and blue for
    ``shapley.calc_shapley_values(X_test, y_pred, cf)``.

    Args:
        X_test: Feature data passed to ``calc_shapley_values``.
        y_test: True target values.
        y_pred: Predicted target values.
        cf: Name of the characteristic function; also shown in the title.
    """
    shapley_values_actual = shapley.calc_shapley_values(X_test, y_test, cf)
    shapley_values_xgb = shapley.calc_shapley_values(X_test, y_pred, cf)

    _, ax = plt.subplots()
    ax = nice_axes(ax)

    plt.title(
        r"Shapley decomposition of {0} on $X$ with true vs predicted $Y$".
        format(cf))

    # Both series share x positions on purpose; alpha makes the overlap visible.
    plt.bar(range(len(shapley_values_actual)),
            shapley_values_actual,
            color="red",
            alpha=0.5,
            label="True")
    plt.bar(range(len(shapley_values_xgb)),
            shapley_values_xgb,
            color="blue",
            alpha=0.5,
            label="Predicted")
    plt.legend()
    plt.draw()
Esempio n. 3
0
def display_residuals_shapley(x, residuals, cf="dcor"):
    """Plot the Shapley values of the features with respect to the residuals.

    Args:
        x: Feature data passed to ``shapley.calc_shapley_values``.
        residuals: Residual values (the title describes them as
            true minus predicted Y), used as the target.
        cf: Name of the characteristic function; also shown in the title.
    """
    shapley_values_residuals = shapley.calc_shapley_values(x, residuals, cf)

    _, ax = plt.subplots()
    ax = shapley.nice_axes(ax)

    plt.bar(range(len(shapley_values_residuals)),
            shapley_values_residuals,
            color="red",
            alpha=0.5)

    plt.title(
        r"Shapley decomposition of {0} on $X$ with the residuals (true-predicted $Y$)"
        .format(cf))

    plt.draw()
Esempio n. 4
0
# Demo script: build synthetic features, fit an XGBRegressor, and compare the
# Shapley decomposition ("dcor") of the true test targets with that of the
# model's predictions.
# NOTE(review): N and D are not defined in this excerpt -- presumably the
# sample count and feature count set earlier in the script; confirm.
sigma = 0.2  # standard deviation of the Gaussian noise added below
# D independent uniform(-1, 1) samples of length N, transposed to shape (N, D).
X = numpy.array([numpy.random.uniform(-1, 1, N) for _ in range(D)]).T
# Overwrite columns 1 and 2 with noisy copies of column 0.
X[:, 1] = X[:, 0] + numpy.random.normal(0, sigma, N)
X[:, 2] = X[:, 0] + numpy.random.normal(0, sigma, N)
# Target: row-wise sum of the squared features.
Y = numpy.matmul(numpy.multiply(X, X), numpy.ones(D))
# ---

# Split with test_size=0.2 and a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=7)

# --- Fit model
model = XGBRegressor()
model.fit(X_train, y_train)

# --- Predict
y_pred = model.predict(X_test)

# --- Feature importances
feature_importance = model.feature_importances_

# Shapley values on the test features: once against the true targets, once
# against the model predictions.
shapley_values_actual = shapley.calc_shapley_values(X_test, y_test, "dcor")
shapley_values_xgb = shapley.calc_shapley_values(X_test, y_pred, "dcor")

print(shapley_values_actual)
print(shapley_values_xgb)
# NOTE(review): these helpers are called without arguments and are not defined
# in this excerpt -- presumably they read the module-level variables above;
# confirm against their definitions.
display_predictions()
display_feature_importances()
display_shapley()
Esempio n. 5
0
# --- Drop every row of X_data that contains a NaN.
# NOTE(review): y_data is not filtered here; if any rows are dropped, X_data
# and y_data may no longer be row-aligned -- confirm against the loading code.
X_data = X_data.dropna(axis=0)
#print(X_data.isnull().any())

# Keep only the columns listed in `feats` and convert both to numpy arrays.
X_data = np.array(X_data[feats])
y_data = np.array(y_data)
# Take index 0 of the transposed targets (the first column when y_data is 2-D).
y_data = y_data.T[0]
print(X_data.shape)
print(y_data.shape)

# --- Plot all data
#sns.pairplot(data_cont, hue='sex', size=2.5)
#plt.show()

# Feature indices 0 .. n_features-1, passed to calc_shapley_values below.
x_range = list(range(X_data.shape[1]))
# NOTE(review): calc_shapley_values is called here with four positional
# arguments (X, y, x_range, cf name) -- confirm this matches the signature of
# the shapley module version in use.
shapley_values = shapley.calc_shapley_values(X_data, y_data, x_range, "r2")
print(shapley_values)
shapley_values = shapley.calc_shapley_values(X_data, y_data, x_range, "dcor")
print(shapley_values)
display_shapley(X_data, y_data, cf=["r2"])
display_shapley(X_data, y_data, cf=["dcor"])
#plt.show()
#display_shapley(X_data, y_data, cf=["dcor", "aidc", "r2", "hsic"])

# NOTES
# The script terminates here; the statements below sys.exit() never run and
# appear to be kept as notes on loading the Cleveland data file.
sys.exit()
data = pd.read_csv('processed.cleveland.data', sep=",", header=0)
data = data.iloc[:, [0, 1, 3, 4, 7, 9, 13]]
target = "num"
y = data[target].astype('int')
X = data.drop([target], axis=1)
Esempio n. 6
0
    _, ax = plt.subplots()
    if np.isnan(X_shapley).any():
        print("Data contains nan. Exiting")
        sys.exit()

    for _n, _cf in enumerate(["dcor", "r2", "aidc"]):
        print(_cf)
        _sfilename = "results/shapley_expl_{0}_{1}.pickle".format(
            _cf, modelname)

        if os.path.isfile(_sfilename):
            with open(_sfilename, 'rb') as _f:
                _shapley_values = pickle.load(_f)
        else:
            _shapley_values = shapley.calc_shapley_values(
                X_shapley, y_shapley, x_range, _cf)
            with open(_sfilename, 'wb') as _f:
                pickle.dump(_shapley_values, _f)

        print(_shapley_values)
        plt.bar(x_range + 0.1 * _n * np.ones(d),
                _shapley_values,
                alpha=0.5,
                label=_cf,
                width=0.1)

    plt.title("Target")
    plt.legend()
    ax.set_xticks(x_range)
    ax.set_xticklabels(labels, rotation=90)
    plt.draw()
Esempio n. 7
0
    2.6854023615295426, -0.7195432425919883, 0.4681875927260907,
    0.027206421899317337, 0.6396687066390925, 0.203168750153398,
    -1.0873217125552856
])


def aidc(x, y):
    """Return the distance correlation of x and y after covariance scaling.

    ``x`` is multiplied by the square root of the inverse of its covariance
    (a scalar when ``x`` has a single feature, a matrix square root
    otherwise), ``y`` is scaled by the inverse square root of its variance,
    and ``dcor.distance_correlation`` is evaluated on the transformed data.

    Args:
        x: Array of observations; ``np.cov(x.T)`` is used as its covariance.
        y: 1-D array of target values; ``np.cov(y)`` is used as a scalar
            variance.

    Returns:
        The value returned by ``dcor.distance_correlation``.
    """
    cov_y = np.cov(y)
    cov_x = np.cov(x.T)

    # np.cov returns a 0-d array for a single variable. Test the rank
    # explicitly instead of comparing the shape tuple by identity.
    if cov_x.ndim == 0:
        inv_cov_x = 1.0 / cov_x
        x_trans = np.dot(x, np.sqrt(inv_cov_x))
    else:
        inv_cov_x = np.linalg.inv(cov_x)
        x_trans = np.dot(x, scipy.linalg.sqrtm(inv_cov_x))

    inv_cov_y = 1 / cov_y
    y_trans = np.dot(y, np.sqrt(inv_cov_y))
    return dcor.distance_correlation(x_trans, y_trans)


# Disabled ad-hoc checks, kept for reference.
# NOTE(review): x, y, x1 and y1 are defined outside this excerpt.
#print(aidc(x, y))
#print(aidc(x1, y1))

#print(dcor.distance_correlation_af_inv(x, y))
#print(dcor.distance_correlation_af_inv(x1, y1))

#print(calc_shapley_values(np.array(x), np.array(y), cf_name="r2"))
#print(calc_shapley_values(np.array(x), np.array(y), cf_name="dcor"))
# Only the "aidc" characteristic function is evaluated and printed.
print(calc_shapley_values(np.array(x), np.array(y), cf_name="aidc"))
Esempio n. 8
0
def _shapley_bar_panel(X, y, x_range, labels, title, cache_tag, modelname):
    """Draw one figure of Shapley values for the dcor, r2 and aidc functions.

    Values are read from ``results/shapley_<cache_tag>_<cf>_<modelname>.pickle``
    when that file exists; otherwise they are computed with
    ``shapley.calc_shapley_values`` and written to that file.
    """
    _, ax = plt.subplots()
    if np.isnan(X).any():
        print("Data contains nan. Exiting")
        sys.exit()

    positions = np.asarray(x_range, dtype=float)
    for _n, _cf in enumerate(["dcor", "r2", "aidc"]):
        print(_cf)
        _sfilename = "results/shapley_{0}_{1}_{2}.pickle".format(
            cache_tag, _cf, modelname)

        if os.path.isfile(_sfilename):
            # NOTE: pickle.load executes arbitrary code from the file; only
            # use cache files produced by this script.
            with open(_sfilename, 'rb') as _f:
                _shapley_values = pickle.load(_f)
        else:
            _shapley_values = shapley.calc_shapley_values(X, y, x_range, _cf)
            # Write the cache only when the values were freshly computed.
            with open(_sfilename, 'wb') as _f:
                pickle.dump(_shapley_values, _f)

        print(_shapley_values)
        # Shift each series by one bar width so the series sit side by side.
        plt.bar(positions + 0.1 * _n,
                _shapley_values,
                alpha=0.5,
                label=_cf,
                width=0.1)

    plt.title(title)
    plt.legend()
    ax.set_xticks(x_range)
    ax.set_xticklabels(labels, rotation=90)
    plt.draw()


def do_shapley(modelname, preds, labels):
    """Plot Shapley decompositions for the target, predictions and residuals.

    Three figures are produced, titled "Target", "Predictions" and
    "Residuals". Each shows bar series for the "dcor", "r2" and "aidc"
    characteristic functions. Results are cached as pickles under
    ``results/``.

    Reads the module-level names ``X_shapley``, ``y_shapley``, ``X_test``,
    ``y_test`` and ``shapley_features``. Calls ``sys.exit()`` if the feature
    data contains NaN.

    Args:
        modelname: Identifier embedded in the cache file names.
        preds: Model predictions; used as the target for the "Predictions"
            figure and subtracted from ``y_test`` for the "Residuals" figure.
        labels: Tick labels for the "Target" figure; also pickled once to
            ``results/shapley_features_<modelname>.pickle``.
    """
    _sfilename = f"results/shapley_features_{modelname}.pickle"
    if not os.path.isfile(_sfilename):
        with open(_sfilename, 'wb') as _f:
            pickle.dump(labels, _f)

    # --- On data
    d = X_shapley.shape[1]
    x_range = list(range(d))
    _shapley_bar_panel(X_shapley, y_shapley, x_range, labels, "Target",
                       "expl", modelname)

    # --- On predictions
    X_shapley_pred = X_test.copy()
    X_shapley_pred["sex_isFemale"] = [
        1 if _x else 0 for _x in X_shapley_pred["sex_isFemale"]
    ]
    # Label the ticks with the columns actually selected, so the number and
    # order of labels match the plotted features.
    X_shapley_pred = X_shapley_pred[shapley_features]
    pred_labels = X_shapley_pred.columns
    X_shapley_pred = np.array(X_shapley_pred)
    _shapley_bar_panel(X_shapley_pred, preds, x_range, pred_labels,
                       "Predictions", "pred", modelname)

    # --- On residuals
    residuals = y_test - preds
    print(X_shapley_pred.shape)
    print(residuals.shape)
    _shapley_bar_panel(X_shapley_pred, residuals, x_range, pred_labels,
                       "Residuals", "res", modelname)