def display_shapley(X_train, y_train, cf=None, xlabels=None):
    """Bar-plot Shapley values of the training features.

    One set of bars is drawn per characteristic function named in ``cf``.
    Each set is shifted 0.1 to the right of the previous one so that the
    sets sit side by side instead of on top of each other.

    Args:
        X_train: 2-D feature matrix; ``X_train.shape[1]`` gives the number
            of features (bars per set).
        y_train: Target values, forwarded to
            ``shapley.calc_shapley_values``.
        cf: Name of a characteristic function, or a list of such names.
            Defaults to ``["dcor"]``.
        xlabels: Tick labels, one per feature. Defaults to the feature
            indices ``0 .. d-1``.
    """
    if cf is None:
        cf = ["dcor"]
    elif isinstance(cf, str):
        # list("dcor") would split the name into single characters.
        cf = [cf]
    elif not isinstance(cf, list):
        cf = list(cf)

    d = X_train.shape[1]
    x_range = list(range(d))
    if xlabels is None:
        # Fall back to the feature indices so set_xticklabels never
        # receives None.
        xlabels = x_range

    _, ax = plt.subplots()
    ax = nice_axes(ax)

    for _n, _cf in enumerate(cf):
        print(_cf)
        _shapley_values = shapley.calc_shapley_values(X_train, y_train, _cf)
        plt.bar(
            numpy.arange(d) + 0.1 * _n,
            _shapley_values,
            alpha=0.5,
            label=_cf,
            width=0.1,
        )

    ax.set_xticks(x_range)
    ax.set_xticklabels(xlabels, rotation=90)
    plt.title(r"Shapley decomposition of {0} on training data".format(cf))
    plt.legend()
    plt.draw()
def display_shapley_vs_xgb(X_test, y_test, y_pred, cf="dcor"):
    """Overlay Shapley values of ``X_test`` against true and predicted targets.

    Two semi-transparent bar series are drawn on the same axes: red for the
    values computed with ``y_test`` and blue for those computed with
    ``y_pred``.

    Args:
        X_test: Feature matrix passed to ``shapley.calc_shapley_values``.
        y_test: True target values.
        y_pred: Model predictions for ``X_test``.
        cf: Name of the characteristic function (default ``"dcor"``).
    """
    # Not used below; the lookup is retained from the original code.
    n_features = X_test.shape[1]

    # (values, bar colour, legend label) -- computed before any plotting.
    series = (
        (shapley.calc_shapley_values(X_test, y_test, cf), "red", "True"),
        (shapley.calc_shapley_values(X_test, y_pred, cf), "blue", "Predicted"),
    )

    _, axes = plt.subplots()
    axes = nice_axes(axes)
    plt.title(
        r"Shapley decomposition of {0} on $X$ with true vs predicted $Y$".format(cf)
    )
    for values, colour, legend_label in series:
        plt.bar(
            range(len(values)),
            values,
            color=colour,
            alpha=0.5,
            label=legend_label,
        )
    plt.legend()
    plt.draw()
def display_residuals_shapley(x, residuals, cf="dcor"):
    """Bar-plot the Shapley values of ``x`` with respect to the residuals.

    Args:
        x: Feature matrix passed to ``shapley.calc_shapley_values``.
        residuals: Residuals used as the target of the decomposition.
        cf: Name of the characteristic function (default ``"dcor"``).
    """
    # Not used below; the lookup is retained from the original code.
    n_features = x.shape[1]

    values = shapley.calc_shapley_values(x, residuals, cf)

    _, axes = plt.subplots()
    axes = shapley.nice_axes(axes)

    positions = range(len(values))
    plt.bar(positions, values, color="red", alpha=0.5)
    plt.title(
        r"Shapley decomposition of {0} on $X$ with the residuals (true-predicted $Y$)".format(cf)
    )
    plt.draw()
# --- Synthetic data
# D features drawn uniformly from [-1, 1] (N samples each). Columns 1 and 2
# are then overwritten with column 0 plus Gaussian noise of scale sigma.
# N and D are defined outside this fragment.
sigma = 0.2
X = numpy.transpose(
    numpy.array([numpy.random.uniform(low=-1, high=1, size=N) for _ in range(D)])
)
X[:, 1] = X[:, 0] + numpy.random.normal(loc=0, scale=sigma, size=N)
X[:, 2] = X[:, 0] + numpy.random.normal(loc=0, scale=sigma, size=N)
# Target: for each row, the sum of the squared feature values.
Y = (X * X) @ numpy.ones(D)

# --- Train/test split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=7
)

# --- Fit model
model = XGBRegressor()
model.fit(X_train, y_train)

# --- Predict
y_pred = model.predict(X_test)

# --- Feature importances
feature_importance = model.feature_importances_

# Shapley values ("dcor") of the test features against the true and the
# predicted targets.
shapley_values_actual = shapley.calc_shapley_values(X_test, y_test, "dcor")
shapley_values_xgb = shapley.calc_shapley_values(X_test, y_pred, "dcor")
print(shapley_values_actual)
print(shapley_values_xgb)

display_predictions()
display_feature_importances()
display_shapley()
# --- Check for NaNs:
# NOTE(review): rows containing NaN are dropped from X_data only; confirm
# that y_data is already row-aligned with the result.
#print(X_data.isnull().any())
X_data = np.array(X_data.dropna(axis=0)[feats])
# First row of the transposed array, i.e. the first column of y_data.
y_data = np.array(y_data).T[0]
print(X_data.shape)
print(y_data.shape)

# --- Plot all data
#sns.pairplot(data_cont, hue='sex', size=2.5)
#plt.show()

# One index per feature column.
x_range = list(range(X_data.shape[1]))

shapley_values = shapley.calc_shapley_values(X_data, y_data, x_range, "r2")
print(shapley_values)
shapley_values = shapley.calc_shapley_values(X_data, y_data, x_range, "dcor")
print(shapley_values)

display_shapley(X_data, y_data, cf=["r2"])
display_shapley(X_data, y_data, cf=["dcor"])
#plt.show()
#display_shapley(X_data, y_data, cf=["dcor", "aidc", "r2", "hsic"])

# NOTES
sys.exit()

# The statements below follow the sys.exit() call above.
data = pd.read_csv('processed.cleveland.data', sep=",", header=0)
data = data.iloc[:, [0, 1, 3, 4, 7, 9, 13]]
target = "num"
y = data[target].astype('int')
X = data.drop([target], axis=1)
# Bar-plot Shapley values of X_shapley against y_shapley for three
# characteristic functions, caching each result as a pickle under results/.
_, ax = plt.subplots()

if np.isnan(X_shapley).any():
    print("Data contains nan. Exiting")
    sys.exit()

for _n, _cf in enumerate(["dcor", "r2", "aidc"]):
    print(_cf)
    _sfilename = "results/shapley_expl_{0}_{1}.pickle".format(_cf, modelname)

    if not os.path.isfile(_sfilename):
        # No cached result yet: compute it and store it for later runs.
        _shapley_values = shapley.calc_shapley_values(
            X_shapley, y_shapley, x_range, _cf
        )
        with open(_sfilename, 'wb') as _f:
            pickle.dump(_shapley_values, _f)
    else:
        with open(_sfilename, 'rb') as _f:
            _shapley_values = pickle.load(_f)

    print(_shapley_values)
    # Shift each series by 0.1 per characteristic function so the bars sit
    # side by side.
    plt.bar(
        x_range + 0.1 * _n * np.ones(d),
        _shapley_values,
        alpha=0.5,
        label=_cf,
        width=0.1,
    )

plt.title("Target")
plt.legend()
ax.set_xticks(x_range)
ax.set_xticklabels(labels, rotation=90)
plt.draw()
2.6854023615295426, -0.7195432425919883, 0.4681875927260907, 0.027206421899317337, 0.6396687066390925, 0.203168750153398, -1.0873217125552856 ]) def aidc(x, y): cov_y = np.cov(y) cov_x = np.cov(x.T) if cov_x.shape is (): inv_cov_x = 1.0 / cov_x x_trans = np.dot(x, np.sqrt(inv_cov_x)) else: inv_cov_x = np.linalg.inv(cov_x) x_trans = np.dot(x, scipy.linalg.sqrtm(inv_cov_x)) inv_cov_y = 1 / cov_y y_trans = np.dot(y, np.sqrt(inv_cov_y)) return dcor.distance_correlation(x_trans, y_trans) #print(aidc(x, y)) #print(aidc(x1, y1)) #print(dcor.distance_correlation_af_inv(x, y)) #print(dcor.distance_correlation_af_inv(x1, y1)) #print(calc_shapley_values(np.array(x), np.array(y), cf_name="r2")) #print(calc_shapley_values(np.array(x), np.array(y), cf_name="dcor")) print(calc_shapley_values(np.array(x), np.array(y), cf_name="aidc"))
def _plot_shapley_bars(X, y, x_range, labels, kind, modelname, title):
    """Plot Shapley values of ``X`` against ``y``, one bar series per measure.

    For each of the characteristic functions ``"dcor"``, ``"r2"`` and
    ``"aidc"`` the values are loaded from
    ``results/shapley_<kind>_<cf>_<modelname>.pickle`` when that file exists;
    otherwise they are computed with ``shapley.calc_shapley_values`` and
    written to that file. The cache files are produced by this code itself
    and must not be replaced by pickles from an untrusted source.

    Args:
        X: Feature array; the process exits if it contains NaN.
        y: Target array for the decomposition.
        x_range: Feature indices, used both for the computation and as bar
            positions.
        labels: Tick labels for the x axis.
        kind: Tag used in the cache filename (``"expl"``, ``"pred"``,
            ``"res"``).
        modelname: Model identifier used in the cache filename.
        title: Plot title.
    """
    _, ax = plt.subplots()

    if np.isnan(X).any():
        print("Data contains nan. Exiting")
        sys.exit()

    d = len(x_range)
    for _n, _cf in enumerate(["dcor", "r2", "aidc"]):
        print(_cf)
        _sfilename = "results/shapley_{0}_{1}_{2}.pickle".format(
            kind, _cf, modelname)

        if os.path.isfile(_sfilename):
            with open(_sfilename, 'rb') as _f:
                _shapley_values = pickle.load(_f)
        else:
            _shapley_values = shapley.calc_shapley_values(X, y, x_range, _cf)
            with open(_sfilename, 'wb') as _f:
                pickle.dump(_shapley_values, _f)

        print(_shapley_values)
        # Shift each series by 0.1 so the three measures sit side by side.
        plt.bar(x_range + 0.1 * _n * np.ones(d),
                _shapley_values,
                alpha=0.5,
                label=_cf,
                width=0.1)

    plt.title(title)
    plt.legend()
    ax.set_xticks(x_range)
    ax.set_xticklabels(labels, rotation=90)
    plt.draw()


def do_shapley(modelname, preds, labels):
    """Plot Shapley decompositions for the target, predictions and residuals.

    Three figures are produced, titled "Target", "Predictions" and
    "Residuals". Reads the module-level names ``X_shapley``, ``y_shapley``,
    ``X_test``, ``y_test`` and ``shapley_features``.

    Args:
        modelname: Identifier used in the names of the cache files under
            ``results/``.
        preds: Model predictions for ``X_test``.
        labels: Feature labels for the "Target" plot; also pickled to
            ``results/shapley_features_<modelname>.pickle`` if that file
            does not exist yet.
    """
    _sfilename = f"results/shapley_features_{modelname}.pickle"
    if not os.path.isfile(_sfilename):
        with open(_sfilename, 'wb') as _f:
            pickle.dump(labels, _f)

    # --- On data
    d = X_shapley.shape[1]
    x_range = list(range(d))
    _plot_shapley_bars(X_shapley, y_shapley, x_range, labels, "expl",
                       modelname, "Target")

    # --- On predictions
    X_pred_frame = X_test.copy()
    X_pred_frame["sex_isFemale"] = [
        1 if _x else 0 for _x in X_pred_frame["sex_isFemale"]
    ]
    # Select the Shapley features first and take the labels from the
    # selection, so tick labels match the plotted columns in number and
    # order (previously they came from all columns of X_test).
    X_pred_frame = X_pred_frame[shapley_features]
    labels = X_pred_frame.columns
    X_shapley_pred = np.array(X_pred_frame)
    _plot_shapley_bars(X_shapley_pred, preds, x_range, labels, "pred",
                       modelname, "Predictions")

    # --- On residuals
    residuals = y_test - preds
    print(X_shapley_pred.shape)
    print(residuals.shape)
    _plot_shapley_bars(X_shapley_pred, residuals, x_range, labels, "res",
                       modelname, "Residuals")