def get_data(kind='array', n_rows=15, n_cols=49, fnames=None, seed=None):
    """
    Build a random dataset in the container type requested by `kind`, for
    exercising the grouping functionality of the wrapper in tests.

    Parameters
    ----------
    kind
        Requested container type. Must be a member of
        `SUPPORTED_BACKGROUND_DATA_TYPES`. The kinds handled explicitly here
        are ``'array'``, ``'sparse'``, ``'frame'``, ``'series'`` and ``'data'``.
    n_rows, n_cols
        Forwarded to `get_random_matrix` to size the underlying matrix.
    fnames
        Column/group names. When falsy, names of the form ``feature_<i>`` are
        generated, one per entry along the last axis of the matrix.
    seed
        Passed to `np.random.seed`; note that this reseeds numpy's *global*
        random state.

    Returns
    -------
    Depending on `kind`: the raw matrix (``'array'``), a
    `scipy.sparse.csr_matrix` (``'sparse'``), a `pandas.DataFrame`
    (``'frame'``), a single randomly chosen row of that frame (``'series'``)
    or a `DenseData` object (``'data'``). A supported kind with no dedicated
    branch yields ``0``.

    Raises
    ------
    ValueError
        If `kind` is not in `SUPPORTED_BACKGROUND_DATA_TYPES`.
    """

    # Seeding happens before validation, exactly as callers have come to expect.
    np.random.seed(seed)

    if kind not in SUPPORTED_BACKGROUND_DATA_TYPES:
        raise ValueError(
            "Selected data type, {}, is not an allowed type. "
            "Allowed types are {}".format(kind, SUPPORTED_BACKGROUND_DATA_TYPES)
        )

    # assumes get_random_matrix returns a 2-D numpy array - TODO confirm
    X = get_random_matrix(n_rows=n_rows, n_cols=n_cols)

    if kind == 'array':
        return X

    if kind == 'sparse':
        return scipy.sparse.csr_matrix(X)

    if kind == 'frame' or kind == 'series':
        columns = fnames or [
            'feature_{}'.format(col) for col in range(X.shape[-1])
        ]
        if kind == 'frame':
            return pd.DataFrame(data=X, columns=columns)
        # The row index is drawn before the frame is built so that the random
        # stream is consumed in the same order as before.
        row = np.random.choice(np.arange(X.shape[0]))
        return pd.DataFrame(data=X, columns=columns).iloc[row, :]

    if kind == 'data':
        group_names = fnames or [
            'feature_{}'.format(col) for col in range(X.shape[-1])
        ]
        return DenseData(X, group_names)

    # Supported kind without a dedicated branch.
    return 0
def test_sum_categories(n_feats, feat_enc_dim, start_idx):
    """
    Tests if summing the columns corresponding to categorical variables into
    one variable works properly.

    Parameters
    ----------
    n_feats
        Number of columns of the random input matrix.
    feat_enc_dim
        Encoding length of each categorical variable, or `None`.
    start_idx
        Start column of each categorical variable's encoding, or `None`.

    The arguments are presumably supplied by pytest parametrization or
    fixtures that are not visible in this part of the file.

    A `ValueError` is expected from `sum_categories` when either argument is
    missing, when the two sequences differ in length, or when the encodings
    span more columns than the input has. Otherwise the output width and the
    content of every summed column are verified.
    """

    # create inputs to feed the function
    X = get_random_matrix(n_cols=n_feats)

    # `or` short-circuits, so len()/sum() are never evaluated on a missing
    # argument and the three failure modes keep their original precedence:
    #   1. start indices or encoding lengths are not provided
    #   2. the two sequences have different lengths
    #   3. the sum of encoding lengths exceeds the number of columns
    expect_value_error = (
        feat_enc_dim is None
        or start_idx is None
        or len(feat_enc_dim) != len(start_idx)
        or sum(feat_enc_dim) > n_feats
    )

    if expect_value_error:
        with pytest.raises(ValueError) as exc_info:
            sum_categories(X, start_idx, feat_enc_dim)
        # Checked after the context manager exits: a statement placed after
        # the raising call inside the `with` body would never execute.
        assert exc_info.type is ValueError
        return

    # check that if inputs are correct, we retrieve the sum in the correct col
    summ_X = sum_categories(X, start_idx, feat_enc_dim)
    n_encoded = len(feat_enc_dim)
    assert summ_X.shape[1] == X.shape[1] - sum(feat_enc_dim) + n_encoded

    for i, (start, enc_dim) in enumerate(zip(start_idx, feat_enc_dim)):
        # work out the index of the summed column in the returned matrix:
        # each of the `i` preceding encodings collapsed to a single column
        sum_col_idx = start - sum(feat_enc_dim[:i]) + i
        expected = np.sum(X[:, start:start + enc_dim], axis=1)
        # Element-wise comparison: asserting that the *sum* of the differences
        # is zero would let positive and negative errors cancel out.
        assert_allclose(summ_X[:, sum_col_idx], expected)
def test_rank_by_importance(mock_ks_explainer, data_dimension):
    """
    Tests the feature effects ranking function.

    Random per-output shap value matrices are generated, the expected
    ranking is computed independently for each output and for their sum,
    and the result of `explainer.rank_by_importance` is compared against it.
    """

    def get_column_ranks(X, ascending=False):
        """
        Rank the columns of `X` by their mean absolute value.

        Returns the array of column indices in rank order together with the
        mean magnitudes arranged in that same order. The order is descending
        unless `ascending` is set.
        """
        avg_mag = np.mean(np.abs(X), axis=0)
        order = np.argsort(avg_mag)
        if not ascending:
            order = order[::-1]
        return order, avg_mag[order]

    # setup explainer
    n_samples, n_features = data_dimension
    explainer = mock_ks_explainer
    explainer.feature_names = gen_group_names(n_features)

    # create inputs: one random matrix per predictor output
    n_outs = explainer.predictor.out_dim
    shap_values = []
    for _ in range(n_outs):
        shap_values.append(
            get_random_matrix(n_rows=n_samples, n_cols=n_features)
        )

    # compute desired values, per output ...
    per_output = [get_column_ranks(class_vals) for class_vals in shap_values]
    ranks, vals = zip(*per_output)
    exp_ranked_effects_class = {
        str(i): effects for i, effects in enumerate(vals)
    }
    expected_feat_names_order = {
        str(i): [explainer.feature_names[k] for k in rank]
        for i, rank in enumerate(ranks)
    }

    # ... and for the element-wise sum over all outputs
    aggregate_shap = np.sum(shap_values, axis=0)
    exp_aggregate_rank, exp_ranked_effects_aggregate = get_column_ranks(
        aggregate_shap)
    exp_aggregate_names = [
        explainer.feature_names[k] for k in exp_aggregate_rank
    ]

    # check results
    importances = explainer.rank_by_importance(shap_values)
    assert len(importances.keys()) == n_outs + 1

    for key in importances:
        if key == 'aggregated':
            want_effects = exp_ranked_effects_aggregate
            want_names = exp_aggregate_names
        else:
            want_effects = exp_ranked_effects_class[key]
            want_names = expected_feat_names_order[key]
        assert_allclose(importances[key]['ranked_effect'], want_effects)
        assert importances[key]['names'] == want_names