def get_data(kind='array', n_rows=15, n_cols=49, fnames=None, seed=None):
    """
    Generates random data with a specified type for the purposes
    of testing grouping functionality of the wrapper.
    """

    np.random.seed(seed)

    if kind not in SUPPORTED_BACKGROUND_DATA_TYPES:
        msg = "Selected data type, {}, is not an allowed type. " \
              "Allowed types are {}"
        raise ValueError(msg.format(kind, SUPPORTED_BACKGROUND_DATA_TYPES))

    X = get_random_matrix(n_rows=n_rows, n_cols=n_cols)

    if kind == 'array':
        return X
    elif kind == 'sparse':
        return scipy.sparse.csr_matrix(X)
    elif kind == 'frame' or kind == 'series':
        if not fnames:
            fnames = ['feature_{}'.format(i) for i in range(X.shape[-1])]
        if kind == 'frame':
            return pd.DataFrame(data=X, columns=fnames)
        else:
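            # draw a single random row and return it as a pd.Series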
            idx = np.random.choice(np.arange(X.shape[0]))
            return pd.DataFrame(data=X, columns=fnames).iloc[idx, :]
    elif kind == 'data':
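        # DenseData bundles the matrix with its group names; it is the grouped
        # background data container used by the shap library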
        if not fnames:
            group_names = ['feature_{}'.format(i) for i in range(X.shape[-1])]
        else:
            group_names = fnames
        return DenseData(X, group_names)
    else:
        return 0


def test_sum_categories(n_feats, feat_enc_dim, start_idx):
    """
    Tests if summing the columns corresponding to categorical
    variables into one variable works properly.
    """

    # create inputs to feed the function
    X = get_random_matrix(n_cols=n_feats)

    # check a ValueError is raised if start indices or
    # encoding lengths are not provided
    if feat_enc_dim is None or start_idx is None:
        with pytest.raises(ValueError) as exc_info:
            sum_categories(X, start_idx, feat_enc_dim)
        assert exc_info.type is ValueError
    elif len(feat_enc_dim) != len(start_idx):
        with pytest.raises(ValueError) as exc_info:
            sum_categories(X, start_idx, feat_enc_dim)
        assert exc_info.type is ValueError

    # check a ValueError is raised if the sum of the encoding lengths
    # exceeds the number of columns
    elif sum(feat_enc_dim) > n_feats:
        with pytest.raises(ValueError) as exc_info:
            sum_categories(X, start_idx, feat_enc_dim)
        assert exc_info.type is ValueError

    # check that if inputs are correct, we retrieve the sum in the correct col
    else:
        summ_X = sum_categories(X, start_idx, feat_enc_dim)
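        # each encoded group of enc_dim columns collapses into a single column,
        # so the output loses sum(feat_enc_dim) columns and gains len(feat_enc_dim)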
        assert summ_X.shape[1] == X.shape[1] - sum(feat_enc_dim) + len(
            feat_enc_dim)
        for i, enc_dim in enumerate(feat_enc_dim):
            # work out the index of the summed column in the returned matrix
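            # earlier groups (assuming start indices are in ascending order) have
            # each collapsed to a single column, shifting the start index left
            # by sum(feat_enc_dim[:i]) - i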
            sum_col_idx = start_idx[i] - sum(feat_enc_dim[:i]) + len(
                feat_enc_dim[:i])
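            # the summed column should equal the row-wise sum of the original
            # encoded block of columns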
            diff = summ_X[:, sum_col_idx] - np.sum(
                X[:, start_idx[i]:start_idx[i] + feat_enc_dim[i]], axis=1)
            assert diff.sum() == 0.0


def test_rank_by_importance(mock_ks_explainer, data_dimension):
    """
    Tests the feature effects ranking function.
    """
    def get_column_ranks(X, ascending=False):
        """
        Ranks the columns of X according to the average magnitude value
        and returns an array of ranking indices and an array of
        values sorted according to the ranking.
        """

        avg_mag = np.mean(np.abs(X), axis=0)
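        # argsort gives column indices in ascending order of average magnitude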
        rank = np.argsort(avg_mag)
        if ascending:
            return rank, avg_mag[rank]
        else:
            return rank[::-1], avg_mag[rank][::-1]

    # setup explainer
    n_samples, n_features = data_dimension
    explainer = mock_ks_explainer
    explainer.feature_names = gen_group_names(n_features)

    # create inputs
    n_outs = explainer.predictor.out_dim
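    # simulate one matrix of shap values per model output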
    shap_values = [
        get_random_matrix(n_rows=n_samples, n_cols=n_features)
        for _ in range(n_outs)
    ]

    # compute desired values
    exp_ranked_effects_class = {}
    expected_feat_names_order = {}
    ranks_and_vals = [
        get_column_ranks(class_shap_vals) for class_shap_vals in shap_values
    ]
    ranks, vals = list(zip(*ranks_and_vals))
    for i, values in enumerate(vals):
        exp_ranked_effects_class[str(i)] = values
        expected_feat_names_order[str(i)] = [
            explainer.feature_names[k] for k in ranks[i]
        ]
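    # the aggregated ranking is computed from shap values summed over all outputs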
    aggregate_shap = np.sum(shap_values, axis=0)
    exp_aggregate_rank, exp_ranked_effects_aggregate = get_column_ranks(
        aggregate_shap)
    exp_aggregate_names = [
        explainer.feature_names[k] for k in exp_aggregate_rank
    ]

    # check results
    importances = explainer.rank_by_importance(shap_values)
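    # expect one entry per model output plus the 'aggregated' entry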
    assert len(importances.keys()) == n_outs + 1
    for key in importances:
        if key != 'aggregated':
            assert_allclose(importances[key]['ranked_effect'],
                            exp_ranked_effects_class[key])
            assert importances[key]['names'] == expected_feat_names_order[key]
        else:
            assert_allclose(importances[key]['ranked_effect'],
                            exp_ranked_effects_aggregate)
            assert importances[key]['names'] == exp_aggregate_names