Ejemplo n.º 1
class Explanation(object, metaclass=MetaExplanation):
    """ A slicable set of parallel arrays representing a SHAP explanation.
    def __init__(self,
        self.op_history = []

        # cloning. TODO: better cloning :)
        if issubclass(type(values), Explanation):
            e = values
            values = e.values
            base_values = e.base_values
            data = e.data

        output_dims = compute_output_dims(values, base_values, data)

        if len(
        ) == 1:  # TODO: should always be an alias once slicer supports per-row aliases
            values_shape = _compute_shape(values)
            if len(values_shape) >= 1 and len(
                    feature_names) == values_shape[0]:
                feature_names = Alias(list(feature_names), 0)
            elif len(values_shape) >= 2 and len(
                    feature_names) == values_shape[1]:
                feature_names = Alias(list(feature_names), 1)

        if len(
        ) == 1:  # TODO: should always be an alias once slicer supports per-row aliases
            values_shape = _compute_shape(values)
            if len(values_shape) >= 1 and len(output_names) == values_shape[0]:
                output_names = Alias(list(output_names), 0)
            elif len(values_shape) >= 2 and len(
                    output_names) == values_shape[1]:
                output_names = Alias(list(output_names), 1)

        self._s = Slicer(
            base_values=None if base_values is None else Obj(
                base_values, [0] + list(output_dims)),
            instance_names=None if instance_names is None else Alias(
                instance_names, 0),
            output_names,  # None if output_names is None else Alias(output_names, output_dims),
            output_indexes=None if output_indexes is None else
            (output_dims, output_indexes),
            clustering=None if clustering is None else Obj(clustering, [0]))

    @property
    def shape(self):
        """ Return ``_compute_shape`` applied to the slicer's ``values`` field.

        Exposed as a property because the rest of the class reads it as an
        attribute (e.g. ``self.shape[0]`` in ``__len__``).
        """
        return _compute_shape(self._s.values)

    @property
    def values(self):
        """ The ``values`` field stored on the underlying slicer.
        """
        return self._s.values

    @values.setter
    def values(self, new_values):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.values = new_values

    @property
    def base_values(self):
        """ The ``base_values`` field stored on the underlying slicer.
        """
        return self._s.base_values

    @base_values.setter
    def base_values(self, new_base_values):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.base_values = new_base_values

    @property
    def data(self):
        """ The ``data`` field stored on the underlying slicer.
        """
        return self._s.data

    @data.setter
    def data(self, new_data):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.data = new_data

    @property
    def display_data(self):
        """ The ``display_data`` field stored on the underlying slicer.
        """
        return self._s.display_data

    @display_data.setter
    def display_data(self, new_display_data):
        # DataFrames are unwrapped to their underlying array before storing.
        if isinstance(new_display_data, pd.DataFrame):
            new_display_data = new_display_data.values
        self._s.display_data = new_display_data

    @property
    def instance_names(self):
        """ The ``instance_names`` field stored on the underlying slicer (read-only).
        """
        return self._s.instance_names

    @property
    def output_names(self):
        """ The ``output_names`` field stored on the underlying slicer.
        """
        return self._s.output_names

    @output_names.setter
    def output_names(self, new_output_names):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.output_names = new_output_names

    @property
    def output_indexes(self):
        """ The ``output_indexes`` field stored on the underlying slicer (read-only).
        """
        return self._s.output_indexes

    @property
    def feature_names(self):
        """ The ``feature_names`` field stored on the underlying slicer.
        """
        return self._s.feature_names

    @feature_names.setter
    def feature_names(self, new_feature_names):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.feature_names = new_feature_names

    @property
    def lower_bounds(self):
        """ The ``lower_bounds`` field stored on the underlying slicer (read-only).
        """
        return self._s.lower_bounds

    @property
    def upper_bounds(self):
        """ The ``upper_bounds`` field stored on the underlying slicer (read-only).
        """
        return self._s.upper_bounds

    @property
    def main_effects(self):
        """ The ``main_effects`` field stored on the underlying slicer.
        """
        return self._s.main_effects

    @main_effects.setter
    def main_effects(self, new_main_effects):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.main_effects = new_main_effects

    @property
    def hierarchical_values(self):
        """ The ``hierarchical_values`` field stored on the underlying slicer.
        """
        return self._s.hierarchical_values

    @hierarchical_values.setter
    def hierarchical_values(self, new_hierarchical_values):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.hierarchical_values = new_hierarchical_values

    @property
    def clustering(self):
        """ The ``clustering`` field stored on the underlying slicer.
        """
        return self._s.clustering

    @clustering.setter
    def clustering(self, new_clustering):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.clustering = new_clustering

    def cohorts(self, cohorts):
        """ Split this explanation into several cohorts.

        Parameters
        ----------
        cohorts : int or array
            If this is an integer then we auto build that many cohorts using a decision tree. If this is
            an array then we treat that as an array of cohort names/ids for each instance.

        Raises
        ------
        Exception
            If ``cohorts`` is neither an int nor a list/tuple/numpy array.
        """
        if isinstance(cohorts, int):
            return _auto_cohorts(self, max_cohorts=cohorts)
        if isinstance(cohorts, (list, tuple, np.ndarray)):
            cohorts = np.array(cohorts)
            # one sub-explanation per distinct cohort label, selected by boolean mask
            return Cohorts(
                **{name: self[cohorts == name]
                   for name in np.unique(cohorts)})
        raise Exception(
            "The given set of cohort indicators is not recognized! Please give an array or int."
        )

    def __repr__(self):
        out = ".values =\n" + self.values.__repr__()
        if self.base_values is not None:
            out += "\n\n.base_values =\n" + self.base_values.__repr__()
        if self.data is not None:
            out += "\n\n.data =\n" + self.data.__repr__()
        return out

    def __getitem__(self, item):
        """ This adds support for magic string indexes like "rank(0)".
        if not isinstance(item, tuple):
            item = (item, )

        # convert any OpChains or magic strings
        for i, t in enumerate(item):
            orig_t = t
            if issubclass(type(t), OpChain):
                t = t.apply(self)
                if issubclass(
                     np.int32)):  # because slicer does not like numpy indexes
                    t = int(t)
                elif issubclass(type(t), np.ndarray):
                    t = [int(v) for v in t
                         ]  # slicer wants lists not numpy arrays for indexing
            elif issubclass(type(t), Explanation):
                t = t.values
            elif type(t) is str:
                if is_1d(self.feature_names):
                    ind = np.where(np.array(self.feature_names) == t)[0][0]
                    t = int(ind)
                    new_values = []
                    new_base_values = []
                    new_data = []
                    if self.output_names is not None and (
                            self.output_names.ndim >= 2
                            or self.output_names.shape[0] >= 2):
                        new_self = copy.deepcopy(self)
                        for i in range(len(self.values)):
                            for j in range(len(self.output_names[i])):
                                s = self.output_names[i][j]
                                if s == t:
                                        np.array(self.values[i][:, j]))
                        new_self = copy.deepcopy(self)
                        new_self.values = np.array(new_values)
                        new_self.base_values = np.array(new_base_values)
                        new_self.data = np.array(new_data)
                        new_self.output_names = t
                        new_self.feature_names = np.array(new_data)
                        new_self.clustering = None
                        for i in range(len(self.values)):
                            for s, v, d in zip(self.feature_names[i],
                                               self.values[i], self.data[i]):
                                if s == t:
                        new_self = copy.deepcopy(self)
                        new_self.values = new_values
                        new_self.data = new_data
                        new_self.feature_names = t
                        new_self.clustering = None
                    return new_self
            if issubclass(type(t), (np.int8, np.int16, np.int32, np.int64)):
                t = int(t)

            if t is not orig_t:
                tmp = list(item)
                tmp[i] = t
                item = tuple(tmp)

        # call slicer for the real work
        new_self = copy.copy(self)
        new_self._s = self._s.__getitem__(item)
            "name": "__getitem__",
            "args": (item, ),
            "prev_shape": self.shape

        return new_self

    def __len__(self):
        return self.shape[0]

    def __copy__(self):
        """ Build a new Explanation from this one's fields, with a shallow copy of ``op_history``.
        """
        # positional order must match the Explanation constructor
        field_values = (
            self.values,
            self.base_values,
            self.data,
            self.display_data,
            self.instance_names,
            self.feature_names,
            self.output_names,
            self.output_indexes,
            self.lower_bounds,
            self.upper_bounds,
            self.main_effects,
            self.hierarchical_values,
            self.clustering,
        )
        duplicate = Explanation(*field_values)
        duplicate.op_history = copy.copy(self.op_history)
        return duplicate

    def _apply_binary_operator(self, other, binary_op, op_name):
        """ Apply ``binary_op`` to a copy of this explanation and ``other``.

        Parameters
        ----------
        other : Explanation or any operand accepted by ``binary_op``
            When it is an Explanation the operator is applied field by field
            (values, data, base_values); otherwise ``other`` itself is used as
            the right-hand operand for every field.
        binary_op : callable
            Called as ``binary_op(field, operand)``.
        op_name : str
            Name recorded in the new explanation's ``op_history``.

        Returns
        -------
        A new Explanation; ``self`` is not modified.
        """
        new_exp = self.__copy__()
        new_exp.op_history = copy.copy(self.op_history)
        new_exp.op_history.append({
            "name": op_name,
            "args": (other, ),
            "prev_shape": self.shape
        })
        if isinstance(other, Explanation):
            new_exp.values = binary_op(new_exp.values, other.values)
            if new_exp.data is not None:
                new_exp.data = binary_op(new_exp.data, other.data)
            if new_exp.base_values is not None:
                new_exp.base_values = binary_op(new_exp.base_values,
                                                other.base_values)
        else:
            new_exp.values = binary_op(new_exp.values, other)
            if new_exp.data is not None:
                new_exp.data = binary_op(new_exp.data, other)
            if new_exp.base_values is not None:
                new_exp.base_values = binary_op(new_exp.base_values, other)
        return new_exp

    def __add__(self, other):
        return self._apply_binary_operator(other, operator.add, "__add__")

    def __radd__(self, other):
        return self._apply_binary_operator(other, operator.add, "__add__")

    def __sub__(self, other):
        return self._apply_binary_operator(other, operator.sub, "__sub__")

    def __rsub__(self, other):
        return self._apply_binary_operator(other, operator.sub, "__sub__")

    def __mul__(self, other):
        return self._apply_binary_operator(other, operator.mul, "__mul__")

    def __rmul__(self, other):
        return self._apply_binary_operator(other, operator.mul, "__mul__")

    def __truediv__(self, other):
        return self._apply_binary_operator(other, operator.truediv,

    def abs(self):
        """ Return a copy whose values are replaced by their absolute value.

        NOTE(review): a second ``abs`` defined later in this class rebinds the
        name, so this definition is not the one callers reach.
        """
        result = copy.copy(self)
        result.values = np.abs(result.values)
        history_entry = {"name": "abs", "prev_shape": self.shape}
        result.op_history.append(history_entry)
        return result

    def _numpy_func(self, fname, **kwargs):
        new_self = copy.copy(self)
        axis = kwargs.get("axis", None)

        # collapse the slicer to right shape
        if axis == 0:
            new_self = new_self[0]
        elif axis == 1:
            new_self = new_self[1]
        elif axis == 2:
            new_self = new_self[2]
        if axis in [0, 1, 2]:
            new_self.op_history = new_self.op_history[:
                                                      -1]  # pop off the slicing operation we just used

        if self.feature_names is not None and not is_1d(
                self.feature_names) and axis == 0:
            new_values = self._flatten_feature_names()
            new_self.feature_names = np.array(list(new_values.keys()))
            new_self.values = np.array(
                [getattr(np, fname)(v, 0) for v in new_values.values()])
            new_self.clustering = None
            new_self.values = getattr(np, fname)(np.array(self.values),
            if new_self.data is not None:
                    new_self.data = getattr(np, fname)(np.array(self.data),
                    new_self.data = None
            if new_self.base_values is not None and issubclass(
                    type(axis), int) and len(self.base_values.shape) > axis:
                new_self.base_values = getattr(np, fname)(self.base_values,
            elif issubclass(type(axis), int):
                new_self.base_values = None

        if axis == 0 and self.clustering is not None and len(
                self.clustering.shape) == 3:
            if self.clustering.std(0).sum() < 1e-8:
                new_self.clustering = self.clustering[0]
                new_self.clustering = None

            "name": fname,
            "kwargs": kwargs,
            "prev_shape": self.shape,
            "collapsed_instances": axis == 0

        return new_self

    def mean(self, axis):
        """ Apply numpy's ``mean`` along ``axis`` through ``_numpy_func``.
        """
        return self._numpy_func(fname="mean", axis=axis)

    def max(self, axis):
        """ Apply numpy's ``max`` along ``axis`` through ``_numpy_func``.
        """
        return self._numpy_func(fname="max", axis=axis)

    def min(self, axis):
        """ Apply numpy's ``min`` along ``axis`` through ``_numpy_func``.
        """
        return self._numpy_func(fname="min", axis=axis)

    def sum(self, axis=None, grouping=None):
        """ Sum along ``axis``, or sum features into groups when ``grouping`` is given.

        Parameters
        ----------
        axis : int or None
            Axis forwarded to ``_numpy_func`` when no grouping is given.
        grouping : object or None
            Passed to ``group_features`` together with this explanation.

        Raises
        ------
        Exception
            If ``grouping`` is given and neither ``axis == 1`` nor a 1D
            explanation applies.
        """
        if grouping is None:
            return self._numpy_func("sum", axis=axis)
        if axis == 1 or len(self.shape) == 1:
            return group_features(self, grouping)
        # previously unreachable (it sat after the return above), so this case
        # silently returned None
        raise Exception(
            "Only axis = 1 is supported for grouping right now...")

    # def reshape(self, *args):
    #     return self._numpy_func("reshape", newshape=args)

    def abs(self):
        """ Apply numpy's ``abs`` through ``_numpy_func`` (no axis).
        """
        return self._numpy_func(fname="abs")

    def identity(self):
        """ Return this very object, unchanged (no copy is made).
        """
        same_object = self
        return same_object

    def argsort(self):
        """ Apply numpy's ``argsort`` through ``_numpy_func`` (no axis).
        """
        return self._numpy_func(fname="argsort")

    def flip(self):
        """ Apply numpy's ``flip`` through ``_numpy_func`` (no axis).
        """
        return self._numpy_func(fname="flip")

    def hclust(self, metric="sqeuclidean", axis=0):
        """ Computes an optimal leaf ordering sort order using hclustering.

        Parameters
        ----------
        metric : string
            A metric supported by scipy clustering.

        axis : int
            The axis to cluster along.

        Returns
        -------
        The leaf order of a complete-linkage clustering after optimal leaf ordering.

        Raises
        ------
        Exception
            If ``self.values`` is not 2D.
        """
        values = self.values

        if len(values.shape) != 2:
            raise Exception(
                "The hclust order only supports 2D arrays right now!")

        # pdist works on rows, so clustering along axis 1 means transposing first
        if axis == 1:
            values = values.T

        # compute a hierarchical clustering and return the optimal leaf ordering
        D = sp.spatial.distance.pdist(values, metric)
        cluster_matrix = sp.cluster.hierarchy.complete(D)
        inds = sp.cluster.hierarchy.leaves_list(
            sp.cluster.hierarchy.optimal_leaf_ordering(cluster_matrix, D))
        return inds

    def sample(self, max_samples, replace=False, random_state=0):
        """ Randomly samples the instances (rows) of the Explanation object.

        Parameters
        ----------
        max_samples : int
            The number of rows to sample. Note that fewer than max_samples rows
            are returned when explanation.shape[0] < max_samples, because the
            sample size is capped at the number of rows.
        replace : bool
            Sample with or without replacement.
        random_state : int
            Seed for the random draw; the same seed gives the same rows.
        """
        n_rows = self.shape[0]
        # A private RandomState gives the same draw as seeding the global
        # generator, without clobbering the caller's global numpy RNG state
        # (np.random.seed returns None, so the old "prev_seed" could never
        # restore it).
        rng = np.random.RandomState(random_state)
        inds = rng.choice(n_rows, min(max_samples, n_rows), replace=replace)
        return self[list(inds)]

    def _flatten_feature_names(self):
        new_values = {}
        for i in range(len(self.values)):
            for s, v in zip(self.feature_names[i], self.values[i]):
                if s not in new_values:
                    new_values[s] = []
        return new_values

    def _use_data_as_feature_names(self):
        new_values = {}
        for i in range(len(self.values)):
            for s, v in zip(self.data[i], self.values[i]):
                if s not in new_values:
                    new_values[s] = []
        return new_values

    def percentile(self, q, axis=None):
        """ Return a deep copy with values (and data) reduced by ``np.percentile``.

        Parameters
        ----------
        q : float or array
            Percentile(s) forwarded to ``np.percentile``.
        axis : int or None
            Axis forwarded to ``np.percentile``. With ``axis == 0`` and
            feature names that are not 1D, values are first grouped by feature
            name and the percentile is taken per name.
        """
        new_self = copy.deepcopy(self)
        if self.feature_names is not None and not is_1d(
                self.feature_names) and axis == 0:
            new_values = self._flatten_feature_names()
            new_self.feature_names = np.array(list(new_values.keys()))
            new_self.values = np.array(
                [np.percentile(v, q) for v in new_values.values()])
            new_self.clustering = None
        else:
            new_self.values = np.percentile(new_self.values, q, axis)
            # data is optional elsewhere in this class, so skip it when absent
            if new_self.data is not None:
                new_self.data = np.percentile(new_self.data, q, axis)
        new_self.op_history.append({
            "name": "percentile",
            "args": (axis, ),
            "prev_shape": self.shape,
            "collapsed_instances": axis == 0
        })
        return new_self
Ejemplo n.º 2
class Explanation(object, metaclass=MetaExplanation):
    """ This is currently an experimental feature don't depend on this object yet! :)
    def __init__(self,
        self.transform_history = []

        # cloning. TODO: better cloning :)
        if issubclass(type(values), Explanation):
            e = values
            values = e.values
            base_values = e.base_values
            data = e.data

        output_dims = compute_output_dims(values, base_values, data)

        if len(
        ) == 1:  # TODO: should always be an alias once slicer supports per-row aliases
            values_shape = _compute_shape(values)
            if len(values_shape) >= 1 and len(
                    feature_names) == values_shape[0]:
                feature_names = Alias(feature_names, 0)
            elif len(values_shape) >= 2 and len(
                    feature_names) == values_shape[1]:
                feature_names = Alias(feature_names, 1)

        self._s = Slicer(
            instance_names=None if instance_names is None else Alias(
                instance_names, 0),
            output_names=None if output_names is None else Alias(
                output_names, output_dims),
            output_indexes=None if output_indexes is None else
            (output_dims, output_indexes),
            hierarchical_values,  #Obj(hierarchical_values, (0,None)),

    @property
    def shape(self):
        """ Return ``_compute_shape`` applied to the slicer's ``values`` field.

        Exposed as a property because the rest of the class reads it as an
        attribute (e.g. ``self.shape[0]`` in ``__len__``).
        """
        return _compute_shape(self._s.values)

    @property
    def values(self):
        """ The ``values`` field stored on the underlying slicer.
        """
        return self._s.values

    @values.setter
    def values(self, new_values):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.values = new_values

    @property
    def base_values(self):
        """ The ``base_values`` field stored on the underlying slicer.
        """
        return self._s.base_values

    @base_values.setter
    def base_values(self, new_base_values):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.base_values = new_base_values

    @property
    def data(self):
        """ The ``data`` field stored on the underlying slicer.
        """
        return self._s.data

    @data.setter
    def data(self, new_data):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.data = new_data

    @property
    def display_data(self):
        """ The ``display_data`` field stored on the underlying slicer.
        """
        return self._s.display_data

    @display_data.setter
    def display_data(self, new_display_data):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.display_data = new_display_data

    @property
    def instance_names(self):
        """ The ``instance_names`` field stored on the underlying slicer (read-only).
        """
        return self._s.instance_names

    @property
    def output_names(self):
        """ The ``output_names`` field stored on the underlying slicer (read-only).
        """
        return self._s.output_names

    @property
    def output_indexes(self):
        """ The ``output_indexes`` field stored on the underlying slicer (read-only).
        """
        return self._s.output_indexes

    @property
    def feature_names(self):
        """ The ``feature_names`` field stored on the underlying slicer.
        """
        return self._s.feature_names

    @feature_names.setter
    def feature_names(self, new_feature_names):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.feature_names = new_feature_names

    @property
    def lower_bounds(self):
        """ The ``lower_bounds`` field stored on the underlying slicer (read-only).
        """
        return self._s.lower_bounds

    @property
    def upper_bounds(self):
        """ The ``upper_bounds`` field stored on the underlying slicer (read-only).
        """
        return self._s.upper_bounds

    @property
    def main_effects(self):
        """ The ``main_effects`` field stored on the underlying slicer.
        """
        return self._s.main_effects

    @main_effects.setter
    def main_effects(self, new_main_effects):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.main_effects = new_main_effects

    @property
    def hierarchical_values(self):
        """ The ``hierarchical_values`` field stored on the underlying slicer.
        """
        return self._s.hierarchical_values

    @hierarchical_values.setter
    def hierarchical_values(self, new_hierarchical_values):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.hierarchical_values = new_hierarchical_values

    @property
    def clustering(self):
        """ The ``clustering`` field stored on the underlying slicer.
        """
        return self._s.clustering

    @clustering.setter
    def clustering(self, new_clustering):
        # Without the property/setter pair this second ``def`` would simply
        # replace the getter above.
        self._s.clustering = new_clustering

    def __repr__(self):
        out = ".values =\n" + self.values.__repr__()
        if self.base_values is not None:
            out += "\n\n.base_values =\n" + self.base_values.__repr__()
        if self.data is not None:
            out += "\n\n.data =\n" + self.data.__repr__()
        return out

    def __getitem__(self, item):
        """ This adds support for magic string indexes like "rank(0)".
        if not isinstance(item, tuple):
            item = (item, )

        # convert any OpChains or magic strings
        for i, t in enumerate(item):
            orig_t = t
            if issubclass(type(t), OpChain):
                t = t.apply(self)
                if issubclass(
                     np.int32)):  # because slicer does not like numpy indexes
                    t = int(t)
                elif issubclass(type(t), np.ndarray):
                    t = [int(v) for v in t
                         ]  # slicer wants lists not numpy arrays for indexing
            elif issubclass(type(t), Explanation):
                t = t.values
            elif type(t) is str:
                if is_1d(self.feature_names):
                    ind = np.where(np.array(self.feature_names) == t)[0][0]
                    t = int(ind)
                    new_values = []
                    new_data = []
                    for i in range(len(self.values)):
                        for s, v, d in zip(self.feature_names[i],
                                           self.values[i], self.data[i]):
                            if s == t:
                    new_self = copy.deepcopy(self)
                    new_self.values = new_values
                    new_self.data = new_data
                    new_self.feature_names = t
                    new_self.clustering = None
                    return new_self
            if issubclass(type(t), np.ndarray):
                t = [int(j) for j in t]
            elif issubclass(type(t), (np.int8, np.int16, np.int32, np.int64)):
                t = int(t)

            if t is not orig_t:
                tmp = list(item)
                tmp[i] = t
                item = tuple(tmp)

        # call slicer for the real work
        new_self = copy.copy(self)
        new_self.transform_history.append(("__getitem__", (item, )))
        new_self._s = self._s.__getitem__(item)

        return new_self

    def __len__(self):
        return self.shape[0]

    def __copy__(self):
        """ Build a new Explanation from this one's fields.
        """
        # positional order must match the Explanation constructor
        field_values = (
            self.values,
            self.base_values,
            self.data,
            self.display_data,
            self.instance_names,
            self.feature_names,
            self.output_names,
            self.output_indexes,
            self.lower_bounds,
            self.upper_bounds,
            self.main_effects,
            self.hierarchical_values,
            self.clustering,
        )
        return Explanation(*field_values)

    def abs(self):
        """ Return a copy whose values are replaced by their absolute value.

        NOTE(review): a second ``abs`` defined later in this class rebinds the
        name, so this definition is not the one callers reach.
        """
        result = copy.copy(self)
        result.values = np.abs(result.values)
        history_entry = ("abs", None)
        result.transform_history.append(history_entry)
        return result

    def _numpy_func(self, fname, **kwargs):
        new_self = copy.copy(self)
        axis = kwargs.get("axis", None)

        # collapse the slicer to right shape
        if axis == 0:
            new_self = new_self[0]
        elif axis == 1:
            new_self = new_self[1]
        elif axis == 2:
            new_self = new_self[2]

        if self.feature_names is not None and not is_1d(
                self.feature_names) and axis == 0:
            new_values = self._flatten_feature_names()
            new_self.feature_names = np.array(list(new_values.keys()))
            new_self.values = np.array(
                [getattr(np, fname)(v) for v in new_values.values()])
            new_self.clustering = None
            new_self.values = getattr(np, fname)(np.array(self.values),
            if new_self.data is not None:
                    new_self.data = getattr(np, fname)(np.array(self.data),
                    new_self.data = None
            if new_self.base_values is not None and issubclass(
                    type(axis), int) and len(self.base_values.shape) > axis:
                new_self.base_values = getattr(np, fname)(self.base_values,
            elif issubclass(type(axis), int):
                new_self.base_values = None

        if axis == 0 and self.clustering is not None and len(
                self.clustering.shape) == 3:
            if self.clustering.std(0).sum() < 1e-8:
                new_self.clustering = self.clustering[0]
                new_self.clustering = None

        new_self.transform_history.append((fname, kwargs))

        return new_self

    def mean(self, axis):
        """ Apply numpy's ``mean`` along ``axis`` through ``_numpy_func``.
        """
        return self._numpy_func(fname="mean", axis=axis)

    def max(self, axis):
        """ Apply numpy's ``max`` along ``axis`` through ``_numpy_func``.
        """
        return self._numpy_func(fname="max", axis=axis)

    def min(self, axis):
        """ Apply numpy's ``min`` along ``axis`` through ``_numpy_func``.
        """
        return self._numpy_func(fname="min", axis=axis)

    def sum(self, axis):
        """ Apply numpy's ``sum`` along ``axis`` through ``_numpy_func``.
        """
        return self._numpy_func(fname="sum", axis=axis)

    def abs(self):
        """ Apply numpy's ``abs`` through ``_numpy_func`` (no axis).
        """
        return self._numpy_func(fname="abs")

    def argsort(self):
        """ Apply numpy's ``argsort`` through ``_numpy_func`` (no axis).
        """
        return self._numpy_func(fname="argsort")

    def flip(self):
        """ Apply numpy's ``flip`` through ``_numpy_func`` (no axis).
        """
        return self._numpy_func(fname="flip")

    def hclust(self, metric="sqeuclidean", axis=0):
        """ Computes an optimal leaf ordering sort order using hclustering.

        Parameters
        ----------
        metric : string
            A metric supported by scipy clustering.

        axis : int
            The axis to cluster along.

        Returns
        -------
        The leaf order of a complete-linkage clustering after optimal leaf ordering.

        Raises
        ------
        Exception
            If ``self.values`` is not 2D.
        """
        values = self.values

        if len(values.shape) != 2:
            raise Exception(
                "The hclust order only supports 2D arrays right now!")

        # pdist works on rows, so clustering along axis 1 means transposing first
        if axis == 1:
            values = values.T

        # compute a hierarchical clustering and return the optimal leaf ordering
        D = sp.spatial.distance.pdist(values, metric)
        cluster_matrix = sp.cluster.hierarchy.complete(D)
        inds = sp.cluster.hierarchy.leaves_list(
            sp.cluster.hierarchy.optimal_leaf_ordering(cluster_matrix, D))
        return inds

    def sample(self, max_samples, replace=False, random_state=0):
        """ Randomly samples the instances (rows) of the Explanation object.

        Parameters
        ----------
        max_samples : int
            The number of rows to sample. Note that fewer than max_samples rows
            are returned when explanation.shape[0] < max_samples, because the
            sample size is capped at the number of rows.
        replace : bool
            Sample with or without replacement.
        random_state : int
            Seed for the random draw; the same seed gives the same rows.
        """
        n_rows = self.shape[0]
        # A private RandomState gives the same draw as seeding the global
        # generator, without clobbering the caller's global numpy RNG state
        # (np.random.seed returns None, so the old "prev_seed" could never
        # restore it).
        rng = np.random.RandomState(random_state)
        inds = rng.choice(n_rows, min(max_samples, n_rows), replace=replace)
        return self[list(inds)]

    def _flatten_feature_names(self):
        new_values = {}
        for i in range(len(self.values)):
            for s, v in zip(self.feature_names[i], self.values[i]):
                if s not in new_values:
                    new_values[s] = []
        return new_values

    def _use_data_as_feature_names(self):
        new_values = {}
        for i in range(len(self.values)):
            for s, v in zip(self.data[i], self.values[i]):
                if s not in new_values:
                    new_values[s] = []
        return new_values

    def percentile(self, q, axis=None):
        """ Return a deep copy with values (and data) reduced by ``np.percentile``.

        Parameters
        ----------
        q : float or array
            Percentile(s) forwarded to ``np.percentile``.
        axis : int or None
            Axis forwarded to ``np.percentile``. With ``axis == 0`` and
            feature names that are not 1D, values are first grouped by feature
            name and the percentile is taken per name.
        """
        new_self = copy.deepcopy(self)
        if self.feature_names is not None and not is_1d(
                self.feature_names) and axis == 0:
            new_values = self._flatten_feature_names()
            new_self.feature_names = np.array(list(new_values.keys()))
            new_self.values = np.array(
                [np.percentile(v, q) for v in new_values.values()])
            new_self.clustering = None
        else:
            # without this ``else`` the grouped values above were immediately
            # reduced a second time
            new_self.values = np.percentile(new_self.values, q, axis)
            # data is optional elsewhere in this class, so skip it when absent
            if new_self.data is not None:
                new_self.data = np.percentile(new_self.data, q, axis)
        new_self.transform_history.append(("percentile", (axis, )))
        return new_self