Esempio n. 1
0
    def model_profile(self,
                      type=('partial', 'accumulated', 'conditional'),
                      N=500,
                      variables=None,
                      variable_type='numerical',
                      groups=None,
                      span=0.25,
                      grid_points=101,
                      intercept=True,
                      processes=1,
                      random_state=None,
                      verbose=True):
        """Dataset Level Variable Effect as Partial Dependency Profile or Accumulated Local Effects

        Samples up to `N` rows from `self.data`, fits a `CeterisParibus` explanation on
        that sample and aggregates it with `AggregatedProfiles`.

        :param type: either partial/conditional/accumulated for partial dependence, conditional profiles of accumulated local effects (validated by `check_method_type`)
        :param N: number of observations used for calculation of the profiles. By default, 500 observations will be chosen randomly (without replacement) from the whole `data`. If None then all observations will be used. Values larger than the number of rows are capped.
        :param variables: str or list or numpy.ndarray or pandas.Series, if not None then aggregate only for selected variables will be calculated, if None all will be selected
        :param variable_type: If "numerical" then only numerical variables will be calculated. If "categorical" then only categorical variables will be calculated.
        :param groups: str or list or numpy.ndarray or pandas.Series, a variable names that will be used for grouping
        :param span: smoothing coefficient, by default 0.25. It's the sd for gaussian kernel
        :param grid_points: number of points for profile
        :param intercept: False if center data on 0 (forwarded to `AggregatedProfiles`)
        :param processes: integer, number of parallel processes, iterated over variables
        :param random_state: int, seed for random number generator; when given it seeds the global numpy generator and is also forwarded to `AggregatedProfiles`
        :param verbose: print tqdm progress bar
        :return: fitted AggregatedProfiles object
        """

        types = ('partial', 'accumulated', 'conditional')
        type = check_method_type(type, types)

        n_rows = self.data.shape[0]
        N = n_rows if N is None else min(N, n_rows)

        if random_state is not None:
            np.random.seed(random_state)

        # Sample row positions from the WHOLE dataset. Drawing from
        # np.arange(N) would only ever shuffle the first N rows and bias
        # the profiles towards the head of the data.
        sample_idx = np.random.choice(np.arange(n_rows), N, replace=False)

        ceteris_paribus = CeterisParibus(grid_points=grid_points,
                                         processes=processes)
        ceteris_paribus.fit(self,
                            self.data.iloc[sample_idx, :],
                            self.y[sample_idx],
                            verbose=verbose)

        model_profile_ = AggregatedProfiles(type=type,
                                            variables=variables,
                                            variable_type=variable_type,
                                            groups=groups,
                                            span=span,
                                            intercept=intercept,
                                            random_state=random_state)

        model_profile_.fit(ceteris_paribus, verbose)

        return model_profile_
Esempio n. 2
0
    def predict_profile(self,
                        new_observation,
                        type=('ceteris_paribus', ),
                        y=None,
                        variables=None,
                        grid_points=101,
                        variable_splits=None,
                        processes=1):
        """Build and fit a CeterisParibus explanation for the given observations

        :param new_observation: DataFrame with observations for which the profiles will be calculated
        :param type: kind of profile to compute; 'ceteris_paribus' is the only supported value (checked by `check_method_type`)
        :param y: pandas Series with labels for the observations
        :param variables: variables for which the profiles are calculated
        :param grid_points: number of points in a single variable split if calculated automatically
        :param variable_splits: mapping of variables into points the profile will be calculated, if None then calculate with the function `_calculate_variable_splits`
        :param processes: integer, number of parallel processes, iterated over variables
        :return: fitted CeterisParibus object
        """

        supported_types = ('ceteris_paribus', )
        chosen_type = check_method_type(type, supported_types)

        if chosen_type == 'ceteris_paribus':
            # Collect the constructor options first, then build the explanation.
            profile_options = {
                'variables': variables,
                'grid_points': grid_points,
                'variable_splits': variable_splits,
                'processes': processes,
            }
            cp_profile = CeterisParibus(**profile_options)

        # Fit against this explainer; labels are optional.
        cp_profile.fit(self, new_observation, y)
        return cp_profile
Esempio n. 3
0
    def model_profile(self,
                      type=('partial', 'accumulated', 'conditional'),
                      N=500,
                      variables=None,
                      variable_type='numerical',
                      groups=None,
                      span=0.25,
                      grid_points=101,
                      intercept=True):
        """Dataset Level Variable Effect as Partial Dependency Profile or Accumulated Local Effects

        Samples up to `N` rows from `self.data`, fits a `CeterisParibus` explanation on
        that sample and aggregates it with `AggregatedProfiles`.

        :param type: either partial/conditional/accumulated for partial dependence, conditional profiles of accumulated local effects (validated by `check_method_type`)
        :param N: number of observations used for calculation of the profiles. By default, 500 observations will be chosen randomly (without replacement) from the whole `data`. If None then all observations will be used. Values larger than the number of rows are capped.
        :param variables: names of variables for which profiles shall be calculated.
        :param variable_type: If "numerical" then only numerical variables will be calculated. If "categorical" then only categorical variables will be calculated.
        :param groups: a variable name that will be used for grouping.
        :param span: smoothing coefficient, by default 0.25. It's the sd for gaussian kernel
        :param grid_points: number of points for profile
        :param intercept: False if center data on 0 (forwarded to `AggregatedProfiles`)
        :return: fitted AggregatedProfiles object
        """

        types = ('partial', 'accumulated', 'conditional')
        type = check_method_type(type, types)

        n_rows = self.data.shape[0]
        # None means "use every row"; min(None, int) would raise TypeError.
        N = n_rows if N is None else min(N, n_rows)

        # Sample row positions from the WHOLE dataset. Drawing from
        # np.arange(N) would only ever shuffle the first N rows and bias
        # the profiles towards the head of the data.
        sample_idx = np.random.choice(np.arange(n_rows), N, replace=False)

        ceteris_paribus = CeterisParibus(grid_points=grid_points)
        ceteris_paribus.fit(self, self.data.iloc[sample_idx, :], self.y[sample_idx])

        model_profile_ = AggregatedProfiles(
            type=type,
            variables=variables,
            variable_type=variable_type,
            groups=groups,
            span=span,
            intercept=intercept
        )

        model_profile_.fit(ceteris_paribus)

        return model_profile_
Esempio n. 4
0
    def model_profile(self,
                      type=('partial', 'accumulated', 'conditional'),
                      N=300,
                      variables=None,
                      variable_type='numerical',
                      groups=None,
                      span=0.25,
                      grid_points=101,
                      variable_splits=None,
                      center=True,
                      processes=1,
                      random_state=None,
                      verbose=True):
        """Calculate dataset level variable profiles as Partial or Accumulated Dependence

        Parameters
        -----------
        type : {'partial', 'accumulated', 'conditional'}
            Type of model profiles (default is 'partial' for Partial Dependence Profiles).
        N : int, optional
            Number of observations that will be sampled (without replacement, from
            all rows) from the `data` attribute before the calculation of variable
            profiles. None means all `data` (default is 300). Values larger than the
            number of rows are capped.
        variables : str or array_like of str, optional
            Variables for which the profiles will be calculated
            (default is None, which means all of the variables).
        variable_type : {'numerical', 'categorical'}
            Calculate the profiles for numerical or categorical variables
            (default is 'numerical').
        groups : str or array_like of str, optional
            Names of categorical variables that will be used for profile grouping
            (default is None, which means no grouping).
        span : float, optional
            Smoothing coefficient used as sd for gaussian kernel (default is 0.25).
        grid_points : int, optional
            Maximum number of points for profile calculations (default is 101).
            NOTE: The final number of points may be lower than `grid_points`,
            eg. if there is not enough unique values for a given variable.
        variable_splits : dict of lists, optional
            Split points for variables e.g. {'x': [0, 0.2, 0.5, 0.8, 1], 'y': ['a', 'b']}
            (default is None, which means that they will be distributed uniformly).
        center : bool, optional
            Theoretically Accumulated Profiles start at 0, but are centered to compare
            them with Partial Dependence Profiles (default is True, which means center
            around the average y_hat calculated on the data sample).
        processes : int, optional
            Number of parallel processes to use in calculations. Iterated over `variables`
            (default is 1, which means no parallel computation).
        random_state : int, optional
            Set seed for random number generator (default is random seed). When given,
            it seeds the global numpy generator and is forwarded to `AggregatedProfiles`.
        verbose : bool, optional
            Print tqdm progress bar (default is True).

        Returns
        -----------
        AggregatedProfiles class object
            Explanation object containing the main result attribute and the plot method.

        Notes
        --------
        https://pbiecek.github.io/ema/partialDependenceProfiles.html
        https://pbiecek.github.io/ema/accumulatedLocalProfiles.html
        """

        types = ('partial', 'accumulated', 'conditional')
        type = check_method_type(type, types)

        n_rows = self.data.shape[0]
        N = n_rows if N is None else min(N, n_rows)

        if random_state is not None:
            np.random.seed(random_state)

        # Sample row positions from the WHOLE dataset. Drawing from
        # np.arange(N) would only ever shuffle the first N rows and bias
        # the profiles towards the head of the data.
        sample_idx = np.random.choice(np.arange(n_rows), N, replace=False)

        ceteris_paribus = CeterisParibus(grid_points=grid_points,
                                         variables=variables,
                                         variable_splits=variable_splits,
                                         variable_splits_type='uniform',
                                         processes=processes)
        ceteris_paribus.fit(self,
                            self.data.iloc[sample_idx, :],
                            self.y[sample_idx],
                            verbose=verbose)

        model_profile_ = AggregatedProfiles(type=type,
                                            variables=variables,
                                            variable_type=variable_type,
                                            groups=groups,
                                            span=span,
                                            center=center,
                                            random_state=random_state)

        model_profile_.fit(ceteris_paribus, verbose)

        return model_profile_
Esempio n. 5
0
    def predict_profile(self,
                        new_observation,
                        type=('ceteris_paribus', ),
                        y=None,
                        variables=None,
                        grid_points=101,
                        variable_splits=None,
                        variable_splits_type='uniform',
                        variable_splits_with_obs=True,
                        processes=1,
                        verbose=True):
        """Compute instance level Ceteris Paribus profiles for given observations

        Parameters
        -----------
        new_observation : pd.DataFrame or np.ndarray or pd.Series
            Observations whose predictions are to be explained.
        type : {'ceteris_paribus'}
            Kind of variable profile; 'ceteris_paribus' is the only supported
            value and also the default.
        y : pd.Series or np.ndarray (1d), optional
            Target values, one per row of `new_observation`.
        variables : str or array_like of str, optional
            Variables to compute profiles for
            (default is None, meaning every variable).
        grid_points : int, optional
            Upper bound on the number of points per profile (default is 101).
            NOTE: Fewer points may be used, e.g. when a variable has too few
            unique values.
        variable_splits : dict of lists, optional
            Explicit split points, e.g. {'x': [0, 0.2, 0.5, 0.8, 1], 'y': ['a', 'b']}
            (default is None, meaning they are derived from the `data` attribute
            according to `variable_splits_type`).
        variable_splits_type : {'uniform', 'quantiles'}, optional
            How `variable_splits` are derived: 'uniform' for an even grid
            (the default), 'quantiles' for percentiles.
        variable_splits_with_obs : bool, optional
            Whether the values found in `new_observation` are added to the
            `variable_splits` (default is True).
        processes : int, optional
            Number of parallel processes, iterated over `variables`
            (default is 1, i.e. no parallel computation).
        verbose : bool, optional
            Show a tqdm progress bar (default is True).

        Returns
        -----------
        CeterisParibus class object
            Explanation object containing the main result attribute and the plot method.

        Notes
        --------
        https://pbiecek.github.io/ema/ceterisParibus.html
        """

        supported_types = ('ceteris_paribus', )
        chosen_type = check_method_type(type, supported_types)

        if chosen_type == 'ceteris_paribus':
            # Collect the constructor options first, then build the explanation.
            profile_options = {
                'variables': variables,
                'grid_points': grid_points,
                'variable_splits': variable_splits,
                'variable_splits_type': variable_splits_type,
                'variable_splits_with_obs': variable_splits_with_obs,
                'processes': processes,
            }
            cp_profile = CeterisParibus(**profile_options)

        # `verbose` is passed positionally to fit.
        cp_profile.fit(self, new_observation, y, verbose)
        return cp_profile