def model_profile(self, type=('partial', 'accumulated', 'conditional'), N=500, variables=None,
                  variable_type='numerical', groups=None, span=0.25, grid_points=101, intercept=True,
                  processes=1, random_state=None, verbose=True):
    """Dataset Level Variable Effect as Partial Dependency Profile or Accumulated Local Effects

    Samples up to `N` rows from `self.data`, fits a `CeterisParibus` explainer on
    them and aggregates the resulting profiles with `AggregatedProfiles`.

    :param type: either partial/conditional/accumulated for partial dependence, conditional profiles
        or accumulated local effects (validated by `check_method_type`)
    :param N: number of observations used for calculation of partial dependency profiles.
        By default, 500 observations will be chosen randomly (without replacement) from all rows
        of the data. If None then all observations will be used.
    :param variables: str or list or numpy.ndarray or pandas.Series, if not None then aggregate only
        for selected variables will be calculated, if None all will be selected
    :param variable_type: If "numerical" then only numerical variables will be calculated.
        If "categorical" then only categorical variables will be calculated.
    :param groups: str or list or numpy.ndarray or pandas.Series, a variable names that will be used
        for grouping
    :param span: smoothing coefficient, by default 0.25. It's the sd for gaussian kernel
    :param grid_points: number of points for profile
    :param intercept: False if center data on 0
    :param processes: integer, number of parallel processes, iterated over variables
    :param random_state: int, seed for random number generator
    :param verbose: print tqdm progress bar
    :return: fitted AggregatedProfiles object
    """
    types = ('partial', 'accumulated', 'conditional')
    type = check_method_type(type, types)

    n_rows = self.data.shape[0]
    # None means "use everything"; otherwise never ask for more rows than exist.
    N = n_rows if N is None else min(N, n_rows)

    if random_state is not None:
        # NOTE: this seeds numpy's global generator, as the original code did.
        np.random.seed(random_state)

    # Draw N distinct row positions from the WHOLE data set. Sampling from
    # np.arange(N) would only shuffle the first N rows and never look at the rest.
    I = np.random.choice(np.arange(n_rows), N, replace=False)

    ceteris_paribus = CeterisParibus(grid_points=grid_points, processes=processes)
    ceteris_paribus.fit(self, self.data.iloc[I, :], self.y[I], verbose=verbose)

    model_profile_ = AggregatedProfiles(type=type, variables=variables, variable_type=variable_type,
                                        groups=groups, span=span, intercept=intercept,
                                        random_state=random_state)
    model_profile_.fit(ceteris_paribus, verbose)

    return model_profile_
def predict_profile(self, new_observation, type=('ceteris_paribus', ), y=None, variables=None,
                    grid_points=101, variable_splits=None, processes=1):
    """Creates CeterisParibus object

    Builds a `CeterisParibus` explainer from the given settings and fits it on
    `new_observation` using this explainer.

    :param new_observation: DataFrame with observations for which the profiles will be calculated
    :param type: type of the profile; only 'ceteris_paribus' is accepted here
    :param y: pandas Series with labels for the observations
    :param variables: variables for which the profiles are calculated
    :param grid_points: number of points in a single variable split if calculated automatically
    :param variable_splits: mapping of variables into points the profile will be calculated,
        if None then calculate with the function `_calculate_variable_splits`
    :param processes: integer, number of parallel processes, iterated over variables
    :return: fitted CeterisParibus object
    """
    allowed_types = ('ceteris_paribus', )
    type = check_method_type(type, allowed_types)

    explainer_options = {
        'variables': variables,
        'grid_points': grid_points,
        'variable_splits': variable_splits,
        'processes': processes,
    }

    # Only one profile type exists so far; the branch is kept so further types can be added.
    if type == 'ceteris_paribus':
        profile = CeterisParibus(**explainer_options)

    profile.fit(self, new_observation, y)

    return profile
def model_profile(self, type=('partial', 'accumulated', 'conditional'), N=500, variables=None,
                  variable_type='numerical', groups=None, span=0.25, grid_points=101, intercept=True):
    """Dataset Level Variable Effect as Partial Dependency Profile or Accumulated Local Effects

    Samples up to `N` rows from `self.data`, fits a `CeterisParibus` explainer on
    them and aggregates the resulting profiles with `AggregatedProfiles`.

    :param type: either partial/conditional/accumulated for partial dependence, conditional profiles
        or accumulated local effects (validated by `check_method_type`)
    :param N: number of observations used for calculation of partial dependency profiles.
        By default, 500 observations will be chosen randomly (without replacement) from all rows
        of the data. If None then all observations will be used.
    :param variables: names of variables for which profiles shall be calculated.
    :param variable_type: If "numerical" then only numerical variables will be calculated.
        If "categorical" then only categorical variables will be calculated.
    :param groups: a variable name that will be used for grouping.
    :param span: smoothing coefficient, by default 0.25. It's the sd for gaussian kernel
    :param grid_points: number of points for profile
    :param intercept: False if center data on 0
    :return: fitted AggregatedProfiles object
    """
    types = ('partial', 'accumulated', 'conditional')
    type = check_method_type(type, types)

    n_rows = self.data.shape[0]
    # None means "use everything" (min(None, n) would raise TypeError);
    # otherwise never ask for more rows than exist.
    N = n_rows if N is None else min(N, n_rows)

    # Draw N distinct row positions from the WHOLE data set. Sampling from
    # np.arange(N) would only shuffle the first N rows and never look at the rest.
    I = np.random.choice(np.arange(n_rows), N, replace=False)

    ceteris_paribus = CeterisParibus(grid_points=grid_points)
    ceteris_paribus.fit(self, self.data.iloc[I, :], self.y[I])

    model_profile_ = AggregatedProfiles(
        type=type,
        variables=variables,
        variable_type=variable_type,
        groups=groups,
        span=span,
        intercept=intercept
    )
    model_profile_.fit(ceteris_paribus)

    return model_profile_
def model_profile(self, type=('partial', 'accumulated', 'conditional'), N=300, variables=None,
                  variable_type='numerical', groups=None, span=0.25, grid_points=101, variable_splits=None,
                  center=True, processes=1, random_state=None, verbose=True):
    """Calculate dataset level variable profiles as Partial or Accumulated Dependence

    Parameters
    ----------
    type : {'partial', 'accumulated', 'conditional'}
        Type of model profiles (default is 'partial' for Partial Dependence Profiles).
    N : int, optional
        Number of observations that will be sampled (without replacement, from all rows)
        from the `data` attribute before the calculation of variable profiles.
        None means all `data` (default is 300).
    variables : str or array_like of str, optional
        Variables for which the profiles will be calculated
        (default is None, which means all of the variables).
    variable_type : {'numerical', 'categorical'}
        Calculate the profiles for numerical or categorical variables
        (default is 'numerical').
    groups : str or array_like of str, optional
        Names of categorical variables that will be used for profile grouping
        (default is None, which means no grouping).
    span : float, optional
        Smoothing coefficient used as sd for gaussian kernel (default is 0.25).
    grid_points : int, optional
        Maximum number of points for profile calculations (default is 101).
        NOTE: The final number of points may be lower than `grid_points`,
        e.g. if there is not enough unique values for a given variable.
    variable_splits : dict of lists, optional
        Split points for variables e.g. {'x': [0, 0.2, 0.5, 0.8, 1], 'y': ['a', 'b']}
        (default is None, which means that they will be distributed uniformly).
    center : bool, optional
        Theoretically Accumulated Profiles start at 0, but are centered to compare
        them with Partial Dependence Profiles (default is True, which means center
        around the average y_hat calculated on the data sample).
    processes : int, optional
        Number of parallel processes to use in calculations. Iterated over `variables`
        (default is 1, which means no parallel computation).
    random_state : int, optional
        Set seed for random number generator (default is random seed).
    verbose : bool, optional
        Print tqdm progress bar (default is True).

    Returns
    -------
    AggregatedProfiles class object
        Explanation object containing the main result attribute and the plot method.

    Notes
    -----
    https://pbiecek.github.io/ema/partialDependenceProfiles.html
    https://pbiecek.github.io/ema/accumulatedLocalProfiles.html
    """
    types = ('partial', 'accumulated', 'conditional')
    type = check_method_type(type, types)

    n_rows = self.data.shape[0]
    # None means "use everything"; otherwise never ask for more rows than exist.
    N = n_rows if N is None else min(N, n_rows)

    if random_state is not None:
        # NOTE: this seeds numpy's global generator, as the original code did.
        np.random.seed(random_state)

    # Draw N distinct row positions from the WHOLE data set. Sampling from
    # np.arange(N) would only shuffle the first N rows and never look at the rest.
    I = np.random.choice(np.arange(n_rows), N, replace=False)

    ceteris_paribus = CeterisParibus(grid_points=grid_points,
                                     variables=variables,
                                     variable_splits=variable_splits,
                                     variable_splits_type='uniform',
                                     processes=processes)
    ceteris_paribus.fit(self, self.data.iloc[I, :], self.y[I], verbose=verbose)

    model_profile_ = AggregatedProfiles(type=type,
                                        variables=variables,
                                        variable_type=variable_type,
                                        groups=groups,
                                        span=span,
                                        center=center,
                                        random_state=random_state)
    model_profile_.fit(ceteris_paribus, verbose)

    return model_profile_
def predict_profile(self, new_observation, type=('ceteris_paribus', ), y=None, variables=None,
                    grid_points=101, variable_splits=None, variable_splits_type='uniform',
                    variable_splits_with_obs=True, processes=1, verbose=True):
    """Calculate instance level variable profiles as Ceteris Paribus

    Parameters
    ----------
    new_observation : pd.DataFrame or np.ndarray or pd.Series
        Observations for which predictions need to be explained.
    type : {'ceteris_paribus'}
        Type of variable profiles (default is 'ceteris_paribus', currently the
        only accepted value).
    y : pd.Series or np.ndarray (1d), optional
        Target variable with the same length as `new_observation`.
    variables : str or array_like of str, optional
        Variables for which the profiles will be calculated
        (default is None, which means all of the variables).
    grid_points : int, optional
        Maximum number of points for profile calculations (default is 101).
        NOTE: The final number of points may be lower than `grid_points`,
        e.g. if there is not enough unique values for a given variable.
    variable_splits : dict of lists, optional
        Split points for variables e.g. {'x': [0, 0.2, 0.5, 0.8, 1], 'y': ['a', 'b']}
        (default is None, which means that they will be calculated using one of
        `variable_splits_type` and the `data` attribute).
    variable_splits_type : {'uniform', 'quantiles'}, optional
        Way of calculating `variable_splits`. Set 'quantiles' for percentiles.
        (default is 'uniform', which means uniform grid of points).
    variable_splits_with_obs : bool, optional
        Add variable values of `new_observation` data to the `variable_splits`
        (default is True).
    processes : int, optional
        Number of parallel processes to use in calculations. Iterated over `variables`
        (default is 1, which means no parallel computation).
    verbose : bool, optional
        Print tqdm progress bar (default is True).

    Returns
    -------
    CeterisParibus class object
        Explanation object containing the main result attribute and the plot method.

    Notes
    -----
    https://pbiecek.github.io/ema/ceterisParibus.html
    """
    allowed_types = ('ceteris_paribus', )
    type = check_method_type(type, allowed_types)

    explainer_options = {
        'variables': variables,
        'grid_points': grid_points,
        'variable_splits': variable_splits,
        'variable_splits_type': variable_splits_type,
        'variable_splits_with_obs': variable_splits_with_obs,
        'processes': processes,
    }

    # Only one profile type exists so far; the branch is kept so further types can be added.
    if type == 'ceteris_paribus':
        profile = CeterisParibus(**explainer_options)

    profile.fit(self, new_observation, y, verbose)

    return profile