def infer_target_cfs_class(self, desired_class_input, original_pred, num_output_nodes):
    """Infer the target class for generating CFs.

    Only called when model_type=="classifier".

    TODO: Add support for opposite desired class in multiclass.
    Downstream methods should decide whether it is allowed or not.

    :param desired_class_input: Either the string "opposite" or an integer class
                                index. Any ``numbers.Integral`` is accepted, so
                                NumPy integer labels work as well as builtin ints.
    :param original_pred: Model output for the query instance. Only read when
                          ``desired_class_input`` is "opposite"; ``np.argmax`` of
                          it is taken as the originally predicted class.
    :param num_output_nodes: Number of model outputs; valid class indices are
                             ``0 <= index < num_output_nodes``.

    :returns: The target class index as a builtin ``int``.
    :raises UserConfigValidationException: If "opposite" is requested with more
        than two output nodes, if the index is out of range, or if the input is
        neither "opposite" nor an integer.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    if desired_class_input == "opposite":
        if num_output_nodes == 2:
            predicted_class = np.argmax(original_pred)
            return int(1 - predicted_class)
        if num_output_nodes > 2:
            raise UserConfigValidationException(
                "Desired class cannot be opposite if the number of classes is more than 2."
            )
        # NOTE(review): with fewer than two output nodes nothing is inferred and
        # None is returned, exactly as before - confirm whether a single-output
        # model should be supported or rejected here.
        return None

    if isinstance(desired_class_input, numbers.Integral):
        if 0 <= desired_class_input < num_output_nodes:
            # Normalise NumPy integers to a plain int for downstream consumers.
            return int(desired_class_input)
        raise UserConfigValidationException(
            "Desired class not present in training data!")

    raise UserConfigValidationException(
        "The target class for {0} could not be identified".format(
            desired_class_input))
def _validate_serialization_version(version):
    """Reject a missing or unsupported serialization version.

    :param version: Version value read from the json input, or None when absent.

    :raises UserConfigValidationException: If ``version`` is None, or if
        ``_check_supported_json_output_versions`` does not accept it.
    """
    if version is None:
        raise UserConfigValidationException(
            "No version field in the json input")

    if _check_supported_json_output_versions(version):
        return

    raise UserConfigValidationException(
        "Incompatible version {} found in json input".format(version))
def decide(backend):
    """Decides the Model implementation type.

    :param backend: One of the BackEndTypes values, or a dictionary whose
                    "model" entry has the form ``module_name.class_name``.

    :returns: The model interface class (not an instance) for the backend.
    :raises UserConfigValidationException: If the tensorflow or torch backend is
        requested but the corresponding package cannot be imported.
    """
    if backend == BackEndTypes.Sklearn:
        # random sampling of CFs
        from dice_ml.model_interfaces.base_model import BaseModel
        return BaseModel

    wants_tensorflow = (backend == BackEndTypes.Tensorflow1
                        or backend == BackEndTypes.Tensorflow2)
    if wants_tensorflow:
        # Tensorflow 1 or 2 backend
        try:
            import tensorflow
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import tensorflow. Please install tensorflow")
        from dice_ml.model_interfaces.keras_tensorflow_model import KerasTensorFlowModel
        return KerasTensorFlowModel

    if backend == BackEndTypes.Pytorch:
        # PyTorch backend
        try:
            import torch
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import torch. Please install torch")
        from dice_ml.model_interfaces.pytorch_model import PyTorchModel
        return PyTorchModel

    # all other implementations and frameworks
    qualified_name = backend['model']
    module_name, class_name = qualified_name.split('.')
    # A non-empty fromlist makes __import__ return the leaf module itself.
    loaded_module = __import__("dice_ml.model_interfaces." + module_name,
                               fromlist=[class_name])
    return getattr(loaded_module, class_name)
def local_feature_importance(self, query_instances, cf_examples_list=None,
                             total_CFs=10,
                             desired_class="opposite", desired_range=None, permitted_range=None,
                             features_to_vary="all", stopping_threshold=0.5,
                             posthoc_sparsity_param=0.1,
                             posthoc_sparsity_algorithm="linear", **kwargs):
    """Estimate local feature importance scores for the given inputs.

    :param query_instances: A list of inputs for which to compute the
                            feature importances. These can be provided as a dataframe.
    :param cf_examples_list: If precomputed, a list of counterfactual
                             examples for every input point. If cf_examples_list is provided, then
                             all the following parameters are ignored.
    :param total_CFs: The number of counterfactuals to generate per input
                      (default is 10)
    :param other_parameters: These are the same as the generate_counterfactuals method.

    :returns: An object of class CounterfactualExplanations that includes
              the list of counterfactuals per input, local feature importances per
              input, and the global feature importance summarized over all inputs.
    :raises UserConfigValidationException: If any precomputed entry holds fewer
        than 10 counterfactuals (an entry whose ``final_cfs_df`` is None counts
        as zero), or if no precomputed list is given and ``total_CFs`` is below 10.
    """
    # NOTE(review): the extra options are handed over as one keyword literally
    # named "kwargs" (not unpacked); kept as-is to preserve the existing call.
    self._validate_counterfactual_configuration(
        query_instances=query_instances,
        total_CFs=total_CFs,
        desired_class=desired_class,
        desired_range=desired_range,
        permitted_range=permitted_range,
        features_to_vary=features_to_vary,
        stopping_threshold=stopping_threshold,
        posthoc_sparsity_param=posthoc_sparsity_param,
        posthoc_sparsity_algorithm=posthoc_sparsity_algorithm,
        kwargs=kwargs
    )

    def _count_cfs(cf_examples):
        # final_cfs_df is None when no counterfactual was produced for a query
        # point; treat that as zero instead of failing inside len().
        final_cfs = cf_examples.final_cfs_df
        return 0 if final_cfs is None else len(final_cfs)

    if cf_examples_list is not None:
        if any(_count_cfs(cf_examples) < 10 for cf_examples in cf_examples_list):
            raise UserConfigValidationException(
                "The number of counterfactuals generated per query instance should be "
                "greater than or equal to 10 to compute feature importance for all query points")
    elif total_CFs < 10:
        raise UserConfigValidationException(
            "The number of counterfactuals requested per "
            "query instance should be greater than or equal to 10 "
            "to compute feature importance for all query points")

    return self.feature_importance(
        query_instances,
        cf_examples_list=cf_examples_list,
        total_CFs=total_CFs,
        local_importance=True,
        global_importance=False,
        desired_class=desired_class,
        desired_range=desired_range,
        permitted_range=permitted_range,
        features_to_vary=features_to_vary,
        stopping_threshold=stopping_threshold,
        posthoc_sparsity_param=posthoc_sparsity_param,
        posthoc_sparsity_algorithm=posthoc_sparsity_algorithm,
        **kwargs)
def __init__(self, model=None, model_path='', backend=BackEndTypes.Tensorflow1, model_type=ModelTypes.Classifier,
             func=None, kw_args=None):
    """Validate the configuration and select the backend-specific model class.

    :param model: trained ML model.
    :param model_path: path to trained ML model.
    :param backend: "TF1" ("TF2") for TensorFLow 1.0 (2.0), "PYT" for PyTorch implementations,
                    "sklearn" for Scikit-Learn implementations of standard
                    DiCE (https://arxiv.org/pdf/1905.07697.pdf). For all other frameworks and
                    implementations, provide a dictionary with "model" and "explainer" as keys,
                    and include module and class names as values in the form
                    module_name.class_name. For instance, if there is a model interface class
                    "XGBoostModel" in module "xgboost_model.py" inside the subpackage
                    dice_ml.model_interfaces, and dice interface class "DiceXGBoost" in module
                    "dice_xgboost" inside dice_ml.explainer_interfaces, then backend parameter
                    should be {"model": "xgboost_model.XGBoostModel",
                    "explainer": "dice_xgboost.DiceXGBoost"}.
                    A backend outside BackEndTypes.ALL only triggers a warning.
    :param model_type: Must be one of ModelTypes.ALL; stored as ``self.model_type``.
    :param func: function transformation required for ML model. If func is None, then func will be the identity function.
    :param kw_args: Dictionary of additional keyword arguments to pass to func. DiCE's data_interface is appended
                    to the dictionary of kw_args, by default.

    :raises UserConfigValidationException: If ``model_type`` is not supported.
    :raises ValueError: If neither ``model`` nor ``model_path`` is supplied.
    """
    if backend not in BackEndTypes.ALL:
        supported_backends = ','.join(BackEndTypes.ALL)
        warnings.warn('{0} backend not in supported backends {1}'.format(
            backend, supported_backends))

    if model_type not in ModelTypes.ALL:
        supported_model_types = ','.join(ModelTypes.ALL)
        raise UserConfigValidationException('{0} model type not in supported model types {1}'.format(
            model_type, supported_model_types))

    self.model_type = model_type

    nothing_to_load = model is None and model_path == ''
    if nothing_to_load:
        raise ValueError("should provide either a trained model or the path to a model")

    self.decide_implementation_type(model, model_path, backend, func, kw_args)
def check_features_to_vary(self, features_to_vary):
    """Ensure every feature requested for variation is a known feature.

    :param features_to_vary: None, the string 'all', or an iterable of feature
                             names. None and 'all' are accepted without checks.

    :raises UserConfigValidationException: If any requested name is missing
        from ``self.feature_names``.
    """
    if features_to_vary is None:
        return

    if features_to_vary != 'all':
        unknown_features = set(features_to_vary).difference(self.feature_names)
        if unknown_features:
            raise UserConfigValidationException(
                "Got features {0} which are not present in training data".format(
                    unknown_features))
def generate_counterfactuals(self, query_instances, total_CFs,
                             desired_class="opposite", desired_range=None,
                             permitted_range=None, features_to_vary="all",
                             stopping_threshold=0.5, posthoc_sparsity_param=0.1,
                             posthoc_sparsity_algorithm="linear", verbose=False, **kwargs):
    """General method for generating counterfactuals.

    Each query instance is passed on its own to ``self._generate_counterfactuals``
    and the per-instance results are collected in input order.

    :param query_instances: Input point(s) for which counterfactuals are to be generated.
                            A dataframe is split into one-row dataframes; any other
                            iterable is walked as given.
    :param total_CFs: Total number of counterfactuals required. Must be positive.
    :param desired_class: Desired counterfactual class - can take 0 or 1. Default value
                          is "opposite" to the outcome class of query_instance for binary classification.
    :param desired_range: For regression problems. Contains the outcome range
                          to generate counterfactuals in.
    :param permitted_range: Dictionary with feature names as keys and permitted range in list as values.
                            Defaults to the range inferred from training data.
                            If None, uses the parameters initialized in data_interface.
    :param features_to_vary: Either a string "all" or a list of feature names to vary.
    :param stopping_threshold: Minimum threshold for counterfactuals target class probability.
    :param posthoc_sparsity_param: Parameter for the post-hoc operation on continuous features to enhance sparsity.
    :param posthoc_sparsity_algorithm: Perform either linear or binary search. Takes "linear" or "binary".
                                       Prefer binary search when a feature range is large (for instance,
                                       income varying from 10k to 1000k) and only if the features share a
                                       monotonic relationship with predicted outcome in the model.
    :param verbose: Whether to output detailed messages.
    :param sample_size: Sampling size
    :param random_seed: Random seed for reproducibility
    :param kwargs: Other parameters accepted by specific explanation method

    :returns: A CounterfactualExplanations object that contains the list of
              counterfactual examples per query_instance as one of its attributes.
    :raises UserConfigValidationException: If ``total_CFs`` is not positive.
    """
    if total_CFs <= 0:
        raise UserConfigValidationException(
            "The number of counterfactuals generated per query instance (total_CFs) should be a positive integer.")

    if isinstance(query_instances, pd.DataFrame):
        # Slice (rather than index) so every item is still a one-row dataframe.
        per_query_inputs = [query_instances[row:(row + 1)]
                            for row in range(query_instances.shape[0])]
    elif isinstance(query_instances, Iterable):
        per_query_inputs = query_instances
    else:
        # Any other input type yields no query points, hence an empty result.
        per_query_inputs = []

    collected_examples = [
        self._generate_counterfactuals(
            single_query, total_CFs,
            desired_class=desired_class,
            desired_range=desired_range,
            permitted_range=permitted_range,
            features_to_vary=features_to_vary,
            stopping_threshold=stopping_threshold,
            posthoc_sparsity_param=posthoc_sparsity_param,
            posthoc_sparsity_algorithm=posthoc_sparsity_algorithm,
            verbose=verbose,
            **kwargs)
        for single_query in tqdm(per_query_inputs)
    ]

    return CounterfactualExplanations(cf_examples_list=collected_examples)
def check_permitted_range(self, permitted_range):
    """Validate a user-supplied permitted range against the known features.

    :param permitted_range: None, or a dictionary keyed by feature name. For
                            categorical features the value is iterated as the
                            categories the user wants to allow.

    :raises UserConfigValidationException: If a key is not in
        ``self.feature_names``, or if a requested category of a categorical
        feature is absent from ``self.permitted_range`` for that feature.
    """
    if permitted_range is None:
        return

    requested_features = list(permitted_range)
    unknown_features = set(requested_features) - set(self.feature_names)
    if unknown_features:
        raise UserConfigValidationException(
            "Got features {0} which are not present in training data".format(
                unknown_features))

    for feature in requested_features:
        if feature not in self.categorical_feature_names:
            continue
        train_categories = self.permitted_range[feature]
        for test_category in permitted_range[feature]:
            if test_category in train_categories:
                continue
            raise UserConfigValidationException(
                'The category {0} does not occur in the training data for feature {1}.'
                ' Allowed categories are {2}'.format(
                    test_category, feature, train_categories))
def _check_any_counterfactuals_computed(self, cf_examples_arr):
    """Check if any counterfactuals were generated for any query point.

    :param cf_examples_arr: Iterable of objects exposing ``final_cfs_df``.

    :raises UserConfigValidationException: If every entry has a ``final_cfs_df``
        that is None or empty (including when the iterable itself is empty).
    """
    # any() stops at the first query point that has counterfactuals.
    some_cf_generated = any(
        cf_examples.final_cfs_df is not None and len(cf_examples.final_cfs_df) > 0
        for cf_examples in cf_examples_arr
    )
    if not some_cf_generated:
        raise UserConfigValidationException(
            "No counterfactuals found for any of the query points! Kindly check your configuration.")
def decide_implementation_type(self, data_interface, model_interface, method, **kwargs):
    """Decides DiCE implementation type.

    Swaps this instance's class for the one returned by ``decide`` and then
    runs that class's ``__init__`` on the same instance.

    :param data_interface: Data interface passed on to the chosen explainer.
    :param model_interface: Model interface; its ``backend`` drives the choice.
    :param method: Sampling strategy requested for the sklearn backend.
    :param kwargs: Extra keyword arguments forwarded to the chosen ``__init__``.

    :raises UserConfigValidationException: If the sklearn kdtree explainer is
        requested together with a PrivateData data interface.
    """
    uses_sklearn = model_interface.backend == BackEndTypes.Sklearn
    if uses_sklearn and method == SamplingStrategy.KdTree and isinstance(data_interface, PrivateData):
        raise UserConfigValidationException(
            'Private data interface is not supported with sklearn kdtree explainer'
            ' since kdtree explainer needs access to entire training data')

    explainer_class = decide(model_interface, method)
    self.__class__ = explainer_class
    self.__init__(data_interface, model_interface, **kwargs)
def decide(backend):
    """Decides the Model implementation type.

    To add new implementations of Model, add the class in model_interfaces
    subpackage and import-and-return the class in a new branch, following the
    pattern of the branches below.

    :param backend: One of the BackEndTypes values, or a dictionary whose
                    "model" entry has the form ``module_name.class_name``.

    :returns: The model interface class (not an instance) for the backend.
    :raises UserConfigValidationException: If the tensorflow or torch backend is
        requested but the corresponding package cannot be imported.
    """
    if backend == BackEndTypes.Sklearn:
        # random sampling of CFs
        from dice_ml.model_interfaces.base_model import BaseModel
        return BaseModel

    is_tensorflow_backend = (backend == BackEndTypes.Tensorflow1
                             or backend == BackEndTypes.Tensorflow2)
    if is_tensorflow_backend:
        # Tensorflow 1 or 2 backend
        try:
            import tensorflow  # noqa: F401
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import tensorflow. Please install tensorflow")
        from dice_ml.model_interfaces.keras_tensorflow_model import KerasTensorFlowModel
        return KerasTensorFlowModel

    if backend == BackEndTypes.Pytorch:
        # PyTorch backend
        try:
            import torch  # noqa: F401
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import torch. Please install torch from https://pytorch.org/")
        from dice_ml.model_interfaces.pytorch_model import PyTorchModel
        return PyTorchModel

    # all other implementations and frameworks
    module_name, class_name = backend['model'].split('.')
    interface_module = __import__("dice_ml.model_interfaces." + module_name,
                                  fromlist=[class_name])
    return getattr(interface_module, class_name)
def decide(model_interface, method):
    """Decides DiCE implementation type.

    To add new implementations of DiCE, add the class in explainer_interfaces
    subpackage and import-and-return the class in a new branch, following the
    pattern of the branches below.

    :param model_interface: Model interface whose ``backend`` selects the explainer.
    :param method: Sampling strategy; only consulted for the sklearn backend.

    :returns: The explainer class (not an instance).
    :raises UserConfigValidationException: If the sklearn backend is used with
        an unsupported sampling strategy.
    """
    backend = model_interface.backend

    if backend == BackEndTypes.Sklearn:
        if method == SamplingStrategy.Random:
            # random sampling of CFs
            from dice_ml.explainer_interfaces.dice_random import DiceRandom
            return DiceRandom
        if method == SamplingStrategy.Genetic:
            from dice_ml.explainer_interfaces.dice_genetic import DiceGenetic
            return DiceGenetic
        if method == SamplingStrategy.KdTree:
            from dice_ml.explainer_interfaces.dice_KD import DiceKD
            return DiceKD
        raise UserConfigValidationException(
            "Unsupported sample strategy {0} provided. "
            "Please choose one of {1}, {2} or {3}".format(
                method, SamplingStrategy.Random,
                SamplingStrategy.Genetic,
                SamplingStrategy.KdTree))

    if backend == BackEndTypes.Tensorflow1:
        # pretrained Keras Sequential model with Tensorflow 1.x backend
        from dice_ml.explainer_interfaces.dice_tensorflow1 import \
            DiceTensorFlow1
        return DiceTensorFlow1

    if backend == BackEndTypes.Tensorflow2:
        # pretrained Keras Sequential model with Tensorflow 2.x backend
        from dice_ml.explainer_interfaces.dice_tensorflow2 import \
            DiceTensorFlow2
        return DiceTensorFlow2

    if backend == BackEndTypes.Pytorch:
        # PyTorch backend
        from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch
        return DicePyTorch

    # all other backends
    module_name, class_name = backend['explainer'].split('.')
    explainer_module = __import__("dice_ml.explainer_interfaces." + module_name,
                                  fromlist=[class_name])
    return getattr(explainer_module, class_name)
def as_counterfactual_explanations(json_dict):
    """Convert a decoded json dictionary to a CounterfactualExplanations object.

    Dictionaries without a 'metadata' key are handed back untouched.

    :param json_dict: Dictionary decoded from json.

    :returns: A CounterfactualExplanations object when 'metadata' is present,
              otherwise ``json_dict`` itself.
    :raises UserConfigValidationException: If the metadata has no version or an
        unsupported one.
    """
    if 'metadata' not in json_dict:
        return json_dict

    version = json_dict['metadata'].get('version')
    if version is None:
        raise UserConfigValidationException(
            "No version field in the json input")
    if not _check_supported_json_output_versions(version):
        raise UserConfigValidationException(
            "Incompatible version {} found in json input".format(version))

    cf_examples_list = [
        CounterfactualExamples.from_json(serialized_examples)
        for serialized_examples in json_dict["cf_examples_list"]
    ]
    return CounterfactualExplanations(
        cf_examples_list,
        local_importance=json_dict["local_importance"],
        summary_importance=json_dict["summary_importance"])
def _validate_and_set_dataframe(self, params):
    """Validate and set the dataframe.

    Stores a copy of ``params['dataframe']`` as ``self.data_df``.

    :param params: Dictionary that must hold a pandas dataframe under
                   'dataframe' and may hold 'outcome_name'.

    :raises ValueError: If 'dataframe' is missing or is not a pandas dataframe.
    :raises UserConfigValidationException: If 'outcome_name' is given but is
        not one of the dataframe's columns.
    """
    if 'dataframe' not in params:
        raise ValueError("dataframe not found in params")

    supplied_frame = params['dataframe']
    if not isinstance(supplied_frame, pd.DataFrame):
        raise ValueError("should provide a pandas dataframe")
    self.data_df = supplied_frame.copy()

    if 'outcome_name' in params:
        outcome_name = params['outcome_name']
        if outcome_name not in self.data_df.columns.tolist():
            raise UserConfigValidationException(
                "outcome_name {0} not found in {1}".format(
                    outcome_name, ','.join(self.data_df.columns.tolist())))
def _validate_and_set_permitted_range(self, params):
    """Validate and set the dictionary of permitted ranges for continuous features.

    ``self.permitted_range`` is set to the first element returned by
    ``self.get_features_range``, which receives the user-supplied ranges (or
    None when 'permitted_range' is absent from ``params``).

    :param params: Dictionary that may hold 'permitted_range', keyed by feature name.

    :raises SystemException: If 'permitted_range' is given before
        ``self.feature_names`` has been set.
    :raises UserConfigValidationException: If 'permitted_range' names a feature
        that is not in ``self.feature_names``.
    """
    requested_range = params['permitted_range'] if 'permitted_range' in params else None

    if 'permitted_range' in params:
        if not hasattr(self, 'feature_names'):
            raise SystemException(
                'Feature names not correctly set in public data interface')
        for feature_name in requested_range:
            if feature_name in self.feature_names:
                continue
            raise UserConfigValidationException(
                "permitted_range contains some feature names which are not part of columns in dataframe")

    self.permitted_range, _ = self.get_features_range(requested_range)
def _validate_and_set_continuous_features_precision(self, params):
    """Validate and set the dictionary of precision for continuous features.

    ``self.continuous_features_precision`` becomes None when the key is absent
    from ``params``. When present, the value is stored first and validated
    afterwards, so it stays set even if validation raises.

    :param params: Dictionary that may hold 'continuous_features_precision',
                   keyed by feature name.

    :raises SystemException: If the precision is given before
        ``self.feature_names`` has been set.
    :raises UserConfigValidationException: If the precision names a feature
        that is not in ``self.feature_names``.
    """
    if 'continuous_features_precision' not in params:
        self.continuous_features_precision = None
        return

    self.continuous_features_precision = params['continuous_features_precision']
    if not hasattr(self, 'feature_names'):
        raise SystemException(
            'Feature names not correctly set in public data interface')

    for feature_name in self.continuous_features_precision:
        if feature_name in self.feature_names:
            continue
        raise UserConfigValidationException(
            "continuous_features_precision contains some feature names which are not part of columns in dataframe")
def _validate_counterfactual_configuration(
        self, query_instances, total_CFs,
        desired_class="opposite", desired_range=None,
        permitted_range=None, features_to_vary="all",
        stopping_threshold=0.5, posthoc_sparsity_param=0.1,
        posthoc_sparsity_algorithm="linear", verbose=False, **kwargs):
    """Validate the user-facing counterfactual generation settings.

    Only ``total_CFs``, ``posthoc_sparsity_algorithm``, ``stopping_threshold``,
    ``posthoc_sparsity_param`` and ``desired_range`` are inspected; the
    remaining parameters are accepted but not checked here.

    :raises UserConfigValidationException: If ``total_CFs`` is not positive, the
        sparsity algorithm is not in ``_PostHocSparsityTypes.ALL``, a threshold
        or sparsity parameter lies outside [0.0, 1.0], ``desired_range`` is
        set for a classifier or missing for a regressor, or ``desired_range``
        is not two values in ascending order.
    """
    def _outside_unit_interval(value):
        # Written with < and > so the comparison semantics stay exactly as before.
        return value < 0.0 or value > 1.0

    if total_CFs <= 0:
        raise UserConfigValidationException(
            "The number of counterfactuals generated per query instance (total_CFs) should be a positive integer.")

    if posthoc_sparsity_algorithm not in _PostHocSparsityTypes.ALL:
        raise UserConfigValidationException(
            'The posthoc_sparsity_algorithm should be {0} and not {1}'.format(
                ' or '.join(_PostHocSparsityTypes.ALL), posthoc_sparsity_algorithm))

    if _outside_unit_interval(stopping_threshold):
        raise UserConfigValidationException('The stopping_threshold should lie between {0} and {1}'.format(
            str(0.0), str(1.0)))

    if posthoc_sparsity_param is not None and _outside_unit_interval(posthoc_sparsity_param):
        raise UserConfigValidationException('The posthoc_sparsity_param should lie between {0} and {1}'.format(
            str(0.0), str(1.0)))

    if self.model is not None:
        model_type = self.model.model_type
        if model_type == ModelTypes.Classifier and desired_range is not None:
            raise UserConfigValidationException(
                'The desired_range parameter should not be set for classification task')
        if model_type == ModelTypes.Regressor and desired_range is None:
            raise UserConfigValidationException(
                'The desired_range parameter should be set for regression task')

    if desired_range is None:
        return

    if len(desired_range) != 2:
        raise UserConfigValidationException(
            "The parameter desired_range needs to have two numbers in ascending order.")
    if desired_range[0] > desired_range[1]:
        raise UserConfigValidationException(
            "The range provided in desired_range should be in ascending order.")
def to_json(self):
    """Serialize Explanations object to json.

    The layout depends on ``self.metadata['version']``. For V1 every
    counterfactual example is embedded as its own json string. For V2 each
    example is serialized, decoded again and split into combined test
    instance / counterfactual lists; the fields shared across examples are
    taken from the last example, and the importances are flattened into lists
    ordered by feature name. The result is checked against the json schema
    before being dumped.

    :returns: The json string.
    :raises UserConfigValidationException: If the version is neither V1 nor V2.
    """
    serialization_version = self.metadata['version']

    if serialization_version == _SchemaVersions.V1:
        serialized_examples = [
            cf_examples.to_json(serialization_version=serialization_version)
            for cf_examples in self.cf_examples_list
        ]
        entire_dict = {
            _CounterfactualExpV1SchemaConstants.CF_EXAMPLES_LIST: serialized_examples,
            _CounterfactualExpV1SchemaConstants.LOCAL_IMPORTANCE: self.local_importance,
            _CounterfactualExpV1SchemaConstants.SUMMARY_IMPORTANCE: self.summary_importance,
            _CounterfactualExpV1SchemaConstants.METADATA: self.metadata
        }
    elif serialization_version == _SchemaVersions.V2:
        all_test_instances = []
        all_final_cfs = []
        # Overwritten on every iteration, so the last example's values win;
        # they stay None when there are no examples at all.
        data_interface = None
        feature_names = None
        feature_names_including_target = None
        model_type = None
        desired_class = None
        desired_range = None

        for cf_examples in self.cf_examples_list:
            # We need to load the json again since we need to decompose the
            # counterfactual example into different schema fields
            decoded = json.loads(
                cf_examples.to_json(serialization_version=serialization_version))
            all_test_instances.append(
                decoded[_DiverseCFV2SchemaConstants.TEST_INSTANCE_LIST])
            all_final_cfs.append(
                decoded[_DiverseCFV2SchemaConstants.FIANL_CFS_LIST])
            data_interface = decoded[_DiverseCFV2SchemaConstants.DATA_INTERFACE]
            feature_names = decoded[_DiverseCFV2SchemaConstants.FEATURE_NAMES]
            feature_names_including_target = decoded[
                _DiverseCFV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET]
            model_type = decoded[_DiverseCFV2SchemaConstants.MODEL_TYPE]
            desired_class = decoded[_DiverseCFV2SchemaConstants.DESIRED_CLASS]
            desired_range = decoded[_DiverseCFV2SchemaConstants.DESIRED_RANGE]

        local_importance_matrix = None
        if self.local_importance is not None:
            local_importance_matrix = [
                [importance_by_feature.get(feature_name) for feature_name in feature_names]
                for importance_by_feature in self.local_importance
            ]

        summary_importance_list = None
        if self.summary_importance is not None:
            summary_importance_list = [
                self.summary_importance.get(feature_name)
                for feature_name in feature_names
            ]

        entire_dict = {
            _CounterfactualExpV2SchemaConstants.TEST_DATA: all_test_instances,
            _CounterfactualExpV2SchemaConstants.CFS_LIST: all_final_cfs,
            _CounterfactualExpV2SchemaConstants.LOCAL_IMPORTANCE: local_importance_matrix,
            _CounterfactualExpV2SchemaConstants.SUMMARY_IMPORTANCE: summary_importance_list,
            _CounterfactualExpV2SchemaConstants.DATA_INTERFACE: data_interface,
            _CounterfactualExpV2SchemaConstants.FEATURE_NAMES: feature_names,
            _CounterfactualExpV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET: feature_names_including_target,
            _CounterfactualExpV2SchemaConstants.MODEL_TYPE: model_type,
            _CounterfactualExpV2SchemaConstants.DESIRED_CLASS: desired_class,
            _CounterfactualExpV2SchemaConstants.DESIRED_RANGE: desired_range,
            _CounterfactualExpV1SchemaConstants.METADATA: self.metadata
        }
    else:
        raise UserConfigValidationException(
            "Unsupported serialization version {}".format(
                serialization_version))

    CounterfactualExplanations._check_cf_exp_output_against_json_schema(
        entire_dict, version=serialization_version)
    return json.dumps(entire_dict)
def __init__(self, params):
    """Init method

    Validates ``params``, records feature metadata and normalises column
    dtypes (categorical features become string-valued 'category' columns,
    continuous features become float32 or int32). All of this is done on a
    private copy of the dataframe, so the caller's dataframe is not modified.

    :param dataframe: The train dataframe used by explainer method.
    :param continuous_features: List of names of continuous features. The remaining features are categorical features.
    :param outcome_name: Outcome feature name.
    :param permitted_range (optional): Dictionary with feature names as keys and permitted range in list as values.
                                       Defaults to the range inferred from training data.
    :param continuous_features_precision (optional): Dictionary with feature names as keys and precisions as values.
    :param data_name (optional): Dataset name

    :raises ValueError: If 'dataframe' is not a pandas dataframe,
        'continuous_features' is not a list or 'outcome_name' is not a string.
    :raises UserConfigValidationException: If the outcome, a continuous
        feature, a precision key or a permitted-range key is not a column.
    """
    if isinstance(params['dataframe'], pd.DataFrame):
        # Copy so the dtype conversions below never touch the caller's frame.
        self.data_df = params['dataframe'].copy()
    else:
        raise ValueError("should provide a pandas dataframe")

    if isinstance(params['continuous_features'], list):
        self.continuous_feature_names = params['continuous_features']
    else:
        raise ValueError(
            "should provide the name(s) of continuous features in the data as a list")

    if isinstance(params['outcome_name'], str):
        self.outcome_name = params['outcome_name']
    else:
        raise ValueError(
            "should provide the name of outcome feature as a string")

    column_names = self.data_df.columns.tolist()
    if params['outcome_name'] not in column_names:
        raise UserConfigValidationException(
            "outcome_name {0} not found in {1}".format(
                params['outcome_name'], ','.join(column_names)))

    self.feature_names = [
        name for name in column_names if name != self.outcome_name]
    self.number_of_features = len(self.feature_names)

    if not set(self.continuous_feature_names).issubset(self.feature_names):
        raise UserConfigValidationException(
            "continuous_features contains some feature names which are not part of columns in dataframe")

    non_categorical = self.continuous_feature_names + [self.outcome_name]
    self.categorical_feature_names = [
        name for name in column_names if name not in non_categorical]

    self.continuous_feature_indexes = [
        self.data_df.columns.get_loc(name)
        for name in self.continuous_feature_names if name in self.data_df]

    self.categorical_feature_indexes = [
        self.data_df.columns.get_loc(name)
        for name in self.categorical_feature_names if name in self.data_df]

    if 'continuous_features_precision' in params:
        self.continuous_features_precision = params['continuous_features_precision']
        for precision_feature_name in self.continuous_features_precision:
            if precision_feature_name not in self.feature_names:
                raise UserConfigValidationException(
                    "continuous_features_precision contains some feature names which are not part of columns in dataframe")
    else:
        self.continuous_features_precision = None

    if self.categorical_feature_names:
        # Stringify first so the resulting categories are always str values.
        for feature in self.categorical_feature_names:
            self.data_df[feature] = self.data_df[feature].apply(str)
        self.data_df[self.categorical_feature_names] = self.data_df[
            self.categorical_feature_names].astype('category')

    for feature in self.continuous_feature_names:
        if self.get_data_type(feature) == 'float':
            self.data_df[feature] = self.data_df[feature].astype(np.float32)
        else:
            self.data_df[feature] = self.data_df[feature].astype(np.int32)

    # should move the below snippet to gradient based dice interfaces
    # self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
    # self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
    #     ) if x not in np.array([self.outcome_name])]

    # should move the below snippet to model agnostic dice interfaces
    # # Initializing a label encoder to obtain label-encoded values for categorical variables
    # self.labelencoder = {}
    #
    # self.label_encoded_data = self.data_df.copy()
    #
    # for column in self.categorical_feature_names:
    #     self.labelencoder[column] = LabelEncoder()
    #     self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])

    input_permitted_range = None
    if 'permitted_range' in params:
        input_permitted_range = params['permitted_range']
        for range_feature_name in input_permitted_range:
            if range_feature_name not in self.feature_names:
                raise UserConfigValidationException(
                    "permitted_range contains some feature names which are not part of columns in dataframe")
    # Only the first element of the returned pair is kept.
    self.permitted_range, _ = self.get_features_range(input_permitted_range)

    # should move the below snippet to model agnostic dice interfaces
    # self.max_range = -np.inf
    # for feature in self.continuous_feature_names:
    #     self.max_range = max(self.max_range, self.permitted_range[feature][1])

    self.data_name = params['data_name'] if 'data_name' in params else 'mydata'
def __init__(self, params):
    """Init method

    Delegates the per-parameter checks to the ``_validate_and_set_*`` helpers,
    derives the feature name lists and normalises column dtypes in
    ``self.data_df`` (categorical features become string-valued 'category'
    columns, continuous features become float32 or int32).

    :param dataframe: The train dataframe used by explainer method.
    :param continuous_features: List of names of continuous features. The remaining features are categorical features.
    :param outcome_name: Outcome feature name.
    :param permitted_range (optional): Dictionary with feature names as keys and permitted range in list as values.
                                       Defaults to the range inferred from training data.
    :param continuous_features_precision (optional): Dictionary with feature names as keys and precisions as values.
    :param data_name (optional): Dataset name

    :raises UserConfigValidationException: If a continuous feature name is not
        a non-outcome column of the dataframe.
    """
    self._validate_and_set_outcome_name(params=params)
    self._validate_and_set_dataframe(params=params)
    self._validate_and_set_continuous_features(params=params)

    self.feature_names = [
        column for column in self.data_df.columns.tolist()
        if column != self.outcome_name]
    self.number_of_features = len(self.feature_names)

    unknown_continuous = set(self.continuous_feature_names) - set(self.feature_names)
    if unknown_continuous:
        raise UserConfigValidationException(
            "continuous_features contains some feature names which are not part of columns in dataframe")

    # Everything that is neither continuous nor the outcome is categorical.
    self.categorical_feature_names = [
        column for column in self.data_df.columns.tolist()
        if column not in self.continuous_feature_names + [self.outcome_name]]
    self.categorical_feature_indexes = [
        self.data_df.columns.get_loc(column)
        for column in self.categorical_feature_names if column in self.data_df]

    self._validate_and_set_continuous_features_precision(params=params)

    if len(self.categorical_feature_names) > 0:
        for categorical_feature in self.categorical_feature_names:
            self.data_df[categorical_feature] = self.data_df[categorical_feature].apply(str)
        self.data_df[self.categorical_feature_names] = self.data_df[
            self.categorical_feature_names].astype('category')

    for continuous_feature in self.continuous_feature_names:
        is_float = self.get_data_type(continuous_feature) == 'float'
        target_dtype = np.float32 if is_float else np.int32
        self.data_df[continuous_feature] = self.data_df[continuous_feature].astype(target_dtype)

    # should move the below snippet to gradient based dice interfaces
    # self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
    # self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
    #     ) if x not in np.array([self.outcome_name])]

    # should move the below snippet to model agnostic dice interfaces
    # # Initializing a label encoder to obtain label-encoded values for categorical variables
    # self.labelencoder = {}
    #
    # self.label_encoded_data = self.data_df.copy()
    #
    # for column in self.categorical_feature_names:
    #     self.labelencoder[column] = LabelEncoder()
    #     self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])

    self._validate_and_set_permitted_range(params=params)

    # should move the below snippet to model agnostic dice interfaces
    # self.max_range = -np.inf
    # for feature in self.continuous_feature_names:
    #     self.max_range = max(self.max_range, self.permitted_range[feature][1])

    self._validate_and_set_data_name(params=params)
def from_json(json_str):
    """Deserialize json string to a CounterfactualExplanations object.

    Input without a metadata entry is returned as the plain decoded object.
    For V1 every embedded example string is deserialized directly. For V2 each
    example is re-assembled into its own json string from the combined lists
    plus the shared fields, and the flattened importances are turned back into
    dictionaries keyed by feature name.

    :param json_str: The json string to decode.

    :returns: A CounterfactualExplanations object, or the decoded json object
              when it carries no metadata. A supported version that is neither
              V1 nor V2 yields None.
    :raises UserConfigValidationException: If the metadata has no version or an
        unsupported one.
    """
    json_dict = json.loads(json_str)
    if _CommonSchemaConstants.METADATA not in json_dict:
        return json_dict

    version = json_dict[_CommonSchemaConstants.METADATA].get('version')
    if version is None:
        raise UserConfigValidationException("No version field in the json input")
    if not _check_supported_json_output_versions(version):
        raise UserConfigValidationException("Incompatible version {} found in json input".format(version))

    if version == _SchemaVersions.V1:
        CounterfactualExplanations._check_cf_exp_output_against_json_schema(
            json_dict, version=version)
        cf_examples_list = [
            CounterfactualExamples.from_json(serialized_examples)
            for serialized_examples in json_dict[_CounterfactualExpV1SchemaConstants.CF_EXAMPLES_LIST]
        ]
        return CounterfactualExplanations(
            cf_examples_list=cf_examples_list,
            local_importance=json_dict[_CounterfactualExpV1SchemaConstants.LOCAL_IMPORTANCE],
            summary_importance=json_dict[_CounterfactualExpV1SchemaConstants.SUMMARY_IMPORTANCE],
            version=version)

    if version == _SchemaVersions.V2:
        CounterfactualExplanations._check_cf_exp_output_against_json_schema(
            json_dict, version=version)

        cf_examples_list = []
        for position, final_cfs in enumerate(json_dict[_CounterfactualExpV2SchemaConstants.CFS_LIST]):
            # We need to save the json again since we need to recompose the
            # counterfactual example.
            recomposed = json.dumps({
                _DiverseCFV2SchemaConstants.FIANL_CFS_LIST: final_cfs,
                _DiverseCFV2SchemaConstants.TEST_INSTANCE_LIST:
                    json_dict[_CounterfactualExpV2SchemaConstants.TEST_DATA][position],
                _DiverseCFV2SchemaConstants.DATA_INTERFACE:
                    json_dict[_CounterfactualExpV2SchemaConstants.DATA_INTERFACE],
                _DiverseCFV2SchemaConstants.DESIRED_CLASS:
                    json_dict[_CounterfactualExpV2SchemaConstants.DESIRED_CLASS],
                _DiverseCFV2SchemaConstants.DESIRED_RANGE:
                    json_dict[_CounterfactualExpV2SchemaConstants.DESIRED_RANGE],
                _DiverseCFV2SchemaConstants.MODEL_TYPE:
                    json_dict[_CounterfactualExpV2SchemaConstants.MODEL_TYPE],
                _DiverseCFV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET:
                    json_dict[_CounterfactualExpV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET]
            })
            cf_examples_list.append(CounterfactualExamples.from_json(recomposed))

        local_importance_list = None
        local_matrix = json_dict[_CounterfactualExpV2SchemaConstants.LOCAL_IMPORTANCE]
        if local_matrix is not None:
            local_importance_list = []
            for importance_row in local_matrix:
                feature_names = json_dict[_CounterfactualExpV2SchemaConstants.FEATURE_NAMES]
                # Values are positional: column i belongs to feature_names[i].
                local_importance_list.append({
                    feature_names[column]: score
                    for column, score in enumerate(importance_row)
                })

        summary_importance_dict = None
        summary_scores = json_dict[_CounterfactualExpV2SchemaConstants.SUMMARY_IMPORTANCE]
        if summary_scores is not None:
            feature_names = json_dict[_CounterfactualExpV2SchemaConstants.FEATURE_NAMES]
            summary_importance_dict = {
                feature_names[column]: score
                for column, score in enumerate(summary_scores)
            }

        return CounterfactualExplanations(
            cf_examples_list=cf_examples_list,
            local_importance=local_importance_list,
            summary_importance=summary_importance_dict,
            version=version)

    # A supported version that is neither V1 nor V2 produces no object.
    return None