Example 1
    def infer_target_cfs_class(self, desired_class_input, original_pred,
                               num_output_nodes):
        """Infer the target class for generating CFs. Only called when
        model_type == "classifier".
        TODO: Add support for opposite desired class in multiclass.
              Downstream methods should decide whether it is allowed or not.
        """
        if desired_class_input == "opposite":
            if num_output_nodes == 2:
                original_pred_1 = np.argmax(original_pred)
                target_class = int(1 - original_pred_1)
                return target_class
            elif num_output_nodes > 2:
                raise UserConfigValidationException(
                    "Desired class cannot be opposite if the number of classes is more than 2.")
        elif isinstance(desired_class_input, int):
            if 0 <= desired_class_input < num_output_nodes:
                target_class = desired_class_input
                return target_class
            else:
                raise UserConfigValidationException(
                    "Desired class not present in training data!")
        else:
            raise UserConfigValidationException(
                "The target class for {0} could not be identified".format(
                    desired_class_input))
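
A minimal standalone sketch of the binary "opposite" inference above (names here are illustrative, not from the library):

import numpy as np

def opposite_class(original_pred):
    # mirrors the binary branch: flip the argmax of the score vector
    return int(1 - np.argmax(original_pred))

print(opposite_class([0.8, 0.2]))  # 1: the original prediction was class 0
print(opposite_class([0.1, 0.9]))  # 0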
Example 2
def _validate_serialization_version(version):
    if version is None:
        raise UserConfigValidationException(
            "No version field in the json input")
    elif not _check_supported_json_output_versions(version):
        raise UserConfigValidationException(
            "Incompatible version {} found in json input".format(version))
Example 3
def decide(backend):
    """Decides the Model implementation type."""

    if backend == BackEndTypes.Sklearn:  # scikit-learn models use the generic BaseModel wrapper
        from dice_ml.model_interfaces.base_model import BaseModel
        return BaseModel

    elif backend in (BackEndTypes.Tensorflow1, BackEndTypes.Tensorflow2):  # TensorFlow 1.x or 2.x backend
        try:
            import tensorflow
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import tensorflow. Please install tensorflow")
        from dice_ml.model_interfaces.keras_tensorflow_model import KerasTensorFlowModel
        return KerasTensorFlowModel

    elif backend == BackEndTypes.Pytorch:  # PyTorch backend
        try:
            import torch
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import torch. Please install torch")
        from dice_ml.model_interfaces.pytorch_model import PyTorchModel
        return PyTorchModel

    else:  # all other implementations and frameworks
        backend_model = backend['model']
        module_name, class_name = backend_model.split('.')
        module = __import__("dice_ml.model_interfaces." + module_name,
                            fromlist=[class_name])
        return getattr(module, class_name)
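
A hedged usage sketch of this dispatcher (assumes dice_ml is installed; the constants import path is an assumption):

from dice_ml.constants import BackEndTypes  # import path assumed

model_class = decide(BackEndTypes.Sklearn)
print(model_class.__name__)  # BaseModel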
Example 4
    def local_feature_importance(self, query_instances, cf_examples_list=None,
                                 total_CFs=10,
                                 desired_class="opposite", desired_range=None, permitted_range=None,
                                 features_to_vary="all", stopping_threshold=0.5,
                                 posthoc_sparsity_param=0.1, posthoc_sparsity_algorithm="linear",
                                 **kwargs):
        """ Estimate local feature importance scores for the given inputs.

        :param query_instances: A list of inputs for which to compute the
                                feature importances. These can be provided as a dataframe.
        :param cf_examples_list: If precomputed, a list of counterfactual
                                 examples for every input point. If cf_examples_list is provided, then
                                 all the following parameters are ignored.
        :param total_CFs: The number of counterfactuals to generate per input
                          (default is 10)
        :param other_parameters: These are the same as the
                                 generate_counterfactuals method.

        :returns: An object of class CounterfactualExplanations that includes
                  the list of counterfactuals per input, local feature importances per
                  input, and the global feature importance summarized over all inputs.
        """
        self._validate_counterfactual_configuration(
            query_instances=query_instances,
            total_CFs=total_CFs,
            desired_class=desired_class,
            desired_range=desired_range,
            permitted_range=permitted_range, features_to_vary=features_to_vary,
            stopping_threshold=stopping_threshold, posthoc_sparsity_param=posthoc_sparsity_param,
            posthoc_sparsity_algorithm=posthoc_sparsity_algorithm,
            kwargs=kwargs
        )
        if cf_examples_list is not None:
            if any(cf_examples.final_cfs_df is None or len(cf_examples.final_cfs_df) < 10
                   for cf_examples in cf_examples_list):
                raise UserConfigValidationException(
                    "The number of counterfactuals generated per query instance should be "
                    "greater than or equal to 10 to compute feature importance for all query points")
        elif total_CFs < 10:
            raise UserConfigValidationException(
                "The number of counterfactuals requested per "
                "query instance should be greater than or equal to 10 "
                "to compute feature importance for all query points")
        importances = self.feature_importance(
            query_instances,
            cf_examples_list=cf_examples_list,
            total_CFs=total_CFs,
            local_importance=True,
            global_importance=False,
            desired_class=desired_class,
            desired_range=desired_range,
            permitted_range=permitted_range,
            features_to_vary=features_to_vary,
            stopping_threshold=stopping_threshold,
            posthoc_sparsity_param=posthoc_sparsity_param,
            posthoc_sparsity_algorithm=posthoc_sparsity_algorithm,
            **kwargs)
        return importances
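
A hedged usage sketch, assuming `exp` is an already-built Dice explainer and `query_df` holds the query instances (both names are hypothetical):

# total_CFs must be >= 10 to pass the validation above
imp = exp.local_feature_importance(query_df, total_CFs=10,
                                   desired_class="opposite")
print(imp.local_importance)  # one {feature: importance} dict per query instance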
Example 5
    def __init__(self, model=None, model_path='', backend=BackEndTypes.Tensorflow1, model_type=ModelTypes.Classifier,
                 func=None, kw_args=None):
        """Init method

        :param model: trained ML model.
        :param model_path: path to trained ML model.
        :param backend: "TF1" ("TF2") for TensorFlow 1.0 (2.0), "PYT" for PyTorch implementations,
                        "sklearn" for Scikit-Learn implementations of standard
                        DiCE (https://arxiv.org/pdf/1905.07697.pdf). For all other frameworks and
                        implementations, provide a dictionary with "model" and "explainer" as keys,
                        and include module and class names as values in the form module_name.class_name.
                        For instance, if there is a model interface class "XGBoostModel" in module "xgboost_model.py"
                        inside the subpackage dice_ml.model_interfaces, and a DiCE interface class "DiceXGBoost"
                        in module "dice_xgboost" inside dice_ml.explainer_interfaces, then the backend parameter
                        should be {"model": "xgboost_model.XGBoostModel", "explainer": "dice_xgboost.DiceXGBoost"}.
        :param func: function transformation required for ML model. If func is None, then func will be the identity function.
        :param kw_args: Dictionary of additional keyword arguments to pass to func. DiCE's data_interface is appended
                        to the dictionary of kw_args, by default.
        """
        if backend not in BackEndTypes.ALL:
            warnings.warn('{0} backend not in supported backends {1}'.format(
                backend, ','.join(BackEndTypes.ALL))
            )

        if model_type not in ModelTypes.ALL:
            raise UserConfigValidationException('{0} model type not in supported model types {1}'.format(
                model_type, ','.join(ModelTypes.ALL))
            )

        self.model_type = model_type
        if model is None and model_path == '':
            raise ValueError("should provide either a trained model or the path to a model")
        else:
            self.decide_implementation_type(model, model_path, backend, func, kw_args)
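
A hedged construction sketch, assuming a fitted scikit-learn classifier `clf` (hypothetical name):

import dice_ml

m = dice_ml.Model(model=clf, backend="sklearn", model_type="classifier")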
Example 6
    def check_features_to_vary(self, features_to_vary):
        if features_to_vary is not None and features_to_vary != 'all':
            not_training_features = set(features_to_vary) - set(
                self.feature_names)
            if len(not_training_features) > 0:
                raise UserConfigValidationException(
                    "Got features {0} which are not present in training data".
                    format(not_training_features))
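
The check reduces to a set difference; a minimal illustration:

feature_names = ["age", "income", "education"]  # training features
not_training_features = set(["age", "salary"]) - set(feature_names)
print(not_training_features)  # {'salary'} -> would raise UserConfigValidationException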
Example 7
    def generate_counterfactuals(self,
                                 query_instances,
                                 total_CFs,
                                 desired_class="opposite",
                                 desired_range=None,
                                 permitted_range=None,
                                 features_to_vary="all",
                                 stopping_threshold=0.5,
                                 posthoc_sparsity_param=0.1,
                                 posthoc_sparsity_algorithm="linear",
                                 verbose=False,
                                 **kwargs):
        """General method for generating counterfactuals.

        :param query_instances: Input point(s) for which counterfactuals are to be generated. This can be a dataframe with one or more rows.
        :param total_CFs: Total number of counterfactuals required.

        :param desired_class: Desired counterfactual class - can take 0 or 1. Default value is "opposite" to the outcome class of query_instance for binary classification.
        :param desired_range: For regression problems. Contains the outcome range to generate counterfactuals in.
        :param permitted_range: Dictionary with feature names as keys and permitted range in list as values. Defaults to the range inferred from training data. If None, uses the parameters initialized in data_interface.
        :param features_to_vary: Either a string "all" or a list of feature names to vary.
        :param stopping_threshold: Minimum threshold for counterfactuals target class probability.
        :param posthoc_sparsity_param: Parameter for the post-hoc operation on continuous features to enhance sparsity.
        :param posthoc_sparsity_algorithm: Perform either linear or binary search. Takes "linear" or "binary". Prefer binary search when a feature range is large (for instance, income varying from 10k to 1000k) and only if the features share a monotonic relationship with predicted outcome in the model.
        :param verbose: Whether to output detailed messages.
        :param sample_size: Sampling size
        :param random_seed: Random seed for reproducibility
        :param kwargs: Other parameters accepted by specific explanation method

        :returns: A CounterfactualExplanations object that contains the list of
                  counterfactual examples per query_instance as one of its attributes.
        """
        if total_CFs <= 0:
            raise UserConfigValidationException(
                "The number of counterfactuals generated per query instance (total_CFs) should be a positive integer."
            )
        cf_examples_arr = []
        query_instances_list = []
        if isinstance(query_instances, pd.DataFrame):
            for ix in range(query_instances.shape[0]):
                query_instances_list.append(query_instances[ix:(ix + 1)])
        elif isinstance(query_instances, Iterable):
            query_instances_list = query_instances
        for query_instance in tqdm(query_instances_list):
            res = self._generate_counterfactuals(
                query_instance,
                total_CFs,
                desired_class=desired_class,
                desired_range=desired_range,
                permitted_range=permitted_range,
                features_to_vary=features_to_vary,
                stopping_threshold=stopping_threshold,
                posthoc_sparsity_param=posthoc_sparsity_param,
                posthoc_sparsity_algorithm=posthoc_sparsity_algorithm,
                verbose=verbose,
                **kwargs)
            cf_examples_arr.append(res)
        return CounterfactualExplanations(cf_examples_list=cf_examples_arr)
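
A hedged end-to-end sketch; `d`, `m`, and `query_df` are Data, Model, and query-instance objects assumed to be built as in the other examples:

import dice_ml

exp = dice_ml.Dice(d, m, method="random")
cf = exp.generate_counterfactuals(query_df, total_CFs=4,
                                  desired_class="opposite")
cf.visualize_as_dataframe(show_only_changes=True)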
Example 8
    def check_permitted_range(self, permitted_range):
        if permitted_range is not None:
            permitted_range_features = list(permitted_range)
            not_training_features = set(permitted_range_features) - set(
                self.feature_names)
            if len(not_training_features) > 0:
                raise UserConfigValidationException(
                    "Got features {0} which are not present in training data".
                    format(not_training_features))

            for feature in permitted_range_features:
                if feature in self.categorical_feature_names:
                    train_categories = self.permitted_range[feature]
                    for test_category in permitted_range[feature]:
                        if test_category not in train_categories:
                            raise UserConfigValidationException(
                                'The category {0} does not occur in the training data for feature {1}.'
                                ' Allowed categories are {2}'.format(
                                    test_category, feature, train_categories))
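
Per the checks above, permitted_range maps continuous features to [min, max] and categorical features to the allowed categories (illustrative values):

permitted_range = {
    "age": [20, 60],                       # continuous: [min, max]
    "education": ["Bachelors", "Masters"]  # categorical: allowed categories
}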
Example 9
    def _check_any_counterfactuals_computed(self, cf_examples_arr):
        """Check if any counterfactuals were generated for any query point."""
        no_cf_generated = True
        for cf_examples in cf_examples_arr:
            if cf_examples.final_cfs_df is not None and len(cf_examples.final_cfs_df) > 0:
                no_cf_generated = False
                break
        if no_cf_generated:
            raise UserConfigValidationException(
                "No counterfactuals found for any of the query points! Kindly check your configuration.")
Example 10
    def decide_implementation_type(self, data_interface, model_interface,
                                   method, **kwargs):
        """Decides the DiCE implementation type."""
        if model_interface.backend == BackEndTypes.Sklearn:
            if method == SamplingStrategy.KdTree and isinstance(
                    data_interface, PrivateData):
                raise UserConfigValidationException(
                    'Private data interface is not supported with sklearn kdtree explainer'
                    ' since kdtree explainer needs access to entire training data')
        # Rebind this instance to the concrete explainer class chosen by
        # decide(), then run that class's initializer on the same object.
        self.__class__ = decide(model_interface, method)
        self.__init__(data_interface, model_interface, **kwargs)
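
The __class__ reassignment is a dispatch idiom: the instance is rebound to the concrete class and re-initialized in place. A self-contained sketch with toy classes (not from the library):

class DiceRandomToy:
    def __init__(self):
        self.method = "random"

class DiceGeneticToy:
    def __init__(self):
        self.method = "genetic"

class ExplainerDispatcher:
    def __init__(self, method):
        # rebind this instance to the chosen implementation, then run
        # the new class's initializer on the same object
        self.__class__ = {"random": DiceRandomToy,
                          "genetic": DiceGeneticToy}[method]
        self.__init__()

print(type(ExplainerDispatcher("genetic")).__name__)  # DiceGeneticToy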
Example 11
def decide(backend):
    """Decides the Model implementation type.

    To add new implementations of Model, add the class in the model_interfaces
    subpackage and import-and-return it in a new elif branch, as in the method
    below.
    """
    if backend == BackEndTypes.Sklearn:
        # scikit-learn models are wrapped by the generic BaseModel interface
        from dice_ml.model_interfaces.base_model import BaseModel
        return BaseModel

    elif backend in (BackEndTypes.Tensorflow1, BackEndTypes.Tensorflow2):
        # Tensorflow 1 or 2 backend
        try:
            import tensorflow  # noqa: F401
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import tensorflow. Please install tensorflow")
        from dice_ml.model_interfaces.keras_tensorflow_model import KerasTensorFlowModel
        return KerasTensorFlowModel

    elif backend == BackEndTypes.Pytorch:
        # PyTorch backend
        try:
            import torch  # noqa: F401
        except ImportError:
            raise UserConfigValidationException(
                "Unable to import torch. Please install torch from https://pytorch.org/"
            )
        from dice_ml.model_interfaces.pytorch_model import PyTorchModel
        return PyTorchModel

    else:
        # all other implementations and frameworks
        backend_model = backend['model']
        module_name, class_name = backend_model.split('.')
        module = __import__("dice_ml.model_interfaces." + module_name,
                            fromlist=[class_name])
        return getattr(module, class_name)
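
The fallback branch resolves a "module_name.class_name" string with __import__ and fromlist; the same mechanics demonstrated on the standard library instead of dice_ml:

spec = "path.join"  # plays the role of backend['model'] = "module_name.class_name"
module_name, attr_name = spec.split('.')
module = __import__("os." + module_name, fromlist=[attr_name])
print(getattr(module, attr_name))  # <function join at ...>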
Example 12
def decide(model_interface, method):
    """Decides DiCE implementation type.

    To add new implementations of DiCE, add the class in the explainer_interfaces
    subpackage and import-and-return it in a new elif branch, as in the method
    below.
    """
    if model_interface.backend == BackEndTypes.Sklearn:
        if method == SamplingStrategy.Random:
            # random sampling of CFs
            from dice_ml.explainer_interfaces.dice_random import DiceRandom
            return DiceRandom
        elif method == SamplingStrategy.Genetic:
            from dice_ml.explainer_interfaces.dice_genetic import DiceGenetic
            return DiceGenetic
        elif method == SamplingStrategy.KdTree:
            from dice_ml.explainer_interfaces.dice_KD import DiceKD
            return DiceKD
        else:
            raise UserConfigValidationException(
                "Unsupported sample strategy {0} provided. "
                "Please choose one of {1}, {2} or {3}".format(
                    method, SamplingStrategy.Random, SamplingStrategy.Genetic,
                    SamplingStrategy.KdTree))

    elif model_interface.backend == BackEndTypes.Tensorflow1:
        # pretrained Keras Sequential model with Tensorflow 1.x backend
        from dice_ml.explainer_interfaces.dice_tensorflow1 import \
            DiceTensorFlow1
        return DiceTensorFlow1

    elif model_interface.backend == BackEndTypes.Tensorflow2:
        # pretrained Keras Sequential model with Tensorflow 2.x backend
        from dice_ml.explainer_interfaces.dice_tensorflow2 import \
            DiceTensorFlow2
        return DiceTensorFlow2

    elif model_interface.backend == BackEndTypes.Pytorch:
        # PyTorch backend
        from dice_ml.explainer_interfaces.dice_pytorch import DicePyTorch
        return DicePyTorch

    else:
        # all other backends
        backend_dice = model_interface.backend['explainer']
        module_name, class_name = backend_dice.split('.')
        module = __import__("dice_ml.explainer_interfaces." + module_name,
                            fromlist=[class_name])
        return getattr(module, class_name)
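
A hedged dispatch sketch; `model_interface` is an assumed sklearn-backed model interface:

from dice_ml.constants import SamplingStrategy  # import path assumed

explainer_class = decide(model_interface, SamplingStrategy.KdTree)
print(explainer_class.__name__)  # DiceKD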
Example 13
def as_counterfactual_explanations(json_dict):
    """ Helper function to convert json string to a CounterfactualExplanations
    object.
    """
    if 'metadata' in json_dict:
        version = json_dict['metadata'].get('version')
        if version is None:
            raise UserConfigValidationException(
                "No version field in the json input")
        elif not _check_supported_json_output_versions(version):
            raise UserConfigValidationException(
                "Incompatible version {} found in json input".format(version))

        cf_examples_list = []
        for cf_examples_str in json_dict["cf_examples_list"]:
            cf_examples_list.append(
                CounterfactualExamples.from_json(cf_examples_str))

        return CounterfactualExplanations(
            cf_examples_list,
            local_importance=json_dict["local_importance"],
            summary_importance=json_dict["summary_importance"])
    else:
        return json_dict
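
The dict-in/dict-out shape suggests this helper is meant as a json.loads object_hook, upgrading any sub-dict that carries metadata and passing everything else through (a hedged sketch, not confirmed by this excerpt; `json_str` is assumed):

import json

explanations = json.loads(json_str, object_hook=as_counterfactual_explanations)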
Example 14
    def _validate_and_set_dataframe(self, params):
        """Validate and set the dataframe."""
        if 'dataframe' not in params:
            raise ValueError("dataframe not found in params")

        if isinstance(params['dataframe'], pd.DataFrame):
            self.data_df = params['dataframe'].copy()
        else:
            raise ValueError("should provide a pandas dataframe")

        if 'outcome_name' in params and params[
                'outcome_name'] not in self.data_df.columns.tolist():
            raise UserConfigValidationException(
                "outcome_name {0} not found in {1}".format(
                    params['outcome_name'],
                    ','.join(self.data_df.columns.tolist())))
Example 15
    def _validate_and_set_permitted_range(self, params):
        """Validate and set the dictionary of permitted ranges for continuous features."""
        input_permitted_range = None
        if 'permitted_range' in params:
            input_permitted_range = params['permitted_range']

            if not hasattr(self, 'feature_names'):
                raise SystemException(
                    'Feature names not correctly set in public data interface')

            for input_permitted_range_feature_name in input_permitted_range:
                if input_permitted_range_feature_name not in self.feature_names:
                    raise UserConfigValidationException(
                        "permitted_range contains some feature names which are not part of columns in dataframe"
                    )
        self.permitted_range, _ = self.get_features_range(
            input_permitted_range)
Example 16
    def _validate_and_set_continuous_features_precision(self, params):
        """Validate and set the dictionary of precision for continuous features."""
        if 'continuous_features_precision' in params:
            self.continuous_features_precision = params[
                'continuous_features_precision']

            if not hasattr(self, 'feature_names'):
                raise SystemException(
                    'Feature names not correctly set in public data interface')

            for continuous_features_precision_feature_name in self.continuous_features_precision:
                if continuous_features_precision_feature_name not in self.feature_names:
                    raise UserConfigValidationException(
                        "continuous_features_precision contains some feature names which are not part of columns in dataframe"
                    )
        else:
            self.continuous_features_precision = None
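
The validators in Examples 14-16 all consume the same params dictionary; an illustrative shape (values are hypothetical):

params = {
    "dataframe": train_df,                        # pandas DataFrame
    "outcome_name": "income",
    "continuous_features": ["age", "hours_per_week"],
    "permitted_range": {"age": [18, 70]},         # optional
    "continuous_features_precision": {"age": 0},  # optional
}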
Example 17
    def _validate_counterfactual_configuration(
            self, query_instances, total_CFs,
            desired_class="opposite", desired_range=None,
            permitted_range=None, features_to_vary="all",
            stopping_threshold=0.5, posthoc_sparsity_param=0.1,
            posthoc_sparsity_algorithm="linear", verbose=False, **kwargs):

        if total_CFs <= 0:
            raise UserConfigValidationException(
                "The number of counterfactuals generated per query instance (total_CFs) should be a positive integer.")

        if posthoc_sparsity_algorithm not in _PostHocSparsityTypes.ALL:
            raise UserConfigValidationException(
                'The posthoc_sparsity_algorithm should be {0} and not {1}'.format(
                    ' or '.join(_PostHocSparsityTypes.ALL), posthoc_sparsity_algorithm)
                )

        if stopping_threshold < 0.0 or stopping_threshold > 1.0:
            raise UserConfigValidationException('The stopping_threshold should lie between {0} and {1}'.format(
                str(0.0), str(1.0)))

        if posthoc_sparsity_param is not None and (posthoc_sparsity_param < 0.0 or posthoc_sparsity_param > 1.0):
            raise UserConfigValidationException('The posthoc_sparsity_param should lie between {0} and {1}'.format(
                str(0.0), str(1.0)))

        if self.model is not None and self.model.model_type == ModelTypes.Classifier:
            if desired_range is not None:
                raise UserConfigValidationException(
                    'The desired_range parameter should not be set for classification task')

        if self.model is not None and self.model.model_type == ModelTypes.Regressor:
            if desired_range is None:
                raise UserConfigValidationException(
                    'The desired_range parameter should be set for regression task')

        if desired_range is not None:
            if len(desired_range) != 2:
                raise UserConfigValidationException(
                    "The parameter desired_range needs to have two numbers in ascending order.")
            if desired_range[0] > desired_range[1]:
                raise UserConfigValidationException(
                    "The range provided in desired_range should be in ascending order.")
Example 18
    def to_json(self):
        """ Serialize Explanations object to json.
        """
        serialization_version = self.metadata['version']
        if serialization_version == _SchemaVersions.V1:
            cf_examples_str_list = []
            for cf_examples in self.cf_examples_list:
                cf_examples_str = cf_examples.to_json(
                    serialization_version=serialization_version)
                cf_examples_str_list.append(cf_examples_str)
            entire_dict = {
                _CounterfactualExpV1SchemaConstants.CF_EXAMPLES_LIST:
                cf_examples_str_list,
                _CounterfactualExpV1SchemaConstants.LOCAL_IMPORTANCE:
                self.local_importance,
                _CounterfactualExpV1SchemaConstants.SUMMARY_IMPORTANCE:
                self.summary_importance,
                _CounterfactualExpV1SchemaConstants.METADATA: self.metadata
            }
            CounterfactualExplanations._check_cf_exp_output_against_json_schema(
                entire_dict, version=serialization_version)
            return json.dumps(entire_dict)
        elif serialization_version == _SchemaVersions.V2:
            combined_test_instance_list = []
            combined_final_cfs_list = []
            data_interface = None
            feature_names = None
            feature_names_including_target = None
            model_type = None
            desired_class = None
            desired_range = None
            for cf_examples in self.cf_examples_list:
                cf_examples_str = cf_examples.to_json(
                    serialization_version=serialization_version)
                # We need to load the json again since we need to decompose the
                # counterfactual example into different schema fields
                serialized_cf_examples = json.loads(cf_examples_str)
                combined_test_instance_list.append(serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.TEST_INSTANCE_LIST])
                combined_final_cfs_list.append(serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.FIANL_CFS_LIST])  # sic: 'FIANL' matches the schema constant's spelling
                data_interface = serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.DATA_INTERFACE]
                feature_names = serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.FEATURE_NAMES]
                feature_names_including_target = serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET]
                model_type = serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.MODEL_TYPE]
                desired_class = serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.DESIRED_CLASS]
                desired_range = serialized_cf_examples[
                    _DiverseCFV2SchemaConstants.DESIRED_RANGE]

            local_importance_matrix = None
            if self.local_importance is not None:
                local_importance_matrix = []
                for local_importance_dict in self.local_importance:
                    local_importance_list = []
                    for feature_name in feature_names:
                        local_importance_list.append(
                            local_importance_dict.get(feature_name))
                    local_importance_matrix.append(local_importance_list)

            summary_importance_list = None
            if self.summary_importance is not None:
                summary_importance_list = []
                for feature_name in feature_names:
                    summary_importance_list.append(
                        self.summary_importance.get(feature_name))

            entire_dict = {
                _CounterfactualExpV2SchemaConstants.TEST_DATA:
                combined_test_instance_list,
                _CounterfactualExpV2SchemaConstants.CFS_LIST:
                combined_final_cfs_list,
                _CounterfactualExpV2SchemaConstants.LOCAL_IMPORTANCE:
                local_importance_matrix,
                _CounterfactualExpV2SchemaConstants.SUMMARY_IMPORTANCE:
                summary_importance_list,
                _CounterfactualExpV2SchemaConstants.DATA_INTERFACE:
                data_interface,
                _CounterfactualExpV2SchemaConstants.FEATURE_NAMES:
                feature_names,
                _CounterfactualExpV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET:
                feature_names_including_target,
                _CounterfactualExpV2SchemaConstants.MODEL_TYPE:
                model_type,
                _CounterfactualExpV2SchemaConstants.DESIRED_CLASS:
                desired_class,
                _CounterfactualExpV2SchemaConstants.DESIRED_RANGE:
                desired_range,
                _CounterfactualExpV1SchemaConstants.METADATA:
                self.metadata
            }
            CounterfactualExplanations._check_cf_exp_output_against_json_schema(
                entire_dict, version=serialization_version)
            return json.dumps(entire_dict)
        else:
            raise UserConfigValidationException(
                "Unsupported serialization version {}".format(
                    serialization_version))
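
A hedged serialization sketch; `explanations` is an assumed CounterfactualExplanations instance:

json_str = explanations.to_json()  # schema version comes from explanations.metadata['version']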
Example 19
    def __init__(self, params):
        """Init method

        :param dataframe: The train dataframe used by explainer method.
        :param continuous_features: List of names of continuous features. The remaining features are categorical features.
        :param outcome_name: Outcome feature name.
        :param permitted_range (optional): Dictionary with feature names as keys and permitted range in list as values. Defaults to the range inferred from training data.
        :param continuous_features_precision (optional): Dictionary with feature names as keys and precisions as values.
        :param data_name (optional): Dataset name

        """

        if isinstance(params['dataframe'], pd.DataFrame):
            # copy so that later dtype conversions do not mutate the caller's dataframe
            self.data_df = params['dataframe'].copy()
        else:
            raise ValueError("should provide a pandas dataframe")

        if isinstance(params['continuous_features'], list):
            self.continuous_feature_names = params['continuous_features']
        else:
            raise ValueError(
                "should provide the name(s) of continuous features in the data as a list"
            )

        if isinstance(params['outcome_name'], str):
            self.outcome_name = params['outcome_name']
        else:
            raise ValueError(
                "should provide the name of outcome feature as a string")

        if params['outcome_name'] not in self.data_df.columns.tolist():
            raise UserConfigValidationException(
                "outcome_name {0} not found in {1}".format(
                    params['outcome_name'],
                    ','.join(self.data_df.columns.tolist())))

        self.feature_names = [
            name for name in self.data_df.columns.tolist()
            if name != self.outcome_name
        ]

        self.number_of_features = len(self.feature_names)

        if len(set(self.continuous_feature_names) -
               set(self.feature_names)) != 0:
            raise UserConfigValidationException(
                "continuous_features contains some feature names which are not part of columns in dataframe"
            )

        self.categorical_feature_names = [
            name for name in self.data_df.columns.tolist()
            if name not in self.continuous_feature_names + [self.outcome_name]
        ]

        self.continuous_feature_indexes = [
            self.data_df.columns.get_loc(name)
            for name in self.continuous_feature_names if name in self.data_df
        ]

        self.categorical_feature_indexes = [
            self.data_df.columns.get_loc(name)
            for name in self.categorical_feature_names if name in self.data_df
        ]

        if 'continuous_features_precision' in params:
            self.continuous_features_precision = params[
                'continuous_features_precision']
            for continuous_features_precision_feature_name in self.continuous_features_precision:
                if continuous_features_precision_feature_name not in self.feature_names:
                    raise UserConfigValidationException(
                        "continuous_features_precision contains some feature names which are not part of columns in dataframe"
                    )
        else:
            self.continuous_features_precision = None

        if len(self.categorical_feature_names) > 0:
            for feature in self.categorical_feature_names:
                self.data_df[feature] = self.data_df[feature].apply(str)
            self.data_df[self.categorical_feature_names] = self.data_df[
                self.categorical_feature_names].astype('category')

        if len(self.continuous_feature_names) > 0:
            for feature in self.continuous_feature_names:
                if self.get_data_type(feature) == 'float':
                    self.data_df[feature] = self.data_df[feature].astype(
                        np.float32)
                else:
                    self.data_df[feature] = self.data_df[feature].astype(
                        np.int32)

        # should move the below snippet to gradient based dice interfaces
        # self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
        # self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
        #     ) if x not in np.array([self.outcome_name])]

        # should move the below snippet to model agnostic dice interfaces
        # # Initializing a label encoder to obtain label-encoded values for categorical variables
        # self.labelencoder = {}
        #
        # self.label_encoded_data = self.data_df.copy()
        #
        # for column in self.categorical_feature_names:
        #     self.labelencoder[column] = LabelEncoder()
        #     self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])

        input_permitted_range = None
        if 'permitted_range' in params:
            input_permitted_range = params['permitted_range']
            for input_permitted_range_feature_name in input_permitted_range:
                if input_permitted_range_feature_name not in self.feature_names:
                    raise UserConfigValidationException(
                        "permitted_range contains some feature names which are not part of columns in dataframe"
                    )
        self.permitted_range, feature_ranges_orig = self.get_features_range(
            input_permitted_range)

        # should move the below snippet to model agnostic dice interfaces
        # self.max_range = -np.inf
        # for feature in self.continuous_feature_names:
        #     self.max_range = max(self.max_range, self.permitted_range[feature][1])

        if 'data_name' in params:
            self.data_name = params['data_name']
        else:
            self.data_name = 'mydata'
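
A hedged construction sketch through the public wrapper (assumes a training dataframe `train_df`):

import dice_ml

d = dice_ml.Data(dataframe=train_df,
                 continuous_features=["age", "hours_per_week"],
                 outcome_name="income")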
Example 20
    def __init__(self, params):
        """Init method

        :param dataframe: The train dataframe used by explainer method.
        :param continuous_features: List of names of continuous features. The remaining features are categorical features.
        :param outcome_name: Outcome feature name.
        :param permitted_range (optional): Dictionary with feature names as keys and permitted range in list as values.
                                           Defaults to the range inferred from training data.
        :param continuous_features_precision (optional): Dictionary with feature names as keys and precisions as values.
        :param data_name (optional): Dataset name
        """
        self._validate_and_set_outcome_name(params=params)
        self._validate_and_set_dataframe(params=params)
        self._validate_and_set_continuous_features(params=params)

        self.feature_names = [
            name for name in self.data_df.columns.tolist()
            if name != self.outcome_name
        ]

        self.number_of_features = len(self.feature_names)

        if len(set(self.continuous_feature_names) -
               set(self.feature_names)) != 0:
            raise UserConfigValidationException(
                "continuous_features contains some feature names which are not part of columns in dataframe"
            )

        self.categorical_feature_names = [
            name for name in self.data_df.columns.tolist()
            if name not in self.continuous_feature_names + [self.outcome_name]
        ]

        self.categorical_feature_indexes = [
            self.data_df.columns.get_loc(name)
            for name in self.categorical_feature_names if name in self.data_df
        ]

        self._validate_and_set_continuous_features_precision(params=params)

        if len(self.categorical_feature_names) > 0:
            for feature in self.categorical_feature_names:
                self.data_df[feature] = self.data_df[feature].apply(str)
            self.data_df[self.categorical_feature_names] = self.data_df[
                self.categorical_feature_names].astype('category')

        if len(self.continuous_feature_names) > 0:
            for feature in self.continuous_feature_names:
                if self.get_data_type(feature) == 'float':
                    self.data_df[feature] = self.data_df[feature].astype(
                        np.float32)
                else:
                    self.data_df[feature] = self.data_df[feature].astype(
                        np.int32)

        # should move the below snippet to gradient based dice interfaces
        # self.one_hot_encoded_data = self.one_hot_encode_data(self.data_df)
        # self.ohe_encoded_feature_names = [x for x in self.one_hot_encoded_data.columns.tolist(
        #     ) if x not in np.array([self.outcome_name])]

        # should move the below snippet to model agnostic dice interfaces
        # # Initializing a label encoder to obtain label-encoded values for categorical variables
        # self.labelencoder = {}
        #
        # self.label_encoded_data = self.data_df.copy()
        #
        # for column in self.categorical_feature_names:
        #     self.labelencoder[column] = LabelEncoder()
        #     self.label_encoded_data[column] = self.labelencoder[column].fit_transform(self.data_df[column])

        self._validate_and_set_permitted_range(params=params)

        # should move the below snippet to model agnostic dice interfaces
        # self.max_range = -np.inf
        # for feature in self.continuous_feature_names:
        #     self.max_range = max(self.max_range, self.permitted_range[feature][1])

        self._validate_and_set_data_name(params=params)
Example 21
    @staticmethod
    def from_json(json_str):
        """ Deserialize json string to a CounterfactualExplanations object.
        """
        json_dict = json.loads(json_str)
        if _CommonSchemaConstants.METADATA in json_dict:
            version = json_dict[_CommonSchemaConstants.METADATA].get('version')
            if version is None:
                raise UserConfigValidationException("No version field in the json input")
            elif not _check_supported_json_output_versions(version):
                raise UserConfigValidationException("Incompatible version {} found in json input".format(version))

            if version == _SchemaVersions.V1:
                CounterfactualExplanations._check_cf_exp_output_against_json_schema(
                    json_dict, version=version)
                cf_examples_list = []
                for cf_examples_str in json_dict[_CounterfactualExpV1SchemaConstants.CF_EXAMPLES_LIST]:
                    cf_examples_list.append(CounterfactualExamples.from_json(cf_examples_str))

                return CounterfactualExplanations(
                        cf_examples_list=cf_examples_list,
                        local_importance=json_dict[_CounterfactualExpV1SchemaConstants.LOCAL_IMPORTANCE],
                        summary_importance=json_dict[_CounterfactualExpV1SchemaConstants.SUMMARY_IMPORTANCE],
                        version=version)
            elif version == _SchemaVersions.V2:
                CounterfactualExplanations._check_cf_exp_output_against_json_schema(
                    json_dict, version=version)
                cf_examples_list = []
                for index in range(0, len(json_dict[_CounterfactualExpV2SchemaConstants.CFS_LIST])):
                    # We need to save the json again since we need to recompose the
                    # counterfactual example.
                    cf_examples_str = json.dumps(
                        {
                            _DiverseCFV2SchemaConstants.FIANL_CFS_LIST: json_dict[
                                _CounterfactualExpV2SchemaConstants.CFS_LIST][index],
                            _DiverseCFV2SchemaConstants.TEST_INSTANCE_LIST: json_dict[
                                _CounterfactualExpV2SchemaConstants.TEST_DATA][index],
                            _DiverseCFV2SchemaConstants.DATA_INTERFACE: json_dict[
                                _CounterfactualExpV2SchemaConstants.DATA_INTERFACE],
                            _DiverseCFV2SchemaConstants.DESIRED_CLASS: json_dict[
                                _CounterfactualExpV2SchemaConstants.DESIRED_CLASS],
                            _DiverseCFV2SchemaConstants.DESIRED_RANGE: json_dict[
                                _CounterfactualExpV2SchemaConstants.DESIRED_RANGE],
                            _DiverseCFV2SchemaConstants.MODEL_TYPE: json_dict[
                                _CounterfactualExpV2SchemaConstants.MODEL_TYPE],
                            _DiverseCFV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET: json_dict[
                                _CounterfactualExpV2SchemaConstants.FEATURE_NAMES_INCLUDING_TARGET]
                        }
                    )
                    cf_examples_list.append(
                        CounterfactualExamples.from_json(cf_examples_str)
                    )

                local_importance_list = None
                if json_dict[_CounterfactualExpV2SchemaConstants.LOCAL_IMPORTANCE] is not None:
                    local_importance_list = []
                    for local_importance_instance in json_dict[
                            _CounterfactualExpV2SchemaConstants.LOCAL_IMPORTANCE]:
                        local_importance_dict = {}
                        feature_names = json_dict[_CounterfactualExpV2SchemaConstants.FEATURE_NAMES]
                        for index in range(0, len(local_importance_instance)):
                            local_importance_dict[feature_names[index]] = local_importance_instance[index]
                        local_importance_list.append(local_importance_dict)

                summary_importance_dict = None
                if json_dict[_CounterfactualExpV2SchemaConstants.SUMMARY_IMPORTANCE] is not None:
                    summary_importance_dict = {}
                    feature_names = json_dict[
                        _CounterfactualExpV2SchemaConstants.FEATURE_NAMES]
                    for index in range(0, len(json_dict[
                            _CounterfactualExpV2SchemaConstants.SUMMARY_IMPORTANCE])):
                        summary_importance_dict[feature_names[index]] = json_dict[
                            _CounterfactualExpV2SchemaConstants.SUMMARY_IMPORTANCE][index]

                return CounterfactualExplanations(
                        cf_examples_list=cf_examples_list,
                        local_importance=local_importance_list,
                        summary_importance=summary_importance_dict,
                        version=version)
        else:
            return json_dict
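
A hedged round-trip sketch pairing this from_json with the to_json method from Example 18 (`explanations` is an assumed CounterfactualExplanations object):

json_str = explanations.to_json()
restored = CounterfactualExplanations.from_json(json_str)
print(len(restored.cf_examples_list) == len(explanations.cf_examples_list))  # True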