def request_local_instance_effects(self, id, X_test):
        """Compute local causal effects for a single data point.

        :param id: The query id used to look up the causal result.
        :type id: str
        :param X_test: The data for which the local causal effects are
                       generated; must be a pandas DataFrame holding at
                       most one row.
        :type X_test: Any
        :return: An object of type CausalData with
                 causal effects for the given point.
        :rtype: CausalData
        :raises ValueError: If no causal result carries the given id.
        :raises UserConfigValidationException: If ``X_test`` is not a
            pandas DataFrame or holds more than one row.
        """
        matches = [res for res in self.get() if res.id == id]
        if not matches:
            raise ValueError(f"Failed to find causal result with ID: {id}")

        # The data is only inspected once the id lookup has succeeded.
        if not isinstance(X_test, pd.DataFrame):
            message = ('Data is of type {0} but it must be '
                       'a pandas DataFrame.')
            raise UserConfigValidationException(message.format(type(X_test)))

        row_count = X_test.shape[0]
        if row_count > 1:
            raise UserConfigValidationException(
                'Only one row of data is allowed for '
                'local causal effects.')

        return matches[0]._local_instance_effects(X_test)
Ejemplo n.º 2
0
    def _get_cohort_object(json_dict):
        """Build a Cohort object from its JSON dictionary form.

        :param json_dict: JSON dictionary containing cohort data. It must
            hold the keys ``name`` and ``cohort_filter_list``, and every
            entry of ``cohort_filter_list`` must hold ``method``, ``arg``
            and ``column``.
        :type json_dict: dict
        :return: The Cohort object.
        :rtype: Cohort
        :raises UserConfigValidationException: If a required field is
            missing or ``cohort_filter_list`` is not a list.
        """
        required_cohort_fields = ("name", "cohort_filter_list")
        required_filter_fields = ("method", "arg", "column")

        for required in required_cohort_fields:
            if required in json_dict:
                continue
            raise UserConfigValidationException(
                "No {0} field found for cohort deserialization".format(
                    required))

        serialized_filters = json_dict['cohort_filter_list']
        if not isinstance(serialized_filters, list):
            raise UserConfigValidationException(
                "Field cohort_filter_list not of type list for "
                "cohort deserialization")

        cohort = Cohort(json_dict['name'])
        for entry in serialized_filters:
            # Every required key is checked before the filter is built.
            for required in required_filter_fields:
                if required in entry:
                    continue
                raise UserConfigValidationException(
                    "No {0} field found for cohort filter "
                    "deserialization".format(required))

            cohort.add_cohort_filter(
                cohort_filter=CohortFilter(method=entry['method'],
                                           arg=entry['arg'],
                                           column=entry['column']))
        return cohort
Ejemplo n.º 3
0
    def _add_counterfactual_config(self, new_counterfactual_config):
        """Validate a counterfactual configuration and register it.

        :param new_counterfactual_config: The configuration to check
            against the task type and to append to the list of
            counterfactual configurations.
        :raises UserConfigValidationException: If ``desired_class`` is
            missing or invalid for classification, or ``desired_range``
            is missing for regression.
        :raises DuplicateManagerConfigException: If ``is_duplicate``
            reports the configuration as already present.
        """
        if self._task_type == ModelTask.CLASSIFICATION:
            desired_class = new_counterfactual_config.desired_class
            if desired_class is None:
                raise UserConfigValidationException(
                    'The desired_class attribute should be either \'{0}\''
                    ' or the class value for classification scenarios.'.format(
                        CounterfactualConstants.OPPOSITE))

            # More than two distinct target values means multiclass.
            target_values = self._train[self._target_column].values
            distinct_count = len(np.unique(target_values))
            if distinct_count > 2 and \
                    desired_class == CounterfactualConstants.OPPOSITE:
                raise UserConfigValidationException(
                    'The desired_class attribute should not be \'{0}\''
                    ' It should be the class value for multiclass'
                    ' classification scenario.'.format(
                        CounterfactualConstants.OPPOSITE))

        if self._task_type == ModelTask.REGRESSION and \
                new_counterfactual_config.desired_range is None:
            raise UserConfigValidationException(
                'The desired_range should not be None'
                ' for regression scenarios.')

        if new_counterfactual_config.is_duplicate(
                self._counterfactual_config_list):
            raise DuplicateManagerConfigException(
                'Duplicate counterfactual configuration detected.')

        self._counterfactual_config_list.append(new_counterfactual_config)
    def request_explanations(self, local: bool, data: Any):
        """Compute the explanations for the given data point(s).

        :param local: True if local explanations are requested
                      and False otherwise.
        :type local: bool
        :param data: The data point(s) to explain; must be a pandas
                     DataFrame, holding at most one row when ``local``
                     is True.
        :type data: Any
        :return: The explanations for the given data point(s)
                 according to the interface specified by
                 ModelExplanationData.
        :rtype: ModelExplanationData
        :raises UserConfigValidationException: If ``data`` is not a
            pandas DataFrame, or a local explanation is requested for
            more than one row.
        """
        if not isinstance(data, pd.DataFrame):
            message = ('Data is of type {0} but it must be '
                       'a pandas DataFrame.')
            raise UserConfigValidationException(message.format(type(data)))

        if local:
            row_count = data.shape[0]
            if row_count > 1:
                raise UserConfigValidationException(
                    'Only one row of data is allowed for '
                    'local explanation generation.')

        computed = self._compute_explanations(local=local, data=data)
        return self._get_interpret(computed, data)
Ejemplo n.º 5
0
    def request_counterfactuals(self, query_id: str, data: Any):
        """Compute the counterfactuals for a single data point.

        :param query_id: The query id for finding the
                         counterfactual config.
        :type query_id: str
        :param data: The data point for which the counterfactuals
                     need to be generated; must be a pandas DataFrame
                     holding at most one row.
        :type data: Any
        :return: An object of type CounterfactualData with
                 counterfactuals for the given data point.
        :rtype: CounterfactualData
        :raises UserConfigValidationException: If ``data`` is not a
            pandas DataFrame, holds more than one row, or no
            counterfactual config carries ``query_id``.
        """
        if not isinstance(data, pd.DataFrame):
            message = ('Data is of type {0} but it must be '
                       'a pandas DataFrame.')
            raise UserConfigValidationException(message.format(type(data)))

        row_count = data.shape[0]
        if row_count > 1:
            raise UserConfigValidationException(
                'Only one row of data is allowed for '
                'counterfactual generation.')

        # First config whose id matches, or None when there is none.
        matching_config = next(
            (config for config in self._counterfactual_config_list
             if config.id == query_id),
            None)
        if matching_config is None:
            raise UserConfigValidationException(
                'No counterfactual config found for id {0}.'.format(query_id))

        # Both explainer entry points take the same arguments; only the
        # method that is invoked depends on the feature_importance flag.
        if matching_config.feature_importance:
            method_name = 'local_feature_importance'
        else:
            method_name = 'generate_counterfactuals'
        compute = getattr(matching_config.explainer, method_name)
        counterfactual_obj = compute(
            data,
            total_CFs=matching_config.total_CFs,
            desired_class=matching_config.desired_class,
            desired_range=matching_config.desired_range,
            features_to_vary=matching_config.features_to_vary,
            permitted_range=matching_config.permitted_range)

        # Validate the serialized output against schema
        schema = CounterfactualManager._get_counterfactual_schema(
            version=counterfactual_obj.metadata['version'])
        serialized = json.loads(counterfactual_obj.to_json())
        jsonschema.validate(serialized, schema)

        return self._get_counterfactual(matching_config, counterfactual_obj)
Ejemplo n.º 6
0
    def add(self,
            max_depth: int = 3,
            num_leaves: int = 31,
            min_child_samples: int = 20,
            filter_features: Optional[List] = None):
        """Add an error analyzer to be computed later.

        :param max_depth: The maximum depth of the tree.
        :type max_depth: int
        :param num_leaves: The number of leaves in the tree.
        :type num_leaves: int
        :param min_child_samples: The minimal number of data required to
            create one leaf.
        :type min_child_samples: int
        :param filter_features: One or two features to use for the
            matrix filter.
        :type filter_features: Optional[list]
        :raises UserConfigValidationException: If the analyzer has no
            model.
        :raises DuplicateManagerConfigException: If ``is_duplicate``
            reports the configuration as already present.
        """
        if self._analyzer.model is None:
            raise UserConfigValidationException(
                'Model is required for error analysis')

        ea_config = ErrorAnalysisConfig(max_depth=max_depth,
                                        num_leaves=num_leaves,
                                        min_child_samples=min_child_samples,
                                        filter_features=filter_features)

        if ea_config.is_duplicate(self._ea_config_list):
            # Adjacent literals are concatenated verbatim, so the space
            # separating the two clauses has to live inside the first one.
            raise DuplicateManagerConfigException(
                "Duplicate config specified for error analysis, "
                "config already added")

        self._ea_config_list.append(ea_config)
    def _validate_cohort_list(self, cohort_list=None):
        """Validate the user supplied cohorts against the dashboard data.

        Nothing is checked when ``cohort_list`` is None.

        :param cohort_list: The cohorts to validate.
        :type cohort_list: Optional[list[Cohort]]
        :raises UserConfigValidationException: If ``cohort_list`` is not
            a list, holds an entry that is not a Cohort, or holds two
            cohorts with the same name.
        """
        if cohort_list is None:
            return

        if not isinstance(cohort_list, list):
            raise UserConfigValidationException(
                "cohort_list parameter should be a list.")

        if any(not isinstance(entry, Cohort) for entry in cohort_list):
            raise UserConfigValidationException(
                "All entries in cohort_list should be of type Cohort.")

        cohort_names = [entry.name for entry in cohort_list]
        if np.unique(cohort_names).size != len(cohort_names):
            raise UserConfigValidationException(
                "Found cohorts with duplicate names. "
                "All pre-defined cohorts need to have distinct names.")

        # Rebuild the test data (features plus target column) that the
        # cohorts are validated against.
        dataset = self.dashboard_input.dataset
        target_column = dataset.target_column
        test_data = pd.DataFrame(data=dataset.features,
                                 columns=dataset.feature_names)
        if dataset.task_type == ModelTask.CLASSIFICATION:
            # true_y is used to index into class_names here, so the
            # target column ends up holding class names.
            class_names = dataset.class_names
            test_data[target_column] = np.array(
                [class_names[index] for index in dataset.true_y])
        else:
            test_data[target_column] = dataset.true_y

        categorical_features = dataset.categorical_features
        for cohort in cohort_list:
            cohort._validate_with_test_data(
                test_data=test_data,
                target_column=target_column,
                categorical_features=categorical_features,
                is_classification=self._is_classifier)
Ejemplo n.º 8
0
 def __init__(self, name: str):
     """Define a cohort to be injected from the SDK into the Dashboard.

     :param name: Name of the cohort.
     :type name: str
     :raises UserConfigValidationException: If ``name`` is not a string.
     """
     if isinstance(name, str):
         self.name = name
         # No cohort filters are attached at construction time.
         self.cohort_filter_list = None
         return

     message = ("Got unexpected type {0} for cohort name. "
                "Expected string type.")
     raise UserConfigValidationException(message.format(type(name)))
Ejemplo n.º 9
0
    def test_user_exceptions(self):
        # Each user-facing exception type is built with TEST_MESSAGE and
        # handed to _verify_exception_hierarchy together with a fresh
        # UserErrorException and the text expected for that type.
        cases = (
            (UserConfigValidationException, 'Invalid config'),
            (DuplicateManagerConfigException,
             'Duplicate RAI configuration detected'),
            (UserErrorException, 'User Error'),
        )
        for exception_class, expected_text in cases:
            raised = exception_class(TEST_MESSAGE)
            self._verify_exception_hierarchy(
                raised, UserErrorException(), expected_text, TEST_MESSAGE)
Ejemplo n.º 10
0
 def add_cohort_filter(self, cohort_filter: CohortFilter):
     """Append a cohort filter to this cohort.

     :param cohort_filter: Cohort filter defined by CohortFilter class.
     :type cohort_filter: CohortFilter
     :raises UserConfigValidationException: If ``cohort_filter`` is not
         a CohortFilter.
     """
     if not isinstance(cohort_filter, CohortFilter):
         message = ("Got unexpected type {0} for cohort filter. "
                    "Expected CohortFilter type.")
         raise UserConfigValidationException(
             message.format(type(cohort_filter)))

     # The list is created lazily on the first filter that is added.
     if self.cohort_filter_list is None:
         self.cohort_filter_list = []
     self.cohort_filter_list.append(cohort_filter)
Ejemplo n.º 11
0
    def _validate_with_test_data(self,
                                 test_data: pd.DataFrame,
                                 target_column: str,
                                 categorical_features: List[str],
                                 is_classification: Optional[bool] = True):
        """
        Check the cohort and its cohort filters against the test data.

        Nothing is checked when the cohort has no cohort filter list.

        :param test_data: Test data over which cohort analysis will be done
            in ResponsibleAI Dashboard.
        :type test_data: pd.DataFrame
        :param target_column: The target column in the test data.
        :type target_column: str
        :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param is_classification: True to indicate if this validation needs
            to be done for a classification scenario and False to indicate
            that this needs to be done for regression scenario.
        :type is_classification: bool
        :raises UserConfigValidationException: If an argument has the
            wrong type, the target column is absent from the test data,
            or a categorical feature is not one of the non-target columns.
        """
        filters = self.cohort_filter_list
        if filters is None:
            return

        if not isinstance(test_data, pd.DataFrame):
            raise UserConfigValidationException(
                "The test_data should be a pandas DataFrame.")
        if not isinstance(target_column, str):
            raise UserConfigValidationException(
                "The target_column should be string.")
        if not isinstance(categorical_features, list):
            raise UserConfigValidationException(
                "Expected a list type for categorical columns.")

        for feature in categorical_features:
            if isinstance(feature, str):
                continue
            raise UserConfigValidationException(
                "Feature {0} in categorical_features need to be of "
                "string type.".format(feature))

        if target_column not in test_data.columns:
            raise UserConfigValidationException(
                "The target_column {0} was not found in test_data.".format(
                    target_column))

        # Categorical features must be among the non-target columns.
        feature_columns = set(test_data.columns)
        feature_columns.discard(target_column)
        absent = [feature for feature in categorical_features
                  if feature not in feature_columns]
        if absent:
            # Only the first offending feature is reported.
            raise UserConfigValidationException(
                "Found categorical feature {0} which is not"
                " present in test data.".format(absent[0]))

        for cohort_filter in filters:
            cohort_filter._validate_with_test_data(
                test_data=test_data,
                target_column=target_column,
                categorical_features=categorical_features,
                is_classification=is_classification)
Ejemplo n.º 12
0
    def validate_feature_metadata_with_user_features(
            self, user_features: Optional[List[str]] = None):
        """Check the feature metadata against the user features.

        Nothing is checked when ``user_features`` is None or no identity
        feature name is set.

        :param user_features: List of features in the user input dataset.
        :type user_features: Optional[List[str]]
        :raises UserConfigValidationException: If the identity feature
            name is set but not present in ``user_features``.
        """
        if user_features is None:
            return

        identity_name = self.identity_feature_name
        if identity_name is None or identity_name in user_features:
            return

        raise UserConfigValidationException(
            'The given identity feature name {0} is not present'
            ' in user features.'.format(identity_name))
    def add(self):
        """Add an explainer to be computed later.

        The first call initializes the surrogate model; later calls only
        emit a UserWarning and change nothing.

        :raises UserConfigValidationException: If no model is set.
        """
        if self._model is None:
            raise UserConfigValidationException(
                'Model is required for model explanations')

        if not self._is_added:
            self._initialize_surrogate_model()
            self._is_added = True
            return

        duplicate_notice = ("DUPLICATE-EXPLAINER-CONFIG: Ignoring. "
                            "Explanation has already been added, "
                            "currently limited to one explainer type.")
        warnings.warn(duplicate_notice, UserWarning)
    def _validate_features_same(self, small_train_features_before,
                                small_train_data, function):
        """
        Check that an operation left the DataFrame columns unmodified.

        :param small_train_features_before: The features saved before
            an operation was performed.
        :type small_train_features_before: list[str]
        :param small_train_data: The DataFrame after the operation.
        :type small_train_data: pandas.DataFrame
        :param function: The name of the operation performed.
        :type function: str
        :raises UserConfigValidationException: If the current columns
            differ from ``small_train_features_before``.
        """
        current_features = list(small_train_data.columns)
        if small_train_features_before == current_features:
            return

        message = ('Calling model {} function modifies '
                   'input dataset features. Please check if '
                   'predict function is defined correctly.')
        raise UserConfigValidationException(message.format(function))
def validate_train_test_categories(
    train_data: pd.DataFrame,
    test_data: pd.DataFrame,
    rai_compute_type: str,
    categoricals: Optional[List[str]] = None,
):
    """Check that test data holds no category unseen in the train data.

    Only the columns listed in ``categoricals`` that are present in
    ``train_data`` are compared. Nothing is checked when ``categoricals``
    is None.

    :param train_data: The training dataset.
    :type train_data: pandas.DataFrame
    :param test_data: The test dataset.
    :type test_data: pandas.DataFrame
    :param rai_compute_type: Name of the computation, used as the
        prefix of the error message.
    :type rai_compute_type: str
    :param categoricals: The categorical column names to compare.
    :type categoricals: Optional[List[str]]
    :raises UserConfigValidationException: If any compared column has
        values in the test data that are absent from the train data.
    """
    if categoricals is None:
        return

    unseen_by_column = {}
    for name in categoricals:
        if name not in train_data.columns:
            continue
        known_values = np.unique(train_data[name])
        candidate_values = np.unique(test_data[name])
        # Values that occur in test data but never in train data.
        unseen = np.setdiff1d(candidate_values, known_values)
        if unseen.shape[0] != 0:
            unseen_by_column[name] = unseen.tolist()

    if not unseen_by_column:
        return

    template = ("{} requires that every category of "
                "categorical features present in the test data "
                "be also present in the train data. "
                "Categories missing from train data: {}")
    raise UserConfigValidationException(
        template.format(rai_compute_type, unseen_by_column))
Ejemplo n.º 16
0
    def _validate_rai_insights_input_parameters(
            self, model: Any, train: pd.DataFrame, test: pd.DataFrame,
            target_column: str, task_type: str,
            categorical_features: List[str], classes: np.ndarray,
            serializer,
            maximum_rows_for_test: int):
        """Validate the inputs for the RAIInsights constructor.

        The checks run in this order: task type, model/serializer,
        train/test data types, test size, train/test columns, target
        column, categorical features, class labels and finally the
        model's predict()/predict_proba() behavior on one row of data.

        :param model: The model to compute RAI insights for.
            A model that implements sklearn.predict or sklearn.predict_proba
            or function that accepts a 2d ndarray.
        :type model: object
        :param train: The training dataset including the label column.
        :type train: pandas.DataFrame
        :param test: The test dataset including the label column.
        :type test: pandas.DataFrame
        :param target_column: The name of the label column.
        :type target_column: str
        :param task_type: The task to run, can be `classification` or
            `regression`.
        :type task_type: str
        :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param classes: The class labels in the training dataset
        :type classes: numpy.ndarray
        :param serializer: Picklable custom serializer with save and load
            methods defined for model that is not serializable. The save
            method returns a dictionary state and load method returns the
            model.
        :type serializer: object
        :param maximum_rows_for_test: Limit on size of test data
            (for performance reasons)
        :type maximum_rows_for_test: int
        :raises UserConfigValidationException: If any of the checks
            fails.
        """
        valid_tasks = [
            ModelTask.CLASSIFICATION.value,
            ModelTask.REGRESSION.value
        ]
        if task_type not in valid_tasks:
            message = (f"Unsupported task type '{task_type}'. "
                       f"Should be one of {valid_tasks}")
            raise UserConfigValidationException(message)

        if model is None:
            warnings.warn(
                'INVALID-MODEL-WARNING: No valid model is supplied. '
                'The explanations, error analysis and counterfactuals '
                'may not work')
            if serializer is not None:
                raise UserConfigValidationException(
                    'No valid model is specified but model '
                    'serializer provided.')

        if serializer is not None:
            if not hasattr(serializer, 'save'):
                raise UserConfigValidationException(
                    'The serializer does not implement save()')

            if not hasattr(serializer, 'load'):
                raise UserConfigValidationException(
                    'The serializer does not implement load()')

            try:
                pickle.dumps(serializer)
            except Exception as exc:
                raise UserConfigValidationException(
                    'The serializer should be serializable via pickle'
                ) from exc

        # Every remaining check needs both datasets as DataFrames.
        if not (isinstance(train, pd.DataFrame) and
                isinstance(test, pd.DataFrame)):
            raise UserConfigValidationException(
                "Unsupported data type for either train or test. "
                "Expecting pandas DataFrame for train and test."
            )

        if test.shape[0] > maximum_rows_for_test:
            msg_fmt = ('The test data has {0} rows, '
                       'but limit is set to {1} rows. '
                       'Please resample the test data or '
                       'adjust maximum_rows_for_test')
            raise UserConfigValidationException(
                msg_fmt.format(test.shape[0], maximum_rows_for_test))

        # Train and test must expose exactly the same set of columns.
        if set(train.columns) != set(test.columns):
            raise UserConfigValidationException(
                'The features in train and test data do not match')

        if target_column not in list(train.columns) or \
                target_column not in list(test.columns):
            raise UserConfigValidationException(
                'Target name {0} not present in train/test data'.format(
                    target_column)
            )

        if categorical_features is not None and \
                len(categorical_features) > 0:
            if target_column in categorical_features:
                raise UserConfigValidationException(
                    'Found target name {0} in '
                    'categorical feature list'.format(
                        target_column)
                )

            difference_set = set(categorical_features) - set(train.columns)
            if len(difference_set) > 0:
                message = ("Feature names in categorical_features "
                           "do not exist in train data: "
                           f"{list(difference_set)}")
                raise UserConfigValidationException(message)

            # np.unique is only run to find out whether it fails on the
            # column; its result is not used.
            for column in categorical_features:
                try:
                    np.unique(train[column])
                except Exception as exc:
                    raise UserConfigValidationException(
                        "Error finding unique values in column {0}. "
                        "Please check your train data.".format(column)
                    ) from exc

                try:
                    np.unique(test[column])
                except Exception as exc:
                    raise UserConfigValidationException(
                        "Error finding unique values in column {0}. "
                        "Please check your test data.".format(column)
                    ) from exc

        # NOTE(review): task_type was validated above against the enum
        # ``.value`` strings but is compared with the enum members from
        # here on; this relies on ModelTask members comparing equal to
        # their values - confirm against the ModelTask definition.
        if classes is not None and task_type == \
                ModelTask.CLASSIFICATION:
            expected_labels = set(classes)
            if set(train[target_column].unique()) != expected_labels:
                raise UserConfigValidationException(
                    'The train labels and distinct values in '
                    'target (train data) do not match')

            if set(test[target_column].unique()) != expected_labels:
                raise UserConfigValidationException(
                    'The train labels and distinct values in '
                    'target (test data) do not match')

        if model is None:
            return

        # Pick one row from train and test data
        small_train_data = train.iloc[0:1].drop(
            [target_column], axis=1)
        small_test_data = test.iloc[0:1].drop(
            [target_column], axis=1)

        small_train_features_before = list(small_train_data.columns)

        # Run predict() of the model
        try:
            model.predict(small_train_data)
            model.predict(small_test_data)
        except Exception as exc:
            raise UserConfigValidationException(
                'The model passed cannot be used for'
                ' getting predictions via predict()'
            ) from exc
        self._validate_features_same(small_train_features_before,
                                     small_train_data,
                                     SKLearn.PREDICT)

        # Run predict_proba() of the model
        if task_type == ModelTask.CLASSIFICATION:
            try:
                model.predict_proba(small_train_data)
                model.predict_proba(small_test_data)
            except Exception as exc:
                raise UserConfigValidationException(
                    'The model passed cannot be used for'
                    ' getting predictions via predict_proba()'
                ) from exc
            # Only meaningful when predict_proba() was actually called;
            # the columns were already verified after predict() above.
            self._validate_features_same(small_train_features_before,
                                         small_train_data,
                                         SKLearn.PREDICT_PROBA)

        if task_type == ModelTask.REGRESSION:
            if hasattr(model, SKLearn.PREDICT_PROBA):
                # The literals are concatenated verbatim, so each one
                # carries its own trailing space.
                raise UserConfigValidationException(
                    'The regression model '
                    'provided has a predict_proba function. '
                    'Please check the task_type.')
Ejemplo n.º 17
0
    def _add_counterfactual_config(self, new_counterfactual_config):
        """Validate a counterfactual configuration and register it.

        :param new_counterfactual_config: The configuration to check
            against the model, the train data and the task type, and to
            append to the list of counterfactual configurations.
        :raises UserConfigValidationException: If no model is set, a
            feature named in ``features_to_vary`` or ``permitted_range``
            is not a train data column, ``desired_class`` or
            ``desired_range`` does not fit the task type, or feature
            importance is requested with fewer than 10 ``total_CFs``.
        :raises DuplicateManagerConfigException: If ``is_duplicate``
            reports the configuration as already present.
        """
        if self._model is None:
            raise UserConfigValidationException(
                'Model is required for counterfactual example generation and '
                'feature importances')

        validate_train_test_categories(
            train_data=self._train,
            test_data=self._test,
            rai_compute_type='Counterfactual example generation',
            categoricals=self._categorical_features)

        features_to_vary = new_counterfactual_config.features_to_vary
        if features_to_vary != 'all':
            unknown = set(features_to_vary) - set(self._train.columns)
            if unknown:
                raise UserConfigValidationException(
                    "Feature names in features_to_vary do "
                    f"not exist in train data: {list(unknown)}")

        permitted_range = new_counterfactual_config.permitted_range
        if permitted_range is not None:
            # Iterating permitted_range yields the feature names.
            unknown = set(list(permitted_range)) - set(self._train.columns)
            if unknown:
                raise UserConfigValidationException(
                    "Feature names in permitted_range do "
                    f"not exist in train data: {list(unknown)}")

        if self._task_type == ModelTask.CLASSIFICATION:
            desired_class = new_counterfactual_config.desired_class
            if desired_class is None:
                raise UserConfigValidationException(
                    'The desired_class attribute should be either \'{0}\''
                    ' for binary classification or the class value for '
                    'multi-classification scenarios.'.format(
                        CounterfactualConstants.OPPOSITE))

            # More than two distinct target values means multiclass.
            target_values = self._train[self._target_column].values
            distinct_count = len(np.unique(target_values))
            if distinct_count > 2 and \
                    desired_class == CounterfactualConstants.OPPOSITE:
                raise UserConfigValidationException(
                    'The desired_class attribute should not be \'{0}\''
                    ' It should be the class value for multiclass'
                    ' classification scenario.'.format(
                        CounterfactualConstants.OPPOSITE))

        if self._task_type == ModelTask.REGRESSION and \
                new_counterfactual_config.desired_range is None:
            raise UserConfigValidationException(
                'The desired_range should not be None'
                ' for regression scenarios.')

        if new_counterfactual_config.feature_importance and \
                new_counterfactual_config.total_CFs < 10:
            raise UserConfigValidationException(
                "A total_CFs value of at least 10 is required to "
                "use counterfactual feature importances. "
                "Either increase total_CFs to at least 10 or "
                "set feature_importance to False.")

        if new_counterfactual_config.is_duplicate(
                self._counterfactual_config_list):
            raise DuplicateManagerConfigException(
                'Duplicate counterfactual configuration detected.')

        self._counterfactual_config_list.append(new_counterfactual_config)
Ejemplo n.º 18
0
    def _validate_model_analysis_input_parameters(self,
                                                  model,
                                                  train,
                                                  test,
                                                  target_column,
                                                  task_type,
                                                  categorical_features=None,
                                                  train_labels=None,
                                                  serializer=None):
        """
        Check that the arguments given to ModelAnalysis are usable.

        The task type and the serializer are always checked. The dataset,
        label and model checks only run when both ``train`` and ``test``
        are pandas DataFrames.

        :param model: The model to compute RAI insights for.
            A model that implements sklearn.predict or sklearn.predict_proba
            or function that accepts a 2d ndarray.
        :type model: object
        :param train: The training dataset including the label column.
        :type train: pandas.DataFrame
        :param test: The test dataset including the label column.
        :type test: pandas.DataFrame
        :param target_column: The name of the label column.
        :type target_column: str
        :param task_type: The task to run, can be `classification` or
            `regression`.
        :type task_type: str
        :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param train_labels: The class labels in the training dataset
        :type train_labels: ndarray
        :param serializer: Picklable custom serializer with save and load
            methods defined for model that is not serializable. The save
            method returns a dictionary state and load method returns the
            model.
        :type serializer: object
        :raises UserConfigValidationException: If any of the checks fails.
        """
        supported_tasks = (ModelTask.CLASSIFICATION, ModelTask.REGRESSION)
        if task_type not in supported_tasks:
            raise UserConfigValidationException(
                'Unsupported task type. Should be one of {0} or {1}'.format(
                    ModelTask.CLASSIFICATION, ModelTask.REGRESSION))

        if serializer is not None:
            # The serializer must expose both halves of the round trip and
            # must itself survive pickling.
            if not hasattr(serializer, 'save'):
                raise UserConfigValidationException(
                    'The serializer does not implement save()')

            if not hasattr(serializer, 'load'):
                raise UserConfigValidationException(
                    'The serializer does not implement load()')

            try:
                pickle.dumps(serializer)
            except Exception:
                raise UserConfigValidationException(
                    'The serializer should be serializable via pickle')

        both_frames = isinstance(train, pd.DataFrame) and \
            isinstance(test, pd.DataFrame)
        if not both_frames:
            # Every remaining check needs DataFrame inputs.
            return

        if set(train.columns) != set(test.columns):
            raise UserConfigValidationException(
                'The features in train and test data do not match')

        target_missing = any(
            target_column not in list(frame.columns)
            for frame in (train, test))
        if target_missing:
            raise UserConfigValidationException(
                'Target name {0} not present in train/test data'.format(
                    target_column))

        has_categoricals = categorical_features is not None and \
            len(categorical_features) > 0
        if has_categoricals:
            if target_column in categorical_features:
                raise UserConfigValidationException(
                    'Found target name {0} in '
                    'categorical feature list'.format(target_column))

            if set(categorical_features) - set(train.columns):
                raise UserConfigValidationException(
                    'Found some feature names in categorical feature which'
                    ' do not occur in train data')

        check_labels = train_labels is not None and \
            task_type == ModelTask.CLASSIFICATION
        if check_labels:
            # The supplied labels must be exactly the distinct target
            # values of each dataset.
            expected_labels = set(train_labels)

            if set(train[target_column].unique()) != expected_labels:
                raise UserConfigValidationException(
                    'The train labels and distinct values in '
                    'target (train data) do not match')

            if set(test[target_column].unique()) != expected_labels:
                raise UserConfigValidationException(
                    'The train labels and distinct values in '
                    'target (test data) do not match')

        if model is None:
            return

        def first_row_features(frame):
            # Single-row feature frame (label column removed) used to
            # smoke-test the model.
            return frame.iloc[0:1].drop([target_column], axis=1)

        # Run predict() of the model
        try:
            train_row = first_row_features(train)
            test_row = first_row_features(test)
            model.predict(train_row)
            model.predict(test_row)
        except Exception:
            raise UserConfigValidationException(
                'The model passed cannot be used for'
                ' getting predictions via predict()')

        # Run predict_proba() of the model
        if task_type == ModelTask.CLASSIFICATION:
            try:
                train_row = first_row_features(train)
                test_row = first_row_features(test)
                model.predict_proba(train_row)
                model.predict_proba(test_row)
            except Exception:
                raise UserConfigValidationException(
                    'The model passed cannot be used for'
                    ' getting predictions via predict_proba()')
    def compute(self):
        """Compute the causal insights for every pending configuration.

        Configurations whose ``is_computed`` flag is already set are
        skipped. For each remaining configuration a CausalAnalysis is
        fitted on the concatenation of the train and test data (target
        column removed), and the analysis, the global effects, the local
        effects on the test data and one policy per treatment feature are
        stored on the configuration.

        A configuration is only flagged as computed after all of the above
        succeeded, so a failed run can be retried by calling compute()
        again.

        :raises UserConfigValidationException: If the nuisance model of a
            configuration is not one of the supported values.
        """
        for config in self._causal_config_list:
            if config.is_computed:
                continue

            if config.nuisance_model not in [CausalConstants.AUTOML,
                                             CausalConstants.LINEAR]:
                message = (f"nuisance_model should be one of "
                           f"['{CausalConstants.AUTOML}', "
                           f"'{CausalConstants.LINEAR}'], "
                           f"got {config.nuisance_model}")
                raise UserConfigValidationException(message)

            is_classification = self._task_type == ModelTask.CLASSIFICATION

            # Build the combined dataset once and derive X and y from it.
            combined = pd.concat([self._train, self._test],
                                 ignore_index=True)
            X = combined.drop([self._target_column], axis=1)
            y = combined[self._target_column].values.ravel()

            categoricals = self._categorical_features
            if categoricals is None:
                categoricals = []

            analysis = CausalAnalysis(
                config.treatment_features,
                categoricals,
                heterogeneity_inds=config.heterogeneity_features,
                classification=is_classification,
                nuisance_models=config.nuisance_model,
                upper_bound_on_cat_expansion=config.max_cat_expansion,
                skip_cat_limit_checks=config.skip_cat_limit_checks,
                n_jobs=-1)
            analysis.fit(X, y)

            config.causal_analysis = analysis

            X_test = self._test.drop([self._target_column], axis=1)

            config.global_effects = analysis.global_causal_effect(
                alpha=config.alpha, keep_all_levels=True)
            config.local_effects = analysis.local_causal_effect(
                X_test, alpha=config.alpha, keep_all_levels=True)

            config.policies = []
            for treatment_feature in config.treatment_features:
                local_policies = analysis.individualized_policy(
                    X_test, treatment_feature,
                    treatment_costs=config.treatment_cost,
                    alpha=config.alpha)

                tree = analysis._policy_tree_output(
                    X_test, treatment_feature,
                    treatment_costs=config.treatment_cost,
                    max_depth=config.max_tree_depth,
                    min_samples_leaf=config.min_tree_leaf_samples,
                    alpha=config.alpha)

                policy = {
                    self.TREATMENT_FEATURE: treatment_feature,
                    self.CONTROL_TREATMENT: tree.control_name,
                    self.LOCAL_POLICIES: local_policies,
                    self.POLICY_GAINS: {
                        self.RECOMMENDED_POLICY_GAINS: tree.policy_value,
                        self.TREATMENT_GAINS: tree.always_treat,
                    },
                    self.POLICY_TREE: tree.tree_dictionary
                }
                config.policies.append(policy)

            # Flag the configuration only once every insight is in place;
            # setting it earlier would hide a failed run from later calls.
            config.is_computed = True
    def add(
        self,
        treatment_features,
        heterogeneity_features=None,
        nuisance_model=CausalConstants.LINEAR,
        heterogeneity_model=None,
        alpha=CausalConstants.DEFAULT_ALPHA,
        upper_bound_on_cat_expansion=CausalConstants.DEFAULT_MAX_CAT_EXPANSION,
        treatment_cost=CausalConstants.DEFAULT_TREATMENT_COST,
        min_tree_leaf_samples=CausalConstants.DEFAULT_MIN_TREE_LEAF_SAMPLES,
        max_tree_depth=CausalConstants.DEFAULT_MAX_TREE_DEPTH,
        skip_cat_limit_checks=CausalConstants.DEFAULT_SKIP_CAT_LIMIT_CHECKS,
    ):
        """Register a causal configuration to be computed later.

        The treatment features must all be columns of the train data and
        the resulting configuration must not duplicate one that was
        already added.

        :param treatment_features: Treatment feature names.
        :type treatment_features: list
        :param heterogeneity_features: Features that mediate the causal effect.
        :type heterogeneity_features: list
        :param nuisance_model: Model type to use for nuisance estimation.
        :type nuisance_model: str
        :param heterogeneity_model: Model type to use for
                                    treatment effect heterogeneity.
        :type heterogeneity_model: str
        :param alpha: Confidence level of confidence intervals.
        :type alpha: float
        :param upper_bound_on_cat_expansion: Maximum expansion for
                                             categorical features.
        :type upper_bound_on_cat_expansion: int
        :param treatment_cost: Cost to treat one individual or
                               per-individual costs as an array.
        :type treatment_cost: float or array
        :param min_tree_leaf_samples: Minimum number of samples per leaf
                                      in policy tree.
        :type min_tree_leaf_samples: int
        :param max_tree_depth: Maximum depth of policy tree.
        :type max_tree_depth: int
        :param skip_cat_limit_checks: By default, categorical features need
                                      to have several instances of each
                                      category in order for a model to be
                                      fit robustly. Setting this to True
                                      will skip these checks.
        :type skip_cat_limit_checks: bool
        :raises UserConfigValidationException: If a treatment feature is
            not a column of the train data.
        :raises DuplicateManagerConfigException: If an equivalent
            configuration was already added.
        """
        unknown_features = \
            set(treatment_features) - set(self._train.columns)
        if unknown_features:
            raise UserConfigValidationException(
                'Found some feature names in treatment feature list which'
                ' do not occur in train data'
            )

        # Note that the public upper_bound_on_cat_expansion argument is
        # handed to CausalConfig under the name max_cat_expansion.
        config_options = dict(
            heterogeneity_features=heterogeneity_features,
            nuisance_model=nuisance_model,
            heterogeneity_model=heterogeneity_model,
            alpha=alpha,
            max_cat_expansion=upper_bound_on_cat_expansion,
            treatment_cost=treatment_cost,
            min_tree_leaf_samples=min_tree_leaf_samples,
            max_tree_depth=max_tree_depth,
            skip_cat_limit_checks=skip_cat_limit_checks)
        new_config = CausalConfig(treatment_features, **config_options)

        already_present = new_config.is_duplicate(self._causal_config_list)
        if already_present:
            raise DuplicateManagerConfigException(
                "Duplicate causal configuration detected.")

        self._causal_config_list.append(new_config)
    def compute(self):
        """Compute the causal effects for every result not yet computed.

        For each pending result the treatment costs are validated first.
        Then a CausalAnalysis is built from the stored configuration and
        fitted on the training data, and the global effects, the local
        effects on the test data and one policy per treatment feature are
        stored on the result. Finally the result schema is validated.

        :raises UserConfigValidationException: If treatment_cost is
            neither a scalar zero nor a list with exactly one entry per
            treatment feature.
        """
        is_classification = self._task_type == ModelTask.CLASSIFICATION
        for result in self._results:
            if result.is_computed:
                continue

            causal_config = result.config
            treatment_features = causal_config.treatment_features

            # Check treatment_cost is valid. This happens before the model
            # fit so that a bad configuration is reported immediately
            # instead of after the expensive computation.
            treatment_cost = causal_config.treatment_cost
            # A scalar zero, whether int or float, means that no treatment
            # has a cost.
            if isinstance(treatment_cost, (int, float)) and \
                    treatment_cost == 0:
                revised_treatment_cost = [0] * len(treatment_features)
            else:
                revised_treatment_cost = treatment_cost

            if not isinstance(revised_treatment_cost, list):
                message = (
                    "treatment_cost must be a list with "
                    "the same number of elements as "
                    "treatment_features where each element "
                    "is either a constant cost of treatment "
                    "or an array specifying the cost of "
                    "treatment per sample. "
                    "Found treatment_cost of type "
                    f"{type(revised_treatment_cost)}, expected list.")
                raise UserConfigValidationException(message)
            elif len(revised_treatment_cost) != len(treatment_features):
                message = ("treatment_cost must be a list with "
                           "the same number of elements as "
                           "treatment_features. "
                           "Length of treatment_cost was "
                           f"{len(revised_treatment_cost)}, expected "
                           f"{len(treatment_features)}.")
                raise UserConfigValidationException(message)

            analysis = CausalAnalysis(
                treatment_features,
                self._categorical_features,
                heterogeneity_inds=causal_config.heterogeneity_features,
                classification=is_classification,
                nuisance_models=causal_config.nuisance_model,
                heterogeneity_model=causal_config.heterogeneity_model,
                upper_bound_on_cat_expansion=causal_config.
                upper_bound_on_cat_expansion,
                skip_cat_limit_checks=causal_config.skip_cat_limit_checks,
                n_jobs=causal_config.n_jobs,
                categories=causal_config.categories,
                verbose=causal_config.verbose,
                random_state=causal_config.random_state,
            )

            X_train = self._train.drop([self._target_column], axis=1)
            X_test = self._test.drop([self._target_column], axis=1)
            y_train = self._train[self._target_column].values.ravel()

            self._fit_causal_analysis(
                analysis, X_train, y_train,
                causal_config.upper_bound_on_cat_expansion)
            result.causal_analysis = analysis

            result.global_effects = analysis.global_causal_effect(
                alpha=causal_config.alpha, keep_all_levels=True)
            result.local_effects = analysis.local_causal_effect(
                X_test, alpha=causal_config.alpha, keep_all_levels=True)

            result.policies = []

            # The lengths were checked above, so pairing each feature with
            # its cost cannot silently drop entries.
            for treatment_feature, feature_cost in zip(
                    treatment_features, revised_treatment_cost):
                policy = self._create_policy(
                    result, X_test,
                    treatment_feature,
                    feature_cost,
                    causal_config.alpha, causal_config.max_tree_depth,
                    causal_config.min_tree_leaf_samples)
                result.policies.append(policy)

            result._validate_schema()
    def add(
        self,
        treatment_features: List[str],
        heterogeneity_features: Optional[List[str]] = None,
        nuisance_model: str = ModelTypes.LINEAR,
        heterogeneity_model: str = ModelTypes.LINEAR,
        alpha: float = DefaultParams.DEFAULT_ALPHA,
        upper_bound_on_cat_expansion: int = (
            DefaultParams.DEFAULT_MAX_CAT_EXPANSION),
        treatment_cost: Union[float, List[Union[float, np.ndarray]]] = (
            DefaultParams.DEFAULT_TREATMENT_COST),
        min_tree_leaf_samples: int = (
            DefaultParams.DEFAULT_MIN_TREE_LEAF_SAMPLES),
        max_tree_depth: int = (
            DefaultParams.DEFAULT_MAX_TREE_DEPTH),
        skip_cat_limit_checks: bool = (
            DefaultParams.DEFAULT_SKIP_CAT_LIMIT_CHECKS),
        categories: Union[str, List[Union[str, List[Any]]]] = (
            DefaultParams.DEFAULT_CATEGORIES),
        n_jobs: int = (
            DefaultParams.DEFAULT_N_JOBS),
        verbose: int = (
            DefaultParams.DEFAULT_VERBOSE),
        random_state: Optional[Union[int, np.random.RandomState]] = (
            DefaultParams.DEFAULT_RANDOM_STATE),
    ):
        """Validate a causal configuration and queue it as a new result.

        Nothing is computed here: the arguments are validated, wrapped in
        a CausalConfig attached to a fresh CausalResult, and that result
        is appended to the list of results.

        :param treatment_features: Treatment feature names.
        :type treatment_features: list
        :param heterogeneity_features: Features that mediate the causal effect.
        :type heterogeneity_features: list
        :param nuisance_model: This model used to estimate the outcome and the
            treatment features from the other features present in user data.
            It is one of {'linear', 'automl'}, optional (default='linear').
            If 'linear', then LassoCV (for regression) and
            LogisticRegressionCV (for classification) are used.
            If 'automl', then a k-fold cross-validation and model selection
            is performed among several models and the best is chosen.
        :type nuisance_model: str
        :param heterogeneity_model: The heterogeneity model is used to
            estimate the treatment effect based on the heterogeneity features.
            It is one of {'linear', 'forest'} (default='linear').
            'linear' means that a heterogeneity model of the form
            theta(X)=<a, X> will be used, while 'forest' means that a
            forest model will be trained instead.
        :type heterogeneity_model: str
        :param alpha: Confidence level of confidence intervals.
        :type alpha: float
        :param upper_bound_on_cat_expansion: Maximum expansion for
                                             categorical features.
        :type upper_bound_on_cat_expansion: int
        :param treatment_cost: Cost of treatment. If 0, all treatments will
            have zero cost. If a list is passed, then each element will be
            applied to each treatment feature. Each element can be a scalar
            value to indicate a constant cost of applying that treatment or
            an array indicating the cost for each sample. If the treatment
            is a discrete treatment, then the array for that feature should
            be two dimensional with the first dimension representing samples
            and the second representing the difference in cost between the
            non-default values and the default value.
        :type treatment_cost: None, List of float or array
        :param min_tree_leaf_samples: Minimum number of samples per leaf
            in policy tree.
        :type min_tree_leaf_samples: int
        :param max_tree_depth: Maximum depth of policy tree.
        :type max_tree_depth: int
        :param skip_cat_limit_checks:
            By default, categorical features need to have several instances
            of each category in order for a model to be fit robustly.
            Setting this to True will skip these checks.
        :type skip_cat_limit_checks: bool
        :param categories: 'auto' or list of category values, default 'auto'
            What categories to use for the categorical columns.
            If 'auto', then the categories will be inferred for all
            categorical columns. Otherwise, this argument should have
            as many entries as there are categorical columns.
            Each entry should be either 'auto' to infer the values for
            that column or the list of values for the column.
            If explicit values are provided, the first value is treated
            as the "control" value for that column against which other
            values are compared.
        :type categories: str or list
        :param n_jobs: Degree of parallelism to use when training models
            via joblib.Parallel
        :type n_jobs: int
        :param verbose: Controls the verbosity when fitting and predicting.
        :type verbose: int
        :param random_state: Controls the randomness of the estimator.
        :type random_state: int or RandomState or None
        :raises UserConfigValidationException: If a treatment or
            heterogeneity feature is not a train column, if the task is
            multiclass classification, or if nuisance_model or
            heterogeneity_model is not a supported value.
        """
        unknown_treatments = \
            set(treatment_features) - set(self._train.columns)
        if unknown_treatments:
            message = ("Feature names in treatment_features do "
                       f"not exist in train data: {list(unknown_treatments)}")
            raise UserConfigValidationException(message)

        if heterogeneity_features is not None:
            unknown_heterogeneity = \
                set(heterogeneity_features) - set(self._train.columns)
            if unknown_heterogeneity:
                message = ("Feature names in heterogeneity_features do "
                           "not exist in train data: "
                           f"{list(unknown_heterogeneity)}")
                raise UserConfigValidationException(message)

        if self._task_type == ModelTask.CLASSIFICATION:
            # More than two distinct target values means multiclass.
            distinct_targets = np.unique(
                self._train[self._target_column].values)
            if len(distinct_targets) > 2:
                raise UserConfigValidationException(
                    "Multiclass classification isn't supported")

        allowed_nuisance_models = (ModelTypes.AUTOML, ModelTypes.LINEAR)
        if nuisance_model not in allowed_nuisance_models:
            message = (f"nuisance_model should be one of "
                       f"['{ModelTypes.AUTOML}', "
                       f"'{ModelTypes.LINEAR}'], "
                       f"got {nuisance_model}")
            raise UserConfigValidationException(message)

        allowed_heterogeneity_models = (ModelTypes.FOREST, ModelTypes.LINEAR)
        if heterogeneity_model not in allowed_heterogeneity_models:
            message = (f"heterogeneity_model should be one of "
                       f"['{ModelTypes.FOREST}', "
                       f"'{ModelTypes.LINEAR}'], "
                       f"got {heterogeneity_model}")
            raise UserConfigValidationException(message)

        validate_train_test_categories(
            train_data=self._train,
            test_data=self._test,
            rai_compute_type='Causal analysis',
            categoricals=self._categorical_features)

        pending_result = CausalResult()
        pending_result.config = CausalConfig(
            treatment_features=treatment_features,
            heterogeneity_features=heterogeneity_features,
            nuisance_model=nuisance_model,
            heterogeneity_model=heterogeneity_model,
            alpha=alpha,
            upper_bound_on_cat_expansion=upper_bound_on_cat_expansion,
            treatment_cost=treatment_cost,
            min_tree_leaf_samples=min_tree_leaf_samples,
            max_tree_depth=max_tree_depth,
            skip_cat_limit_checks=skip_cat_limit_checks,
            n_jobs=n_jobs,
            categories=categories,
            verbose=verbose,
            random_state=random_state,
            categorical_features=self._categorical_features,
        )
        self._results.append(pending_result)
Ejemplo n.º 23
0
    def _validate_cohort_filter_parameters(self, method: str, arg: List[Any],
                                           column: str):
        """Validate the input values for the cohort filter.

        :param method: Cohort filter method from one of CohortFilterMethods.
        :type method: str
        :param arg: List of values to be used by the cohort filter.
        :type arg: list
        :param column: The column name from the dataset on which the filter
                       will be applied.
        :type column: str
        :raises UserConfigValidationException: If any of the validations
            listed below fails.

        The following validations are performed on the cohort filter:-

        1. Verify the correct types for method (expected string), column
           (expected string) and arg (expected list).
        2. The method value should be one of the filter string from
           CohortFilterMethods.ALL.
        3. The arg shouldn't be an empty list.
        4. For all cohort filter methods in
           CohortFilterMethods.SINGLE_VALUE_METHODS, the value in the arg
           should be integer or float and there should be only one value
           in arg.
        5. For cohort filter method CohortFilterMethods.METHOD_RANGE,
           the values in the arg should be integer or float (a mix of
           both is allowed) and there should be only two values in arg.
        """
        if not isinstance(method, str):
            raise UserConfigValidationException(
                "Got unexpected type {0} for method. "
                "Expected string type.".format(type(method)))
        if method not in CohortFilterMethods.ALL:
            raise UserConfigValidationException(
                "Got unexpected value {0} for method. "
                "Expected either of {1}.".format(
                    method, " or ".join(CohortFilterMethods.ALL)))
        if not isinstance(column, str):
            raise UserConfigValidationException(
                "Got unexpected type {0} for column. "
                "Expected string type.".format(type(column)))
        if not isinstance(arg, list):
            raise UserConfigValidationException(
                "Got unexpected type {0} for arg. "
                "Expected list type.".format(type(arg)))
        if len(arg) == 0:
            raise UserConfigValidationException("Empty list supplied for arg.")

        if method in CohortFilterMethods.SINGLE_VALUE_METHODS:
            if len(arg) != 1:
                raise UserConfigValidationException(
                    "Expected a single value in arg "
                    "for cohort methods {0}.".format(" or ".join(
                        CohortFilterMethods.SINGLE_VALUE_METHODS)))
            if not isinstance(arg[0], (int, float)):
                raise UserConfigValidationException(
                    "Expected int or float type for "
                    "arg with cohort methods {0}.".format(" or ".join(
                        CohortFilterMethods.SINGLE_VALUE_METHODS)))

        if method == CohortFilterMethods.METHOD_RANGE:
            if len(arg) != 2:
                raise UserConfigValidationException(
                    "Expected two entries in arg for "
                    "cohort method {0}.".format(
                        CohortFilterMethods.METHOD_RANGE))
            # Check each bound on its own: a range such as [1, 2.5] mixes
            # int and float and is still a valid numeric range.
            if not all(isinstance(entry, (int, float)) for entry in arg):
                raise UserConfigValidationException(
                    "Expected int or float type for arg "
                    "with cohort method {0}.".format(
                        CohortFilterMethods.METHOD_RANGE))
Ejemplo n.º 24
0
    def _validate_with_test_data(self,
                                 test_data: pd.DataFrame,
                                 target_column: str,
                                 categorical_features: List[str],
                                 is_classification: Optional[bool] = True):
        """
        Validate the cohort filters parameters with respect to test data.

        :param test_data: Test data over which cohort analysis will be done
            in ResponsibleAI Dashboard.
        :type test_data: pd.DataFrame
        :param target_column: The target column in the test data.
        :type target_column: str
        :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param is_classification: True to indicate if this validation needs
            to be done for a classification scenario and False to indicate
            that this needs to be done for regression scenario.
        :type is_classification: bool

        The following validations need to be performed for cohort filter with
        test data:-

        High level validations
        1. Validate if the filter column is present in the test data.
        2. Validate if the filter column is present in the special column
           list.

        "Index" Filter validations
        1. The Index filter only takes integer arguments.
        2. The Index filter doesn't take CohortFilterMethods.EXCLUDES
           filter method.

        "Classification outcome" Filter validations
        1. Validate that "Classification outcome" filter is not configure for
           multiclass classification and regression.
        2. The "Classification outcome" filter only contains values from set
           ClassificationOutcomes.
        3. The "Classification outcome" filter only takes
           CohortFilterMethods.INCLUDES filter method.

        "Error" Filter validations
        1. Validate that "Error" filter is not configure for
           multiclass classification and binary classification.
        2. Only integer or floating points can be configured as arguments.
        3. The CohortFilterMethods.INCLUDES and CohortFilterMethods.EXCLUDES
           filter methods cannot be configured for this filter.

        "Predicted Y/True Y" Filter validations
        1. The set of classes configured in case of classification is a
           superset of the classes available in the test data.
        2. The CohortFilterMethods.INCLUDES is only allowed to be
           configured for "Predicted Y" filter in case of classification.
        3. The CohortFilterMethods.INCLUDES and CohortFilterMethods.EXCLUDES
           filter methods cannot be configured for this filter for regression.

        "Dataset" Filter validations
        1. TODO:- For continuous features the allowed values that be configured
           should be within the range of minimum and maximum values available
           within the continuous feature column in the test data.
        2. For categorical features only CohortFilterMethods.INCLUDES can be
           configured.
        3. For categorical features the values allowed are a subset of the
           the values available in the categorical column in the test data.
        """
        # High level validations
        if self.column not in CohortFilter.SPECIAL_COLUMN_LIST and \
                (self.column not in
                    (set(test_data.columns) - set([target_column]))):
            raise UserConfigValidationException(
                "Unknown column {0} specified in cohort filter".format(
                    self.column))

        if self.column == CohortFilter.INDEX:
            # "Index" Filter validations
            if self.method == CohortFilterMethods.METHOD_EXCLUDES:
                raise UserConfigValidationException(
                    "{0} filter is not supported with {1} based "
                    "selection.".format(CohortFilterMethods.METHOD_EXCLUDES,
                                        CohortFilter.INDEX))

            if not all(isinstance(entry, int) for entry in self.arg):
                raise UserConfigValidationException(
                    "All entries in arg should be of type int.")
        elif self.column == CohortFilter.CLASSIFICATION_OUTCOME:
            # "Classification outcome" Filter validations
            is_multiclass = len(
                np.unique(test_data[target_column].values).tolist()) > 2

            if not is_classification or is_multiclass:
                raise UserConfigValidationException(
                    "{0} cannot be configured for multi-class classification"
                    " and regression scenarios.".format(
                        CohortFilter.CLASSIFICATION_OUTCOME))

            if self.method != CohortFilterMethods.METHOD_INCLUDES:
                raise UserConfigValidationException(
                    "{0} can only be configured with "
                    "cohort filter {1}.".format(
                        CohortFilter.CLASSIFICATION_OUTCOME,
                        CohortFilterMethods.METHOD_INCLUDES))

            for classification_outcome in self.arg:
                if classification_outcome not in ClassificationOutcomes.ALL:
                    raise UserConfigValidationException(
                        "{0} can only take argument values from {1}.".format(
                            CohortFilter.CLASSIFICATION_OUTCOME,
                            " or ".join(ClassificationOutcomes.ALL)))
        elif self.column == CohortFilter.REGRESSION_ERROR:
            # "Error" Filter validations
            if is_classification:
                raise UserConfigValidationException(
                    "{0} cannot be configured for classification"
                    " scenarios.".format(CohortFilter.REGRESSION_ERROR))

            if self.method == CohortFilterMethods.METHOD_INCLUDES or \
                    self.method == CohortFilterMethods.METHOD_EXCLUDES:
                raise UserConfigValidationException(
                    "{0} cannot be configured with either {1} or {2}.".format(
                        CohortFilter.REGRESSION_ERROR,
                        CohortFilterMethods.METHOD_INCLUDES,
                        CohortFilterMethods.METHOD_EXCLUDES))

            if not all(isinstance(entry, int) for entry in self.arg) and \
                    not all(isinstance(entry, float) for entry in self.arg):
                raise UserConfigValidationException(
                    "All entries in arg should be of type int or float"
                    " for {} cohort.".format(CohortFilter.REGRESSION_ERROR))
        elif self.column == CohortFilter.PREDICTED_Y or \
                self.column == CohortFilter.TRUE_Y:
            # "Predicted Y/True Y" Filter validations
            if is_classification:
                if self.method != CohortFilterMethods.METHOD_INCLUDES:
                    raise UserConfigValidationException(
                        "{0} can only be configured with "
                        "filter {1} for classification".format(
                            self.column, CohortFilterMethods.METHOD_INCLUDES))

                test_classes = np.unique(
                    test_data[target_column].values).tolist()

                if not all(entry in test_classes for entry in self.arg):
                    raise UserConfigValidationException(
                        "Found a class in arg which is not present in "
                        "test data")
            else:
                if self.method == CohortFilterMethods.METHOD_INCLUDES or \
                        self.method == CohortFilterMethods.METHOD_EXCLUDES:
                    raise UserConfigValidationException(
                        "{0} cannot be configured with "
                        "filter {1} for regression.".format(
                            self.column, self.method))
        else:
            # "Dataset" Filter validations
            if self.column in categorical_features:
                if self.method != CohortFilterMethods.METHOD_INCLUDES:
                    raise UserConfigValidationException(
                        "{0} is a categorical feature and should be only "
                        "configured with {1} cohort filter.".format(
                            self.column, CohortFilterMethods.METHOD_INCLUDES))

                categories = np.unique(test_data[self.column].values).tolist()

                for entry in self.arg:
                    if entry not in categories:
                        raise UserConfigValidationException(
                            "Found a category {0} in arg which is not present "
                            "in test data column {1}.".format(
                                entry, self.column))