コード例 #1
0
    def test_sets_data_balance_measures(
        self,
        adult_data_feature_balance_measures,
        adult_data_distribution_balance_measures,
        adult_data_aggregate_balance_measures,
    ):
        """Check that ``_set_data_balance_measures`` stores a keyed dict.

        A manager built with all-``None`` inputs starts with no measures;
        after the setter is called with the three measure fixtures, the
        stored value must be a dict containing the feature, distribution
        and aggregate keys.
        """
        manager = DataBalanceManager(
            train=None,
            test=None,
            target_column=None,
            classes=None,
            task_type=None,
        )
        # Nothing has been set yet on a freshly built manager.
        assert manager._data_balance_measures is None

        manager._set_data_balance_measures(
            adult_data_feature_balance_measures,
            adult_data_distribution_balance_measures,
            adult_data_aggregate_balance_measures,
        )
        assert manager._data_balance_measures is not None

        measures = manager._data_balance_measures
        assert isinstance(measures, dict)
        expected_keys = (
            FEATURE_BALANCE_MEASURES_KEY,
            DISTRIBUTION_BALANCE_MEASURES_KEY,
            AGGREGATE_BALANCE_MEASURES_KEY,
        )
        for expected_key in expected_keys:
            assert expected_key in measures
コード例 #2
0
 def test_validate_with_valid_input(self, adult_data):
     """Check that ``_validate`` accepts a fully specified manager."""
     train_frame, test_frame, cat_columns, label_col, labels = adult_data
     manager = DataBalanceManager(
         target_column=label_col,
         classes=labels,
         task_type=TaskType.CLASSIFICATION,
         train=train_frame,
         test=test_frame,
     )
     # Set the columns directly instead of going through add().
     manager._cols_of_interest = cat_columns
     manager._validate()  # should not raise any exceptions
コード例 #3
0
    def test_validate_with_invalid_classes(self, adult_data, classes):
        """Check that an invalid ``classes`` value leads to a ValueError.

        ``classes`` arrives as a test argument; presumably it is
        parametrized with invalid values (the parametrization is not
        visible from this method).
        """
        train_frame, test_frame, _, label_col, _ = adult_data

        # NOTE(review): construction is inside the raises block as well, so
        # a ValueError from the constructor would also satisfy this test.
        with pytest.raises(ValueError):
            DataBalanceManager(
                target_column=label_col,
                classes=classes,
                task_type=TaskType.CLASSIFICATION,
                train=train_frame,
                test=test_frame,
            )._validate()
コード例 #4
0
    def test_add_errors_on_invalid_input_basic(self, adult_data, target_col,
                                               cols_of_interest):
        """Check that ``add`` fails with ValueError on invalid basic inputs.

        ``target_col`` and ``cols_of_interest`` arrive as test arguments;
        presumably they are parametrized with invalid combinations (the
        parametrization is not visible from this method).
        """
        train_frame, test_frame, _, _, labels = adult_data

        # NOTE(review): construction is inside the raises block as well, so
        # a ValueError from the constructor would also satisfy this test.
        with pytest.raises(ValueError):
            DataBalanceManager(
                target_column=target_col,
                classes=labels,
                task_type=TaskType.CLASSIFICATION,
                train=train_frame,
                test=test_frame,
            ).add(cols_of_interest=cols_of_interest)
コード例 #5
0
    def test_save_and_load_basic(self, tmpdir, adult_data):
        """Round-trip a manager on which ``add`` was never called.

        Saves the manager, loads it back through an ``RAIInsights``
        instance and checks that the shared data attributes match while
        the manager-specific state stays unset.
        """
        train_df, test_df, categorical_cols, target_col, classes = adult_data
        task_type = TaskType.CLASSIFICATION

        before = DataBalanceManager(
            target_column=target_col,
            classes=classes,
            task_type=task_type,
            train=train_df,
            test=test_df,
        )
        assert before._is_added is False

        out_dir = tmpdir.mkdir("save-dir")
        before._save(out_dir)  # should not create any configs

        insights = RAIInsights(
            model=None,
            train=train_df,
            test=test_df,
            target_column=target_col,
            task_type="classification",
            categorical_features=categorical_cols,
        )
        after = before._load(out_dir, insights)

        assert before._target_column == after._target_column
        assert after._target_column == target_col
        assert before._classes == after._classes
        assert after._classes == [str(label) for label in classes]
        assert before._task_type == after._task_type
        assert after._task_type == task_type
        for frame_attr in ("_train", "_test"):
            assert_frame_equal(
                getattr(before, frame_attr), getattr(after, frame_attr))
        assert before._is_added is after._is_added
        assert after._is_added is False

        # All the instances variables from RAIInsights get set.
        for frame_attr in ("_train", "_test"):
            assert_frame_equal(
                getattr(after, frame_attr), getattr(before, frame_attr))
        assert after._target_column == before._target_column
        # Also, df gets created using rai_insights.train and rai_insights.test
        assert_frame_equal(after._df, before._df)

        # All the instance variables specific to the manager, such as
        # _cols_of_interest, don't get set.
        assert after._is_added is False
        for unset_attr in ("_cols_of_interest", "_data_balance_measures"):
            assert getattr(after, unset_attr) is None
コード例 #6
0
    def _initialize_managers(self):
        """Create every insight manager and register it on the instance.

        Builds the causal, counterfactual, data balance, error analysis
        and explainer managers from the instance's model, data and
        metadata attributes, then collects them in that order in
        ``self._managers``.
        """
        model = self.model
        train, test = self.train, self.test
        target = self.target_column
        task = self.task_type
        categorical = self.categorical_features
        classes = self._classes

        self._causal_manager = CausalManager(
            train, test, target, task, categorical)

        self._counterfactual_manager = CounterfactualManager(
            model=model,
            train=train,
            test=test,
            target_column=target,
            task_type=task,
            categorical_features=categorical)

        self._data_balance_manager = DataBalanceManager(
            train=train,
            test=test,
            target_column=target,
            classes=classes,
            task_type=task)

        self._error_analysis_manager = ErrorAnalysisManager(
            model, test, target, classes, categorical)

        self._explainer_manager = ExplainerManager(
            model, train, test, target, classes,
            categorical_features=categorical)

        self._managers = [
            self._causal_manager,
            self._counterfactual_manager,
            self._data_balance_manager,
            self._error_analysis_manager,
            self._explainer_manager,
        ]
コード例 #7
0
 def test_init_with_invalid_input(self):
     """Check that all-``None`` constructor inputs are stored as ``None``.

     Construction itself must not raise; the combined ``_df`` is expected
     to be ``None`` as well.
     """
     manager = DataBalanceManager(
         target_column=None,
         classes=None,
         task_type=None,
         train=None,
         test=None,
     )
     none_attrs = (
         "_target_column",
         "_classes",
         "_task_type",
         "_train",
         "_test",
         "_df",
     )
     for attr_name in none_attrs:
         assert getattr(manager, attr_name) is None
コード例 #8
0
    def test_save_and_load_with_add_and_compute(self, tmpdir, adult_data):
        """Round-trip a manager after ``add`` and ``compute`` were called.

        The loaded manager must match the saved one on the data
        attributes, the columns of interest and the computed measures.
        """
        train_df, test_df, categorical_cols, target_col, classes = adult_data
        task_type = TaskType.CLASSIFICATION

        before = DataBalanceManager(
            target_column=target_col,
            classes=classes,
            task_type=task_type,
            train=train_df,
            test=test_df,
        )
        before.add(cols_of_interest=categorical_cols)
        before.compute()

        out_dir = tmpdir.mkdir("save-dir")
        before._save(out_dir)

        insights = RAIInsights(
            model=None,
            train=train_df,
            test=test_df,
            target_column=target_col,
            task_type="classification",
            categorical_features=categorical_cols,
        )
        after = before._load(out_dir, insights)

        assert before._target_column == after._target_column
        assert after._target_column == target_col
        assert before._classes == after._classes
        assert after._classes == [str(label) for label in classes]
        assert before._task_type == after._task_type
        assert after._task_type == task_type
        for frame_attr in ("_train", "_test"):
            assert_frame_equal(
                getattr(before, frame_attr), getattr(after, frame_attr))
        assert before._is_added is after._is_added
        assert after._is_added is True
        assert_frame_equal(before._df, after._df)

        # Manager-specific state must survive the round trip too.
        assert before._cols_of_interest == after._cols_of_interest

        assert before._data_balance_measures == after._data_balance_measures
コード例 #9
0
    def test_compute_transforms_and_sets_data_balance_measures(
            self, adult_data):
        """Check that ``compute`` populates ``_data_balance_measures``.

        The measures stay ``None`` after ``add`` and are only set once
        ``compute`` has run.
        """
        train_frame, test_frame, cat_columns, label_col, labels = adult_data

        manager = DataBalanceManager(
            target_column=label_col,
            classes=labels,
            task_type=TaskType.CLASSIFICATION,
            train=train_frame,
            test=test_frame,
        )
        manager.add(cols_of_interest=cat_columns)

        # add() alone must not compute anything.
        assert manager._data_balance_measures is None
        manager.compute()
        assert manager._data_balance_measures is not None
コード例 #10
0
    def test_validate_with_invalid_input_advanced(self, adult_data):
        """Check that ``_validate`` rejects missing data and regression.

        Two invalid configurations are exercised in turn; each must end
        in a ValueError.
        """
        train, test, _, target_col, classes = adult_data

        invalid_cases = [
            # train and test not specified
            (None, None, TaskType.CLASSIFICATION),
            # task_type is not classification
            (train, test, TaskType.REGRESSION),
        ]
        for case_train, case_test, case_task_type in invalid_cases:
            with pytest.raises(ValueError):
                manager = DataBalanceManager(
                    train=case_train,
                    test=case_test,
                    target_column=target_col,
                    classes=classes,
                    task_type=case_task_type,
                )
                manager._validate()
コード例 #11
0
    def test_add_errors_on_invalid_input_advanced(self, adult_data):
        """Check that ``add`` rejects missing data and regression tasks.

        Two invalid configurations are exercised in turn; each must end
        in a ValueError.
        """
        train_df, test_df, categorical_cols, target_col, classes = adult_data

        invalid_cases = [
            # train and test not specified
            (None, None, TaskType.CLASSIFICATION),
            # task_type is not classification
            (train_df, test_df, TaskType.REGRESSION),
        ]
        for case_train, case_test, case_task_type in invalid_cases:
            with pytest.raises(ValueError):
                manager = DataBalanceManager(
                    train=case_train,
                    test=case_test,
                    target_column=target_col,
                    classes=classes,
                    task_type=case_task_type,
                )
                manager.add(cols_of_interest=categorical_cols)
コード例 #12
0
    def test_init_with_valid_input(self, adult_data):
        """Check constructor state for train-only, test-only and both.

        For each combination the manager must keep the very same frame
        objects it was given and expose their concatenation as ``_df``.
        """
        train_df, test_df, _, target_col, classes = adult_data

        # Try a combination of train only, test only, and both
        # Since fixtures are being used, easier not to use pytest.parametrize
        combinations = [(train_df, None), (None, test_df), (train_df, test_df)]
        for train, test in combinations:
            # pd.concat drops None entries, so this is the single provided
            # frame or train followed by test.
            combined = pd.concat([train, test])
            manager = DataBalanceManager(
                train=train,
                test=test,
                target_column=target_col,
                classes=classes,
                task_type=TaskType.CLASSIFICATION,
            )
            assert manager._target_column == target_col
            assert manager._train is train
            assert manager._test is test
            # Indexes are reset on both sides so only the rows are compared.
            assert_frame_equal(
                manager._df.reset_index(drop=True),
                combined.reset_index(drop=True),
            )
コード例 #13
0
class RAIInsights(RAIBaseInsights):
    """Defines the top-level Model Analysis API.
    Use RAIInsights to analyze errors, explain the most important
    features, compute counterfactuals and run causal analysis in a
    single API.
    """
    def __init__(self,
                 model: Optional[Any],
                 train: pd.DataFrame,
                 test: pd.DataFrame,
                 target_column: str,
                 task_type: str,
                 categorical_features: Optional[List[str]] = None,
                 classes: Optional[np.ndarray] = None,
                 serializer: Optional[Any] = None,
                 maximum_rows_for_test: int = 5000,
                 feature_metadata: Optional[FeatureMetadata] = None):
        """Creates an RAIInsights object.

        The inputs are validated first; the class labels, feature columns,
        feature ranges and categorical encodings are then derived before
        the base class is initialized and data balance is added.

        :param model: The model to compute RAI insights for.
            A model that implements sklearn.predict or sklearn.predict_proba
            or function that accepts a 2d ndarray.
        :type model: object
        :param train: The training dataset including the label column.
        :type train: pandas.DataFrame
        :param test: The test dataset including the label column.
        :type test: pandas.DataFrame
        :param target_column: The name of the label column.
        :type target_column: str
        :param task_type: The task to run, can be `classification` or
            `regression`.
        :type task_type: str
        :param categorical_features: The categorical feature names.
            ``None`` is treated as an empty list.
        :type categorical_features: list[str]
        :param classes: The class labels in the training dataset
        :type classes: numpy.ndarray
        :param serializer: Picklable custom serializer with save and load
            methods for custom model serialization.
            The save method writes the model to file given a parent directory.
            The load method returns the deserialized model from the same
            parent directory.
        :type serializer: object
        :param maximum_rows_for_test: Limit on size of test data
            (for performance reasons)
        :type maximum_rows_for_test: int
        :param feature_metadata: Feature metadata for the train/test
                                 dataset to identify different kinds
                                 of features in the dataset.
        :type feature_metadata: FeatureMetadata
        """
        if not categorical_features:
            categorical_features = []

        # Fail fast on bad inputs before deriving any state from them.
        self._validate_rai_insights_input_parameters(
            model=model,
            train=train,
            test=test,
            target_column=target_column,
            task_type=task_type,
            categorical_features=categorical_features,
            classes=classes,
            serializer=serializer,
            maximum_rows_for_test=maximum_rows_for_test,
            feature_metadata=feature_metadata)

        self._classes = RAIInsights._get_classes(
            task_type=task_type,
            train=train,
            target_column=target_column,
            classes=classes)

        feature_frame = test.drop(columns=[target_column])
        self._feature_columns = feature_frame.columns.tolist()
        self._feature_ranges = RAIInsights._get_feature_ranges(
            test=test,
            categorical_features=categorical_features,
            feature_columns=self._feature_columns)
        self._feature_metadata = feature_metadata

        self.categorical_features = categorical_features
        categorical_info = process_categoricals(
            all_feature_names=self._feature_columns,
            categorical_features=self.categorical_features,
            dataset=test.drop(columns=[target_column]))
        (self._categories,
         self._categorical_indexes,
         self._category_dictionary,
         self._string_ind_data) = categorical_info

        super().__init__(model, train, test, target_column, task_type,
                         serializer)

        # Runs after the base initializer; it relies on
        # self._data_balance_manager already being available.
        self._try_add_data_balance()

    def _initialize_managers(self):
        """Create every insight manager and register it on the instance.

        Builds the causal, counterfactual, data balance, error analysis
        and explainer managers from the instance's model, data and
        metadata attributes, then collects them in that order in
        ``self._managers``.
        """
        model = self.model
        train, test = self.train, self.test
        target = self.target_column
        task = self.task_type
        categorical = self.categorical_features
        classes = self._classes

        self._causal_manager = CausalManager(
            train, test, target, task, categorical)

        self._counterfactual_manager = CounterfactualManager(
            model=model,
            train=train,
            test=test,
            target_column=target,
            task_type=task,
            categorical_features=categorical)

        self._data_balance_manager = DataBalanceManager(
            train=train,
            test=test,
            target_column=target,
            classes=classes,
            task_type=task)

        self._error_analysis_manager = ErrorAnalysisManager(
            model, test, target, classes, categorical)

        self._explainer_manager = ExplainerManager(
            model, train, test, target, classes,
            categorical_features=categorical)

        self._managers = [
            self._causal_manager,
            self._counterfactual_manager,
            self._data_balance_manager,
            self._error_analysis_manager,
            self._explainer_manager,
        ]

    @staticmethod
    def _get_classes(task_type, train, target_column, classes):
        """Resolve the class labels for the given task.

        :param task_type: The task being run.
        :param train: The training dataset including the label column.
        :param target_column: The name of the label column.
        :param classes: Labels supplied by the caller, or None.
        :return: None for non-classification tasks; the supplied
            ``classes`` when given; otherwise the sorted distinct values
            of the target column in ``train``.
        """
        if task_type != ModelTask.CLASSIFICATION:
            return None
        if classes is not None:
            return classes

        inferred = train[target_column].unique()
        # sort the classes after calling unique in numeric case
        inferred.sort()
        return inferred

    def _try_add_data_balance(self):
        """Register the categorical features with the data balance manager.

        Nothing is added unless the task is classification, at least one
        categorical feature exists and the class labels are known.
        """
        if not self.task_type == ModelTask.CLASSIFICATION:
            return
        if len(self.categorical_features) == 0:
            return
        if self._classes is None:
            return

        self._data_balance_manager.add(
            cols_of_interest=self.categorical_features)

    def _validate_rai_insights_input_parameters(
            self,
            model: Any,
            train: pd.DataFrame,
            test: pd.DataFrame,
            target_column: str,
            task_type: str,
            categorical_features: List[str],
            classes: np.ndarray,
            serializer,
            maximum_rows_for_test: int,
            feature_metadata: Optional[FeatureMetadata] = None):
        """Validate the inputs for the RAIInsights constructor.

        Checks run in this order: task type, model/serializer pairing,
        serializer interface, train/test types and sizes, column
        consistency, categorical features, class labels, a one-row model
        smoke test, and finally the feature metadata.

        :param model: The model to compute RAI insights for.
            A model that implements sklearn.predict or sklearn.predict_proba
            or function that accepts a 2d ndarray.
        :type model: object
        :param train: The training dataset including the label column.
        :type train: pandas.DataFrame
        :param test: The test dataset including the label column.
        :type test: pandas.DataFrame
        :param target_column: The name of the label column.
        :type target_column: str
        :param task_type: The task to run, can be `classification` or
            `regression`.
        :type task_type: str
        :param categorical_features: The categorical feature names.
        :type categorical_features: list[str]
        :param classes: The class labels in the training dataset
        :type classes: numpy.ndarray
        :param serializer: Picklable custom serializer with save and load
            methods defined for model that is not serializable. The save
            method returns a dictionary state and load method returns the
            model.
        :type serializer: object
        :param maximum_rows_for_test: Limit on size of test data
            (for performance reasons)
        :type maximum_rows_for_test: int
        :param feature_metadata: Feature metadata for the train/test
                                 dataset to identify different kinds
                                 of features in the dataset.
        :type feature_metadata: FeatureMetadata
        :raises UserConfigValidationException: If any of the checks fails.
        """

        valid_tasks = [
            ModelTask.CLASSIFICATION.value, ModelTask.REGRESSION.value
        ]
        if task_type not in valid_tasks:
            message = (f"Unsupported task type '{task_type}'. "
                       f"Should be one of {valid_tasks}")
            raise UserConfigValidationException(message)

        if model is None:
            # A missing model is allowed, but only warned about.
            warnings.warn(
                'INVALID-MODEL-WARNING: No valid model is supplied. '
                'The explanations, error analysis and counterfactuals '
                'may not work')
            if serializer is not None:
                raise UserConfigValidationException(
                    'No valid model is specified but model '
                    'serializer provided.')

        if serializer is not None:
            if not hasattr(serializer, 'save'):
                raise UserConfigValidationException(
                    'The serializer does not implement save()')

            if not hasattr(serializer, 'load'):
                raise UserConfigValidationException(
                    'The serializer does not implement load()')

            try:
                pickle.dumps(serializer)
            except Exception:
                raise UserConfigValidationException(
                    'The serializer should be serializable via pickle')

        if not (isinstance(train, pd.DataFrame) and
                isinstance(test, pd.DataFrame)):
            raise UserConfigValidationException(
                "Unsupported data type for either train or test. "
                "Expecting pandas DataFrame for train and test.")

        if test.shape[0] > maximum_rows_for_test:
            msg_fmt = ('The test data has {0} rows, '
                       'but limit is set to {1} rows. '
                       'Please resample the test data or '
                       'adjust maximum_rows_for_test')
            raise UserConfigValidationException(
                msg_fmt.format(test.shape[0], maximum_rows_for_test))

        if set(train.columns) != set(test.columns):
            raise UserConfigValidationException(
                'The features in train and test data do not match')

        if target_column not in list(train.columns) or \
                target_column not in list(test.columns):
            raise UserConfigValidationException(
                'Target name {0} not present in train/test data'.format(
                    target_column))

        if categorical_features is not None and \
                len(categorical_features) > 0:
            if target_column in categorical_features:
                raise UserConfigValidationException(
                    'Found target name {0} in '
                    'categorical feature list'.format(target_column))

            difference_set = set(categorical_features) - set(train.columns)
            if len(difference_set) > 0:
                message = ("Feature names in categorical_features "
                           "do not exist in train data: "
                           f"{list(difference_set)}")
                raise UserConfigValidationException(message)

            # Each categorical column must support np.unique in both
            # datasets.
            for column in categorical_features:
                try:
                    np.unique(train[column])
                except Exception:
                    raise UserConfigValidationException(
                        "Error finding unique values in column {0}. "
                        "Please check your train data.".format(column))

                try:
                    np.unique(test[column])
                except Exception:
                    raise UserConfigValidationException(
                        "Error finding unique values in column {0}. "
                        "Please check your test data.".format(column))

        if classes is not None and task_type == ModelTask.CLASSIFICATION:
            # The supplied labels must match the distinct target values of
            # each dataset exactly (no missing and no extra labels).
            if set(train[target_column].unique()) != set(classes):
                raise UserConfigValidationException(
                    'The train labels and distinct values in '
                    'target (train data) do not match')

            if set(test[target_column].unique()) != set(classes):
                raise UserConfigValidationException(
                    'The train labels and distinct values in '
                    'target (test data) do not match')

        if model is not None:
            # Pick one row from train and test data
            small_train_data = train.iloc[0:1].drop([target_column], axis=1)
            small_test_data = test.iloc[0:1].drop([target_column], axis=1)

            small_train_features_before = list(small_train_data.columns)

            # Run predict() of the model
            try:
                model.predict(small_train_data)
                model.predict(small_test_data)
            except Exception:
                raise UserConfigValidationException(
                    'The model passed cannot be used for'
                    ' getting predictions via predict()')
            self._validate_features_same(small_train_features_before,
                                         small_train_data, SKLearn.PREDICT)

            # Run predict_proba() of the model
            if task_type == ModelTask.CLASSIFICATION:
                try:
                    model.predict_proba(small_train_data)
                    model.predict_proba(small_test_data)
                except Exception:
                    raise UserConfigValidationException(
                        'The model passed cannot be used for'
                        ' getting predictions via predict_proba()')
                # Only needed here: without a predict_proba call the
                # columns cannot have changed since the check above.
                self._validate_features_same(small_train_features_before,
                                             small_train_data,
                                             SKLearn.PREDICT_PROBA)

            if task_type == ModelTask.REGRESSION:
                if hasattr(model, SKLearn.PREDICT_PROBA):
                    raise UserConfigValidationException(
                        'The regression model '
                        'provided has a predict_proba function. '
                        'Please check the task_type.')

        if feature_metadata is not None:
            if not isinstance(feature_metadata, FeatureMetadata):
                raise UserConfigValidationException(
                    "Expecting type FeatureMetadata but got {0}".format(
                        type(feature_metadata)))

            feature_metadata.validate_feature_metadata_with_user_features(
                list(train.columns))

    def _validate_features_same(self, small_train_features_before,
                                small_train_data, function):
        """
        Validate the features are unmodified on the DataFrame.

        :param small_train_features_before: The features saved before
            an operation was performed.
        :type small_train_features_before: list[str]
        :param small_train_data: The DataFrame after the operation.
        :type small_train_data: pandas.DataFrame
        :param function: The name of the operation performed.
        :type function: str
        """
        small_train_features_after = list(small_train_data.columns)
        if small_train_features_before != small_train_features_after:
            raise UserConfigValidationException(
                ('Calling model {} function modifies '
                 'input dataset features. Please check if '
                 'predict function is defined correctly.').format(function))

    @property
    def causal(self) -> CausalManager:
        """Get the causal manager.

        Read-only accessor for the manager stored in
        ``self._causal_manager``.

        :return: The causal manager.
        :rtype: CausalManager
        """
        return self._causal_manager

    @property
    def counterfactual(self) -> CounterfactualManager:
        """Get the counterfactual manager.

        Read-only accessor for the manager stored in
        ``self._counterfactual_manager``.

        :return: The counterfactual manager.
        :rtype: CounterfactualManager
        """
        return self._counterfactual_manager

    @property
    def error_analysis(self) -> ErrorAnalysisManager:
        """Get the error analysis manager.

        Read-only accessor for the manager stored in
        ``self._error_analysis_manager``.

        :return: The error analysis manager.
        :rtype: ErrorAnalysisManager
        """
        return self._error_analysis_manager

    @property
    def explainer(self) -> ExplainerManager:
        """Get the explainer manager.

        Read-only accessor for the manager stored in
        ``self._explainer_manager``.

        :return: The explainer manager.
        :rtype: ExplainerManager
        """
        return self._explainer_manager

    def get_filtered_test_data(self,
                               filters,
                               composite_filters,
                               include_original_columns_only=False):
        """Return the test rows selected by the given cohort filters.

        The model's predictions on the test features are computed first
        and handed to the cohort filter together with the true labels.

        :param filters: The filters to apply.
        :type filters: list[Filter]
        :param composite_filters: The composite filters to apply.
        :type composite_filters: list[CompositeFilter]
        :param include_original_columns_only: Whether to return the original
                                              data columns.
        :type include_original_columns_only: bool
        :return: The filtered test data.
        :rtype: pandas.DataFrame
        """
        label = self.target_column
        predictions = self.model.predict(self.test.drop(columns=[label]))

        # A fresh label-free copy, separate from the one passed to predict.
        feature_frame = self.test.drop(columns=[label])
        cohort_filter = FilterDataWithCohortFilters(
            model=self.model,
            dataset=feature_frame,
            features=feature_frame.columns,
            categorical_features=self.categorical_features,
            categories=self._categories,
            true_y=self.test[label],
            pred_y=predictions,
            model_task=self.task_type)

        filtered = cohort_filter.filter_data_from_cohort(
            filters=filters,
            composite_filters=composite_filters,
            include_original_columns_only=include_original_columns_only)
        return filtered

    def get_data(self):
        """Collect the dataset and every manager's data into one object.

        :return: Model Analysis Data
        :rtype: RAIInsightsData
        """
        insights_data = RAIInsightsData()
        insights_data.dataset = self._get_dataset()
        # Each section comes from the matching manager's get_data().
        insights_data.modelExplanationData = self.explainer.get_data()
        insights_data.errorAnalysisData = self.error_analysis.get_data()
        insights_data.causalAnalysisData = self.causal.get_data()
        insights_data.counterfactualData = self.counterfactual.get_data()
        return insights_data

    def _get_dataset(self):
        """Build the dashboard Dataset from the test data and the model.

        Fills in task type, categorical features, class names, feature
        metadata, data balance measures, the feature matrix, feature names,
        the true labels and, when a model is set, its predictions (plus
        probabilities when ``is_classifier`` accepts the model). For
        classification with known class names, true and predicted labels
        are stored as indexes into ``class_names``.

        :return: The populated dashboard dataset.
        :rtype: Dataset
        :raises ValueError: If the test data or a model output cannot be
            converted to a list, the model fails to predict, a label is not
            in the class names, the data exceeds 100000 rows or 1000
            features, or the number of feature names differs from the
            width of the feature matrix.
        """
        dashboard_dataset = Dataset()
        dashboard_dataset.task_type = self.task_type
        dashboard_dataset.categorical_features = self.categorical_features
        dashboard_dataset.class_names = convert_to_list(self._classes)

        if self._feature_metadata is not None:
            dashboard_dataset.feature_metadata = \
                self._feature_metadata.to_dict()
        else:
            dashboard_dataset.feature_metadata = None

        dashboard_dataset.data_balance_measures = \
            self._data_balance_manager.get_data()

        predicted_y = None
        feature_length = None

        # Labels are stored as class indexes only for classification tasks
        # with known class names; the same rule applies to both the
        # predicted and the true labels below.
        class_names = dashboard_dataset.class_names
        encode_labels = (self.task_type == "classification"
                         and class_names is not None)

        # Features only: the target column is reported separately as true_y.
        dataset: pd.DataFrame = self.test.drop([self.target_column], axis=1)

        if isinstance(dataset, pd.DataFrame):
            self._dataframeColumns = dataset.columns
        try:
            list_dataset = convert_to_list(dataset)
        except Exception as ex:
            raise ValueError("Unsupported dataset type") from ex
        if dataset is not None and self.model is not None:
            try:
                predicted_y = self.model.predict(dataset)
            except Exception as ex:
                # Parenthesised so both literals form a single message; a
                # bare string on its own line is a separate statement.
                msg = ("Model does not support predict method for given"
                       " dataset type")
                raise ValueError(msg) from ex
            try:
                predicted_y = convert_to_list(predicted_y)
            except Exception as ex:
                raise ValueError(
                    "Model prediction output of unsupported type,") from ex
        if predicted_y is not None:
            if encode_labels:
                predicted_y = [class_names.index(y) for y in predicted_y]
            dashboard_dataset.predicted_y = predicted_y
        row_length = 0

        if list_dataset is not None:
            row_length, feature_length = np.shape(list_dataset)
            if row_length > 100000:
                raise ValueError("Exceeds maximum number of rows"
                                 " for visualization (100000)")
            if feature_length > 1000:
                raise ValueError("Exceeds maximum number of features for"
                                 " visualization (1000). Please regenerate the"
                                 " explanation using fewer features or"
                                 " initialize the dashboard without passing a"
                                 " dataset.")
            dashboard_dataset.features = list_dataset

        true_y = self.test[self.target_column]

        # true_y is only attached when it lines up with the feature rows.
        if true_y is not None and len(true_y) == row_length:
            if encode_labels:
                true_y = [class_names.index(y) for y in true_y]
            dashboard_dataset.true_y = convert_to_list(true_y)

        features = dataset.columns

        if features is not None:
            features = convert_to_list(features)
            if feature_length is not None and len(features) != feature_length:
                raise ValueError("Feature vector length mismatch:"
                                 " feature names length differs"
                                 " from local explanations dimension")
            dashboard_dataset.feature_names = features
        dashboard_dataset.target_column = self.target_column
        if is_classifier(self.model) and dataset is not None:
            try:
                probability_y = self.model.predict_proba(dataset)
            except Exception as ex:
                raise ValueError("Model does not support predict_proba method"
                                 " for given dataset type,") from ex
            try:
                probability_y = convert_to_list(probability_y)
            except Exception as ex:
                raise ValueError(
                    "Model predict_proba output of unsupported type,") from ex
            dashboard_dataset.probability_y = probability_y

        return dashboard_dataset

    def _save_predictions(self, path):
        """Write the model's predict() and predict_proba() output as JSON.

        The predictions directory is created even when no model is set;
        in that case no files are written into it.

        :param path: The directory path to save the RAIInsights to.
        :type path: str
        """
        output_dir = Path(path) / _PREDICTIONS
        output_dir.mkdir(parents=True, exist_ok=True)

        if self.model is None:
            return

        # The model is scored on the test features only.
        features = self.test.drop([self.target_column], axis=1)

        predictions = self.model.predict(features)
        predict_file = output_dir / (_PREDICT + _JSON_EXTENSION)
        self._write_to_file(predict_file, json.dumps(predictions.tolist()))

        # Probabilities are saved only when the model exposes the method.
        if not hasattr(self.model, SKLearn.PREDICT_PROBA):
            return

        probabilities = self.model.predict_proba(features)
        proba_file = output_dir / (_PREDICT_PROBA + _JSON_EXTENSION)
        self._write_to_file(proba_file, json.dumps(probabilities.tolist()))

    def _save_metadata(self, path):
        """Write the metadata JSON file into the given directory.

        The file records the target column, task type, categorical
        features, classes, feature columns, feature ranges and the
        feature metadata (``None`` when no feature metadata is set).

        :param path: The directory path to save the RAIInsights to.
        :type path: str
        """
        class_list = convert_to_list(self._classes)

        if self._feature_metadata is None:
            serialized_feature_metadata = None
        else:
            serialized_feature_metadata = self._feature_metadata.to_dict()

        metadata = {
            _TARGET_COLUMN: self.target_column,
            _TASK_TYPE: self.task_type,
            _CATEGORICAL_FEATURES: self.categorical_features,
            _CLASSES: class_list,
            _FEATURE_COLUMNS: self._feature_columns,
            _FEATURE_RANGES: self._feature_ranges,
            _FEATURE_METADATA: serialized_feature_metadata,
        }

        meta_path = Path(path) / _META_JSON
        with open(meta_path, 'w') as meta_file:
            json.dump(metadata, meta_file)

    @staticmethod
    def _get_feature_ranges(test, categorical_features, feature_columns):
        """Describe the value range of every feature column.

        Columns listed in ``categorical_features`` get their unique values;
        every other column gets its minimum and maximum as floats and is
        labelled with the "integer" range type.

        :param test: Data indexable by column name.
        :param categorical_features: Names of the categorical columns.
        :param feature_columns: Names of the columns to describe.
        :return: One dictionary per entry of ``feature_columns``, in order.
        :rtype: list
        """
        ranges = []
        for column in feature_columns:
            if column in categorical_features:
                entry = {
                    _COLUMN_NAME: column,
                    _RANGE_TYPE: "categorical",
                    _UNIQUE_VALUES: test[column].unique().tolist(),
                }
            else:
                entry = {
                    _COLUMN_NAME: column,
                    _RANGE_TYPE: "integer",
                    _MIN_VALUE: float(test[column].min()),
                    _MAX_VALUE: float(test[column].max()),
                }
            ranges.append(entry)
        return ranges

    @staticmethod
    def _load_metadata(inst, path):
        """Restore metadata-derived attributes on a RAIInsights instance.

        Reads the metadata JSON file under ``path`` and writes the target
        column, task type, categorical features, classes, feature columns,
        feature ranges, feature metadata and the categorical processing
        results straight into ``inst.__dict__``. The train and test entries
        must already be present in ``inst.__dict__``.

        :param inst: RAIInsights object instance.
        :type inst: RAIInsights
        :param path: The directory path to metadata location.
        :type path: str
        """
        with open(Path(path) / _META_JSON, 'r') as meta_file:
            meta = json.load(meta_file)

        state = inst.__dict__
        state[_TARGET_COLUMN] = meta[_TARGET_COLUMN]
        state[_TASK_TYPE] = meta[_TASK_TYPE]
        state[_CATEGORICAL_FEATURES] = meta[_CATEGORICAL_FEATURES]

        # The train-labels key takes precedence over the classes key when
        # both could supply the class list.
        if _TRAIN_LABELS in meta:
            saved_classes = meta[_TRAIN_LABELS]
        else:
            saved_classes = meta[_CLASSES]

        state['_' + _CLASSES] = RAIInsights._get_classes(
            task_type=meta[_TASK_TYPE],
            train=state[_TRAIN],
            target_column=meta[_TARGET_COLUMN],
            classes=saved_classes)

        state['_' + _FEATURE_COLUMNS] = meta[_FEATURE_COLUMNS]
        state['_' + _FEATURE_RANGES] = meta[_FEATURE_RANGES]

        raw_metadata = meta[_FEATURE_METADATA]
        if raw_metadata is None:
            state['_' + _FEATURE_METADATA] = None
        else:
            state['_' + _FEATURE_METADATA] = FeatureMetadata(
                identity_feature_name=raw_metadata['identity_feature_name'],
                datetime_features=raw_metadata['datetime_features'],
                categorical_features=raw_metadata['categorical_features'],
                dropped_features=raw_metadata['dropped_features'],
            )

        # Categorical processing runs on the test features without the
        # target column.
        categorical_info = process_categoricals(
            all_feature_names=state['_' + _FEATURE_COLUMNS],
            categorical_features=state[_CATEGORICAL_FEATURES],
            dataset=state[_TEST].drop(columns=[state[_TARGET_COLUMN]]))
        (state['_' + _CATEGORIES],
         state['_' + _CATEGORICAL_INDEXES],
         state['_' + _CATEGORY_DICTIONARY],
         state['_' + _STRING_IND_DATA]) = categorical_info

    @staticmethod
    def load(path):
        """Rebuild a RAIInsights object from a saved directory.

        :param path: The directory path to load the RAIInsights from.
        :type path: str
        :return: The RAIInsights object after loading.
        :rtype: RAIInsights
        """
        # Manager classes handed to the base loader, keyed by manager name.
        manager_map = {
            ManagerNames.CAUSAL: CausalManager,
            ManagerNames.COUNTERFACTUAL: CounterfactualManager,
            ManagerNames.DATA_BALANCE: DataBalanceManager,
            ManagerNames.ERROR_ANALYSIS: ErrorAnalysisManager,
            ManagerNames.EXPLAINER: ExplainerManager,
        }

        # Allocate the object through __new__ so __init__ is not run
        # (similar to pickle); the base loader then restores its state.
        restored = RAIInsights.__new__(RAIInsights)
        RAIBaseInsights._load(
            path, restored, manager_map, RAIInsights._load_metadata)

        return restored