def test_serialize_json_safe_basic(self):
    """Flat lists of ints or of strings should come back unchanged."""
    int_values = [0, 1, 2, 3, 4, 5]
    serialized_ints = serialize_json_safe(int_values)
    assert serialized_ints == [0, 1, 2, 3, 4, 5]

    str_values = ['a', 'b', 'a', 'c', 'a', 'b']
    serialized_strs = serialize_json_safe(str_values)
    assert serialized_strs == ['a', 'b', 'a', 'c', 'a', 'b']
def test_serialize_via_json_timestamp(self):
    """Serialized pandas timestamps must survive ``json.dumps``.

    Covers both a bare ``pd.Timestamp`` and one wrapped in a numpy array;
    in each case the JSON text has to mention the year.
    """
    scalar_ts = pd.Timestamp(2020, 1, 1)
    assert isinstance(scalar_ts, pd.Timestamp)
    scalar_json = json.dumps(serialize_json_safe(scalar_ts))
    assert scalar_json is not None
    assert "2020" in scalar_json

    array_ts = np.array([pd.Timestamp(2020, 1, 1)])
    array_json = json.dumps(serialize_json_safe(array_ts))
    assert array_json is not None
    assert "2020" in array_json
def test_serialize_json_safe_aggregate_types(self):
    """Containers (dict, tuple, 2D numpy array) serialize element-wise."""
    mapping = {'a': [1, 2, 3], 'c': 'b'}
    assert serialize_json_safe(mapping) == mapping

    pair = ('a', [1, 2, 3])
    assert serialize_json_safe(pair) == pair

    # A numpy matrix is expected to come back as nested Python lists.
    matrix = np.array([[1, 2, 3], [4, 5, 6]])
    assert serialize_json_safe(matrix) == matrix.tolist()
def test_serialize_json_safe_missing(self):
    """NaN and infinity entries are expected to be serialized as 0."""
    with_nan = [0, np.nan, 2, 3, 4, 5]
    assert serialize_json_safe(with_nan) == [0, 0, 2, 3, 4, 5]

    with_inf = [0, np.inf, 2, 3, 4, 5]
    assert serialize_json_safe(with_inf) == [0, 0, 2, 3, 4, 5]

    # Same expectation when the NaN sits among strings.
    strings_with_nan = ['a', 'b', 'a', np.nan, 'a', 'b']
    assert serialize_json_safe(strings_with_nan) == \
        ['a', 'b', 'a', 0, 'a', 'b']
def _get_dashboard_data(self):
    """Return the JSON-safe form of the dashboard object.

    The value is computed on first use from ``_get_dashboard_object()``
    and stored on ``self._dashboard_data``; later calls return the
    stored value.
    """
    if self._dashboard_data is not None:
        return self._dashboard_data
    raw_dashboard = self._get_dashboard_object()
    self._dashboard_data = serialize_json_safe(raw_dashboard)
    return self._dashboard_data
def test_embedded_object(self):
    """Nested custom objects are expected to flatten into nested dicts."""

    class A:
        def __init__(self):
            self.a_data = 'a'

    class B:
        def __init__(self):
            self.b_data = A()

    payload = {'B': B()}
    expected = {'B': {'b_data': {'a_data': 'a'}}}
    assert serialize_json_safe(payload) == expected
def __init__(self, explanation, model, dataset, true_y, classes, features):
    """Initialize the Explanation Dashboard Input.

    Collects the explanation, dataset, labels, predictions and metadata
    into ``self.dashboard_input``, validating sizes and shapes on the way.

    :param explanation: An object that represents an explanation.
    :type explanation: ExplanationMixin
    :param model: An object that represents a model.
        It is assumed that for the classification case
        it has a method of predict_proba() returning
        the prediction probabilities for each
        class and for the regression case a method of predict()
        returning the prediction value.
    :type model: object
    :param dataset: A matrix of feature vector examples
        (# examples x # features), the same samples
        used to build the explanation.
        Will overwrite any set on explanation object already.
        Must have at most 100000 rows and at most 1000 columns.
    :type dataset: numpy.array or list[][]
    :param true_y: The true labels for the provided dataset.
        Will overwrite any set on explanation object already.
    :type true_y: numpy.array or list[]
    :param classes: The class names.
    :type classes: numpy.array or list[]
    :param features: Feature names.
    :type features: numpy.array or list[]
    :raises ValueError: If the dataset, predictions or explanations
        cannot be converted to lists, if the model fails to predict on
        the dataset, if the row/feature limits are exceeded, or if the
        explanation, feature names or class names do not match the
        expected dimensions.
    """
    self._model = model
    self._is_classifier = model is not None\
        and hasattr(model, SKLearn.PREDICT_PROBA) and \
        model.predict_proba is not None
    self._dataframeColumns = None
    self.dashboard_input = {}
    # List of explanations, key of explanation type is "explanation_type"
    if explanation is not None:
        self._mli_explanations = explanation.data(-1)["mli"]
    else:
        self._mli_explanations = None
    local_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_LOCAL_EXPLANATION_KEY)
    global_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_GLOBAL_EXPLANATION_KEY)
    ebm_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_EBM_GLOBAL_EXPLANATION_KEY)
    dataset_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_EXPLANATION_DATASET_KEY)

    if explanation is not None and hasattr(explanation, 'method'):
        self.dashboard_input[
            ExplanationDashboardInterface.EXPLANATION_METHOD
        ] = explanation.method

    predicted_y = None
    feature_length = None
    # Explicitly passed dataset / true_y win; otherwise fall back to the
    # values stored on the explanation's dataset entry.
    if dataset_explanation is not None:
        if dataset is None:
            dataset = dataset_explanation[
                ExplanationDashboardInterface.MLI_DATASET_X_KEY]
        if true_y is None:
            true_y = dataset_explanation[
                ExplanationDashboardInterface.MLI_DATASET_Y_KEY]

    if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
        self._dataframeColumns = dataset.columns
        self._dfdtypes = dataset.dtypes
    try:
        list_dataset = self._convert_to_list(dataset)
    except Exception as ex:
        ex_str = _format_exception(ex)
        raise ValueError(
            "Unsupported dataset type, inner error: {}".format(ex_str)
        ) from ex

    if dataset is not None and model is not None:
        try:
            predicted_y = model.predict(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            msg = ("Model does not support predict method for given"
                   " dataset type, inner error: {}".format(ex_str))
            raise ValueError(msg) from ex
        try:
            predicted_y = self._convert_to_list(predicted_y)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Model prediction output of unsupported type,"
                " inner error: {}".format(ex_str)) from ex
    if predicted_y is not None:
        self.dashboard_input[
            ExplanationDashboardInterface.PREDICTED_Y
        ] = predicted_y

    row_length = 0
    if list_dataset is not None:
        row_length, feature_length = np.shape(list_dataset)
        if row_length > 100000:
            raise ValueError("Exceeds maximum number of rows"
                             " for visualization (100000)")
        if feature_length > 1000:
            raise ValueError("Exceeds maximum number of features for"
                             " visualization (1000). Please regenerate the"
                             " explanation using fewer features or"
                             " initialize the dashboard without passing a"
                             " dataset.")
        self.dashboard_input[
            ExplanationDashboardInterface.TRAINING_DATA
        ] = serialize_json_safe(list_dataset)
        self.dashboard_input[
            ExplanationDashboardInterface.IS_CLASSIFIER
        ] = self._is_classifier

    local_dim = None

    # true_y is only forwarded when it lines up with the dataset rows.
    if true_y is not None and len(true_y) == row_length:
        self.dashboard_input[
            ExplanationDashboardInterface.TRUE_Y
        ] = self._convert_to_list(true_y)

    if local_explanation is not None:
        try:
            local_explanation["scores"] = self._convert_to_list(
                local_explanation["scores"])
            # NOTE: this ValueError is caught by the handler below and
            # re-raised wrapped in the "Unsupported local explanation
            # type" message.
            if np.shape(local_explanation["scores"])[-1] > 1000:
                raise ValueError("Exceeds maximum number of features for "
                                 "visualization (1000). Please regenerate"
                                 " the explanation using fewer features.")
            local_explanation["intercept"] = self._convert_to_list(
                local_explanation["intercept"])
            # We can ignore perf explanation data.
            # Note if it is added back at any point,
            # the numpy values will need to be converted to python,
            # otherwise serialization fails.
            local_explanation["perf"] = None
            self.dashboard_input[
                ExplanationDashboardInterface.LOCAL_EXPLANATIONS
            ] = local_explanation
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported local explanation type,"
                " inner error: {}".format(ex_str)) from ex
        if list_dataset is not None:
            local_dim = np.shape(local_explanation["scores"])
            if len(local_dim) != 2 and len(local_dim) != 3:
                raise ValueError(
                    "Local explanation expected to be a 2D or 3D list")
            # 2D scores are compared as (rows, features).
            if len(local_dim) == 2 and (local_dim[1] != feature_length or
                                        local_dim[0] != row_length):
                raise ValueError("Shape mismatch: local explanation"
                                 " length differs from dataset")
            # 3D scores carry an extra leading axis before
            # (rows, features).
            if len(local_dim) == 3 and (local_dim[2] != feature_length or
                                        local_dim[1] != row_length):
                raise ValueError("Shape mismatch: local explanation"
                                 " length differs from dataset")

    # The global explanation is only used when no local one exists.
    if local_explanation is None and global_explanation is not None:
        try:
            global_explanation["scores"] = self._convert_to_list(
                global_explanation["scores"])
            if 'intercept' in global_explanation:
                global_explanation["intercept"] = self._convert_to_list(
                    global_explanation["intercept"])
            self.dashboard_input[
                ExplanationDashboardInterface.GLOBAL_EXPLANATION
            ] = global_explanation
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported global explanation type,"
                " inner error: {}".format(ex_str)) from ex

    if ebm_explanation is not None:
        self.dashboard_input[
            ExplanationDashboardInterface.EBM_EXPLANATION
        ] = ebm_explanation

    if features is None\
            and explanation is not None\
            and hasattr(explanation, 'features')\
            and explanation.features is not None:
        features = explanation.features
    if features is not None:
        features = self._convert_to_list(features)
        if feature_length is not None and len(features) != feature_length:
            raise ValueError("Feature vector length mismatch:"
                             " feature names length differs"
                             " from local explanations dimension")
        self.dashboard_input[
            ExplanationDashboardInterface.FEATURE_NAMES
        ] = features

    if classes is None\
            and explanation is not None\
            and hasattr(explanation, 'classes')\
            and explanation.classes is not None:
        classes = explanation.classes
    if classes is not None:
        classes = self._convert_to_list(classes)
        # NOTE(review): compares against the first axis of the local
        # explanation, which is the row count for 2D scores - confirm
        # this is intended for 2D explanations.
        if local_dim is not None and len(classes) != local_dim[0]:
            raise ValueError("Class vector length mismatch:"
                             " class names length differs from"
                             " local explanations dimension")
        self.dashboard_input[
            ExplanationDashboardInterface.CLASS_NAMES
        ] = classes

    # Same predicate as computed for self._is_classifier above.
    if self._is_classifier and dataset is not None:
        try:
            probability_y = model.predict_proba(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError("Model does not support predict_proba method"
                             " for given dataset type,"
                             " inner error: {}".format(ex_str)) from ex
        try:
            probability_y = self._convert_to_list(probability_y)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Model predict_proba output of unsupported type,"
                " inner error: {}".format(ex_str)) from ex
        self.dashboard_input[
            ExplanationDashboardInterface.PROBABILITY_Y
        ] = probability_y
def __init__(self, explanation, model, dataset, true_y, classes,
             features, categorical_features, true_y_dataset,
             pred_y, model_task, metric, max_depth, num_leaves):
    """Initialize the ErrorAnalysis Dashboard Input.

    Collects dataset, labels, predictions and metadata into
    ``self.dashboard_input``, builds the error analyzer and computes the
    root metric over the full dataset.

    :param explanation: An object that represents an explanation.
    :type explanation: ExplanationMixin
    :param model: An object that represents a model.
        It is assumed that for the classification case
        it has a method of predict_proba() returning
        the prediction probabilities for each
        class and for the regression case a method of predict()
        returning the prediction value.
    :type model: object
    :param dataset: A matrix of feature vector examples
        (# examples x # features), the same samples
        used to build the explanation.
        Will overwrite any set on explanation object already.
        Must have at most 1000 columns; when an explanation is
        provided, the explanation dataset must also have at most
        100000 rows.
    :type dataset: numpy.array or list[][] or pandas.DataFrame
    :param true_y: The true labels for the provided explanation.
        Will overwrite any set on explanation object already.
    :type true_y: numpy.array or list[]
    :param classes: The class names.
    :type classes: numpy.array or list[]
    :param features: Feature names.
    :type features: numpy.array or list[]
    :param categorical_features: The categorical feature names.
    :type categorical_features: list[str]
    :param true_y_dataset: The true labels for the provided dataset.
        Only needed if the explanation has a sample of instances from the
        original dataset.  Otherwise specify true_y parameter only.
    :type true_y_dataset: numpy.array or list[]
    :param pred_y: The predicted y values, can be passed in as an
        alternative to the model and explanation for a more limited
        view.
    :type pred_y: numpy.ndarray or list[]
    :param model_task: Optional parameter to specify whether the model
        is a classification or regression model. In most cases, the
        type of the model can be inferred based on the shape of the
        output, where a classifier has a predict_proba method and
        outputs a 2 dimensional array, while a regressor has a
        predict method and outputs a 1 dimensional array.
    :type model_task: str
    :param metric: The metric name to evaluate at each tree node or
        heatmap grid.  Currently supported classification metrics
        include 'error_rate', 'recall_score', 'precision_score',
        'f1_score', and 'accuracy_score'. Supported regression
        metrics include 'mean_absolute_error', 'mean_squared_error',
        'r2_score', and 'median_absolute_error'.
    :type metric: str
    :param max_depth: The maximum depth of the surrogate tree trained
        on errors.
    :type max_depth: int
    :param num_leaves: The number of leaves of the surrogate tree
        trained on errors.
    :type num_leaves: int
    :raises ValueError: If both model and pred_y are given, if the
        dataset or predict_proba output cannot be converted, if the
        row/feature limits are exceeded, or if the feature names do
        not match the dataset width.
    """
    self._model = model
    # Keep the untouched inputs: `dataset` / `true_y` may be replaced by
    # the values returned from input_explanation() below.
    full_dataset = dataset
    if true_y_dataset is None:
        full_true_y = true_y
    else:
        full_true_y = true_y_dataset
    self._categorical_features = categorical_features
    self._string_ind_data = None
    self._categories = []
    self._categorical_indexes = []
    self._is_classifier = model is not None\
        and hasattr(model, SKLearn.PREDICT_PROBA) and \
        model.predict_proba is not None
    self._dataframeColumns = None
    self.dashboard_input = {}
    has_explanation = explanation is not None
    feature_length = None
    self._max_depth = max_depth
    self._num_leaves = num_leaves

    if has_explanation:
        if classes is None:
            has_classes_attr = hasattr(explanation, 'classes')
            if has_classes_attr and explanation.classes is not None:
                classes = explanation.classes
        dataset, true_y = self.input_explanation(explanation,
                                                 dataset,
                                                 true_y)
        row_length = len(dataset)
        # Only check dataset on explanation for row length bounds
        if row_length > 100000:
            raise ValueError("Exceeds maximum number of rows"
                             " for visualization (100000)")

    if classes is not None:
        classes = self._convert_to_list(classes)
        self.dashboard_input[
            ExplanationDashboardInterface.CLASS_NAMES
        ] = classes
        # Only defined when classes is given; every later use is guarded
        # by the same `classes is not None` check.
        class_to_index = {k: v for v, k in enumerate(classes)}

    if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
        self._dataframeColumns = dataset.columns
        self._dfdtypes = dataset.dtypes
    try:
        list_dataset = self._convert_to_list(dataset)
    except Exception as ex:
        ex_str = _format_exception(ex)
        raise ValueError(
            "Unsupported dataset type, inner error: {}".format(ex_str)
        ) from ex

    if has_explanation:
        self.input_explanation_data(explanation, list_dataset, classes)
        if features is None and hasattr(explanation, 'features'):
            features = explanation.features

    model_available = model is not None

    if model_available and pred_y is not None:
        raise ValueError(
            'Only model or pred_y can be specified, not both')

    self.dashboard_input[ENABLE_PREDICT] = model_available

    if model_available:
        predicted_y = self.compute_predicted_y(model, dataset)
    else:
        predicted_y = self.predicted_y_to_list(pred_y)

    if predicted_y is not None:
        # If classes specified, convert predicted_y to
        # numeric representation
        if classes is not None and predicted_y[0] in class_to_index:
            # Replaced in place (not rebuilt) so the list object the
            # original loop mutated is still the one being updated.
            for i, label in enumerate(predicted_y):
                predicted_y[i] = class_to_index[label]
        self.dashboard_input[
            ExplanationDashboardInterface.PREDICTED_Y
        ] = predicted_y

    row_length = 0
    if list_dataset is not None:
        row_length, feature_length = np.shape(list_dataset)
        if feature_length > 1000:
            raise ValueError("Exceeds maximum number of features for"
                             " visualization (1000). Please regenerate the"
                             " explanation using fewer features or"
                             " initialize the dashboard without passing a"
                             " dataset.")
        self.dashboard_input[
            ExplanationDashboardInterface.TRAINING_DATA
        ] = serialize_json_safe(list_dataset)
        self.dashboard_input[
            ExplanationDashboardInterface.IS_CLASSIFIER
        ] = self._is_classifier

    # true_y is only forwarded when it lines up with the dataset rows.
    if true_y is not None and len(true_y) == row_length:
        list_true_y = self._convert_to_list(true_y)
        # If classes specified, convert true_y to numeric representation
        if classes is not None and list_true_y[0] in class_to_index:
            for i, label in enumerate(list_true_y):
                list_true_y[i] = class_to_index[label]
        self.dashboard_input[
            ExplanationDashboardInterface.TRUE_Y
        ] = list_true_y

    if features is not None:
        features = self._convert_to_list(features)
        if feature_length is not None and len(features) != feature_length:
            raise ValueError("Feature vector length mismatch:"
                             " feature names length differs"
                             " from local explanations dimension")
        self.dashboard_input[FEATURE_NAMES] = features

    # Same predicate as computed for self._is_classifier above.
    if self._is_classifier and dataset is not None:
        try:
            probability_y = model.predict_proba(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError("Model does not support predict_proba method"
                             " for given dataset type,"
                             " inner error: {}".format(ex_str)) from ex
        try:
            probability_y = self._convert_to_list(probability_y)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Model predict_proba output of unsupported type,"
                " inner error: {}".format(ex_str)) from ex
        self.dashboard_input[
            ExplanationDashboardInterface.PROBABILITY_Y
        ] = probability_y

    if model_available:
        self._error_analyzer = ModelAnalyzer(model,
                                             full_dataset,
                                             full_true_y,
                                             features,
                                             categorical_features,
                                             model_task,
                                             metric)
    else:
        # Model task cannot be unknown when passing predictions
        # Assume classification for backwards compatibility
        if model_task == ModelTask.UNKNOWN:
            model_task = ModelTask.CLASSIFICATION
        self._error_analyzer = PredictionsAnalyzer(pred_y,
                                                   full_dataset,
                                                   full_true_y,
                                                   features,
                                                   categorical_features,
                                                   model_task,
                                                   metric)

    if self._categorical_features:
        self.dashboard_input[
            ExplanationDashboardInterface.CATEGORICAL_MAP
        ] = self._error_analyzer.category_dictionary

    # Compute metrics on all data cohort
    is_classification = \
        self._error_analyzer.model_task == ModelTask.CLASSIFICATION
    metric = self._error_analyzer.metric
    if metric is None:
        # Fall back to the per-task default metric.
        if is_classification:
            metric = Metrics.ERROR_RATE
        else:
            metric = Metrics.MEAN_SQUARED_ERROR

    # When separate full-dataset labels were supplied, predictions are
    # recomputed on the full dataset rather than the explanation sample.
    if model_available and true_y_dataset is not None:
        full_predicted_y = self.compute_predicted_y(model, full_dataset)
    else:
        full_predicted_y = predicted_y
    self.set_root_metric(full_predicted_y, full_true_y, metric)
def test_unknown(self):
    """A complex number is expected to pass through untouched."""
    unsupported = complex(1, 2)
    serialized = serialize_json_safe([unsupported, 42])
    assert serialized == [unsupported, 42]
def test_numpy(self):
    """A 1D numpy array is expected to serialize to a plain list."""
    vector = np.array([1, 2, 3])
    assert serialize_json_safe(vector) == [1, 2, 3]
def test_serialize_timestamp(self):
    """A serialized ``datetime`` must contain its date text."""
    date_text = "2020-10-10"
    parsed = datetime.datetime.strptime(date_text, "%Y-%m-%d")
    serialized = serialize_json_safe(parsed)
    assert date_text in serialized