# Example 1
    def _add_filter_cols(self, df, filters):
        """Adds special columns to the dataset for filtering and postprocessing.

        :param df: The dataset as a pandas dataframe.
        :type df: pandas.DataFrame
        :param filters: The filters.
        :type filters: list[dict]
        """
        has_classification_outcome = self._filters_has_classification_outcome(
            filters)
        has_regression_error = self._filters_has_regression_error(filters)

        if isinstance(self.true_y, str):
            # true_y names an existing column; normalize it to TRUE_Y.
            # DataFrame.rename returns a copy unless inplace=True, so the
            # previous call without inplace had no effect on df.
            df.rename(columns={self.true_y: TRUE_Y}, inplace=True)
        else:
            df[TRUE_Y] = self.true_y

        if self.model is None:
            # No model to predict with; use the precomputed predictions.
            df[PRED_Y] = self.pred_y

        if not is_spark(df):
            df[ROW_INDEX] = np.arange(0, len(self.true_y))

        if has_classification_outcome or has_regression_error:
            # Both outcome computations need predictions; compute them once.
            if PRED_Y in df:
                pred_y = df[PRED_Y]
            else:
                # calculate directly via prediction on model
                pred_y = self.model.predict(
                    df.drop(columns=[TRUE_Y, ROW_INDEX]))

        if has_classification_outcome:
            classes = get_ordered_classes(
                self.classes, self.true_y, pred_y)

            # calculate classification outcome and add to df
            if len(classes) == 2:
                df[CLASSIFICATION_OUTCOME] = \
                    self._compute_binary_classification_outcome_data(
                        self.true_y, pred_y, classes)
            else:
                df[CLASSIFICATION_OUTCOME] = \
                    self._compute_multiclass_classification_outcome_data(
                        self.true_y, pred_y)
        elif has_regression_error:
            # calculate regression error and add to df
            df[REGRESSION_ERROR] = self._compute_regression_error_data(
                self.true_y, pred_y)
def add_filter_cols(analyzer, df, filters, true_y):
    """Adds special columns to the dataset for filtering and postprocessing.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param df: The dataset as a pandas dataframe.
    :type df: pandas.DataFrame
    :param filters: The filters.
    :type filters: list[dict]
    :param true_y: The true labels, or the name of the label column in df.
    :type true_y: list or str
    """
    has_classification_outcome = filters_has_classification_outcome(
        analyzer, filters)
    if isinstance(true_y, str):
        # true_y names an existing column; normalize it to TRUE_Y.
        # DataFrame.rename returns a copy unless inplace=True, so the
        # previous call without inplace had no effect on df.
        df.rename(columns={true_y: TRUE_Y}, inplace=True)
        # Rebind true_y to the label values so the length computation and
        # the outcome loop below operate on labels, not the column name.
        true_y = df[TRUE_Y]
    else:
        df[TRUE_Y] = true_y
    is_model_analyzer = hasattr(analyzer, MODEL)
    if not is_model_analyzer:
        # No model available; use the precomputed predictions.
        df[PRED_Y] = analyzer.pred_y
    if not is_spark(df):
        df[ROW_INDEX] = np.arange(0, len(true_y))
    if has_classification_outcome:
        if PRED_Y in df:
            pred_y = df[PRED_Y]
        else:
            # calculate directly via prediction on model
            pred_y = analyzer.model.predict(
                df.drop(columns=[TRUE_Y, ROW_INDEX]))
        classes = get_ordered_classes(analyzer.classes, true_y, pred_y)
        # calculate classification outcome and add to df
        classification_outcome = []
        if not isinstance(pred_y, np.ndarray):
            pred_y = np.array(pred_y)
        if not isinstance(true_y, np.ndarray):
            true_y = np.array(true_y)
        # Encode each row's outcome relative to the negative class
        # classes[0]: TN == 0, FP == 1, FN == 2, TP == 3.
        for i in range(len(true_y)):
            if true_y[i] == pred_y[i]:
                if true_y[i] == classes[0]:
                    # True negative == 0 (original wrongly appended 3)
                    classification_outcome.append(0)
                else:
                    # True positive == 3
                    classification_outcome.append(3)
            else:
                if true_y[i] == classes[0]:
                    # Actual negative, predicted positive:
                    # False positive == 1
                    classification_outcome.append(1)
                else:
                    # Actual positive, predicted negative:
                    # False negative == 2
                    classification_outcome.append(2)
        df[CLASSIFICATION_OUTCOME] = classification_outcome
# Example 3
def get_expected_metric_error(error_analyzer, metric, model, validation_data,
                              y_test):
    """Compute the expected value of the given metric on validation data.

    :param error_analyzer: The error analyzer, providing task and classes.
    :param metric: The metric to evaluate.
    :param model: The model used to produce predictions.
    :param validation_data: The data to predict on.
    :param y_test: The true labels for the validation data.
    :return: The computed metric value.
    :raises NotImplementedError: If the metric is not supported.
    """
    if metric == Metrics.ERROR_RATE:
        # Error rate here is the raw count of misclassified instances.
        return sum(model.predict(validation_data) != y_test)
    supported = (metric in precision_metrics or
                 metric in recall_metrics or
                 metric in f1_metrics or
                 metric == Metrics.MEAN_SQUARED_ERROR or
                 metric == Metrics.MEAN_ABSOLUTE_ERROR or
                 metric == Metrics.ACCURACY_SCORE)
    if not supported:
        raise NotImplementedError(
            "Metric {} validation not supported yet".format(metric))
    metric_func = metric_to_func[metric]
    predictions = model.predict(validation_data)
    is_classification = error_analyzer.model_task == ModelTask.CLASSIFICATION
    if is_classification and metric != Metrics.ACCURACY_SCORE:
        labels = get_ordered_classes(error_analyzer.classes,
                                     y_test, predictions)
        if len(labels) == 2:
            # Binary task: score relative to the positive class.
            return metric_func(y_test, predictions, pos_label=labels[1])
    return metric_func(y_test, predictions)
# Example 4
def compute_metric_value(func, classes, true_y, pred_y, metric):
    """Compute metric from the given function, true and predicted values.

    :param func: The metric function to evaluate.
    :type func: function
    :param classes: List of classes.
    :type classes: list
    :param true_y: True y values.
    :type true_y: numpy.ndarray
    :param pred_y: Predicted y values.
    :type pred_y: numpy.ndarray
    :param metric: Metric to compute.
    :type metric: str
    :return: The computed metric value.
    :rtype: float
    """
    pos_label_metrics = (Metrics.RECALL_SCORE,
                         Metrics.PRECISION_SCORE,
                         Metrics.F1_SCORE)
    if metric in pos_label_metrics:
        labels = get_ordered_classes(classes, true_y, pred_y)
        if labels is not None and len(labels) == 2:
            # Binary classification: pass the positive class explicitly.
            return func(true_y, pred_y, pos_label=labels[1])
    return func(true_y, pred_y)
# Example 5
def compute_matrix(analyzer,
                   features,
                   filters,
                   composite_filters,
                   quantile_binning=False,
                   num_bins=BIN_THRESHOLD):
    """Compute a matrix of metrics for one or two feature names.

    The filters and composite filters are applied to the data prior to
    computing the matrix.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param features: A list of one or two feature names.
    :type features: list
    :param filters: A list of filters to apply to the data.
    :type filters: list
    :param composite_filters: A list of composite filters to apply.
    :type composite_filters: list
    :param quantile_binning: Whether to use quantile binning.
    :type quantile_binning: bool
    :param num_bins: The number of bins for high-cardinality features.
    :type num_bins: int
    :return: A dictionary representation of the computed matrix.
    :rtype: dict
    """
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        # Predictions were precomputed and stored on the dataframe.
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    # Compute the per-row error from the single set of predictions.
    # The original re-ran model.predict here for model analyzers,
    # redundantly performing an extra inference pass.
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff = pred_y != true_y
    else:
        diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        # For error rate only the rows in error are counted.
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            # High-cardinality numeric feature: bin it.
            tabdf1 = bin_data(df,
                              feat1,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err,
                                  feat1,
                                  categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(), return_counts=True)[0]
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df,
                              feat2,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err,
                                  feat2,
                                  categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(), return_counts=True)[0]
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            # Aggregate (true, pred) pairs per cell with the metric function.
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2, matrix_total,
                           matrix_error, metric)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df,
                             feat1,
                             num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, _ in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err,
                               feat1,
                               cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_triplet}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts, counts_err, metric)
    return matrix
# Example 6
def compute_matrix(analyzer,
                   features,
                   filters,
                   composite_filters,
                   quantile_binning=False,
                   num_bins=BIN_THRESHOLD):
    """Compute a matrix of metrics for a given set of feature names.

    The filters and composite filters are used to filter the data
    prior to computing the matrix.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param features: A list of one or two feature names to compute metrics for.
    :type features: list
    :param filters: A list of filters to apply to the data.
    :type filters: list
    :param composite_filters: A list of composite filters to apply to the data.
    :type composite_filters: list
    :param quantile_binning: Whether to use quantile binning.
    :type quantile_binning: bool
    :param num_bins: The number of bins to use for quantile binning.
    :type num_bins: int
    :return: A dictionary representation of the computed matrix which can be
        saved to JSON.
    :rtype: dict

    :Example:

    An example of running compute_matrix with a filter and a composite
    filter:

    >>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
    >>> from erroranalysis._internal.matrix_filter import (
    ...     compute_matrix)
    >>> from erroranalysis._internal.constants import ModelTask
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn import svm
    >>> breast_cancer_data = load_breast_cancer()
    >>> feature_names = breast_cancer_data.feature_names
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     breast_cancer_data.data, breast_cancer_data.target,
    ...     test_size=0.5, random_state=0)
    >>> categorical_features = []
    >>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
    ...               random_state=777)
    >>> model = clf.fit(X_train, y_train)
    >>> model_task = ModelTask.CLASSIFICATION
    >>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
    ...                          categorical_features, model_task=model_task)
    >>> filters = [{'arg': [23.85], 'column': 'mean radius',
    ...             'method': 'less and equal'}]
    >>> composite_filters = [{'compositeFilters':
    ...                      [{'compositeFilters':
    ...                       [{'arg': [13.45, 22.27],
    ...                         'column': 'mean radius',
    ...                         'method': 'in the range of'},
    ...                        {'arg': [10.88, 24.46],
    ...                         'column': 'mean texture',
    ...                         'method': 'in the range of'}],
    ...                        'operation': 'and'}],
    ...                      'operation': 'or'}]
    >>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
    ...                         filters, composite_filters)
    """
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        # Predictions were precomputed and stored on the dataframe.
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    # Compute the per-row error from the single set of predictions.
    # The original re-ran model.predict here for model analyzers,
    # redundantly performing an extra inference pass.
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff = pred_y != true_y
    else:
        diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        # For error rate only the rows in error are counted.
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            # High-cardinality numeric feature: bin it.
            tabdf1 = bin_data(df,
                              feat1,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err,
                                  feat1,
                                  categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(), return_counts=True)[0]
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df,
                              feat2,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err,
                                  feat2,
                                  categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(), return_counts=True)[0]
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            # Aggregate (true, pred) pairs per cell with the metric function.
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2, matrix_total,
                           matrix_error, metric)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df,
                             feat1,
                             num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, _ in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err,
                               feat1,
                               cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_grouped}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts, counts_err, metric)
    return matrix