Example #1
def compute_matrix(analyzer,
                   features,
                   filters,
                   composite_filters,
                   quantile_binning=False,
                   num_bins=BIN_THRESHOLD):
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    # pred_y now holds the predictions for both analyzer types, so the
    # diff can be computed once without extra calls to model.predict
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff = pred_y != true_y
    else:
        diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            tabdf1 = bin_data(df,
                              feat1,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err,
                                  feat1,
                                  categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy())
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df,
                              feat2,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err,
                                  feat2,
                                  categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy())
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2, matrix_total,
                           matrix_error, metric)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df,
                             feat1,
                             num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx in range(len(cutdf.cat.categories)):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err,
                               feat1,
                               cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_triplet}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts, counts_err, metric)
    return matrix
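
The bin_data helper used above is not shown in this listing, but its duplicate-edge behavior (the case warn_duplicate_edges reports) can be sketched with plain pandas. This is a minimal sketch assuming bin_data delegates to pd.cut/pd.qcut-style binning; the DataFrame and bin count are made up for illustration.

import pandas as pd

# Illustrative only: a skewed column where quantile edges collide.
df = pd.DataFrame({'feature': [0, 0, 0, 0, 0, 1, 2, 3, 4, 100]})
num_bins = 4

# With quantile binning, repeated edges are dropped, so fewer than
# num_bins categories can come back -- the case warn_duplicate_edges
# guards against in compute_matrix.
binned = pd.qcut(df['feature'], num_bins, duplicates='drop')
print(len(binned.cat.categories))  # 3, not 4

# Reusing the computed categories to bin a second frame keeps both
# aligned, mirroring how df_err is binned with categories1/categories2.
binned_err = pd.cut(df['feature'], binned.cat.categories)
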
Example #2
def matrix_1d(categories, values_err, counts, counts_err, metric):
    matrix = []
    matrix_row = []
    for col_idx in reversed(range(len(categories))):
        cat = categories[col_idx]
        if metric == Metrics.ERROR_RATE:
            false_count = 0
            if cat in values_err:
                index_err = list(values_err).index(cat)
                false_count = int(counts_err[index_err])
            matrix_row.append({
                FALSE_COUNT: false_count,
                COUNT: int(counts[col_idx]),
                METRIC_NAME: metric_to_display_name[metric]
            })
        elif is_multi_agg_metric(metric):
            tp_sum = []
            fp_sum = []
            fn_sum = []
            tn_sum = []
            metric_value = 0
            error = 0
            if cat in values_err:
                metric_value = float(counts_err[col_idx][0])
                tp_sum = counts_err[col_idx][1]
                fp_sum = counts_err[col_idx][2]
                fn_sum = counts_err[col_idx][3]
                tn_sum = counts_err[col_idx][4]
                error = float(counts_err[col_idx][5])
                if math.isnan(metric_value):
                    metric_value = 0.0
            matrix_row.append({
                METRIC_VALUE: metric_value,
                TP: tp_sum,
                FP: fp_sum,
                FN: fn_sum,
                TN: tn_sum,
                ERROR: error,
                METRIC_NAME: metric_to_display_name[metric],
                COUNT: int(counts[col_idx])
            })
        else:
            metric_value = 0
            if cat in values_err:
                metric_value = float(counts_err[col_idx])
                if math.isnan(metric_value):
                    metric_value = 0.0
            matrix_row.append({
                METRIC_VALUE: metric_value,
                METRIC_NAME: metric_to_display_name[metric],
                COUNT: int(counts[col_idx])
            })
    matrix.append(matrix_row)
    category = []
    category_min_interval = []
    category_max_interval = []
    for cat in reversed(categories):
        if isinstance(categories, pd.IntervalIndex):
            category_min_interval.append(cat.left)
            category_max_interval.append(cat.right)
            category.append(str(cat))
        else:
            category.append(get_py_value(cat))
    category1 = {
        VALUES: category,
        INTERVAL_MIN: category_min_interval,
        INTERVAL_MAX: category_max_interval
    }
    return {MATRIX: matrix, CATEGORY1: category1}
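
For orientation, here is a hand-written sketch of the dictionary shape this function returns in the ERROR_RATE case. The literal keys are illustrative stand-ins for the module constants (MATRIX, CATEGORY1, FALSE_COUNT, and so on), whose actual string values are defined elsewhere in erroranalysis.

# Hypothetical output shape for an interval-binned feature with two bins;
# the string keys approximate the module constants used above.
example_output = {
    'matrix': [[
        {'falseCount': 2, 'count': 10, 'metricName': 'Error rate'},
        {'falseCount': 0, 'count': 7, 'metricName': 'Error rate'},
    ]],
    'category1': {
        'values': ['(0.5, 1.0]', '(0.0, 0.5]'],  # note the reversed order
        'intervalMin': [0.5, 0.0],
        'intervalMax': [1.0, 0.5],
    },
}
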
Example #3
def matrix_2d(categories1, categories2, matrix_counts, matrix_err_counts,
              metric):
    matrix = []
    category1 = []
    category1_min_interval = []
    category1_max_interval = []
    is_multi_agg = is_multi_agg_metric(metric)
    for row_index in reversed(range(len(categories1))):
        matrix_row = []
        cat1 = categories1[row_index]
        if isinstance(categories1, pd.IntervalIndex):
            category1_min_interval.append(cat1.left)
            category1_max_interval.append(cat1.right)
            category1.append(str(cat1))
        else:
            category1.append(get_py_value(cat1))
        for col_index in range(len(categories2)):
            cat2 = categories2[col_index]
            index_exists_err = cat1 in matrix_err_counts.index
            col_exists_err = cat2 in matrix_err_counts.columns
            if metric == Metrics.ERROR_RATE:
                false_count = 0
                if index_exists_err and col_exists_err:
                    false_count = int(matrix_err_counts.loc[cat1, cat2])
            elif is_multi_agg:
                tp_sum = []
                fp_sum = []
                fn_sum = []
                tn_sum = []
                metric_value = 0
                # error must be initialized here since it is read
                # unconditionally below, even for empty cells
                error = 0
                if index_exists_err and col_exists_err:
                    metric_value = float(matrix_err_counts.loc[cat1, cat2][0])
                    tp_sum = matrix_err_counts.loc[cat1, cat2][1]
                    fp_sum = matrix_err_counts.loc[cat1, cat2][2]
                    fn_sum = matrix_err_counts.loc[cat1, cat2][3]
                    tn_sum = matrix_err_counts.loc[cat1, cat2][4]
                    error = float(matrix_err_counts.loc[cat1, cat2][5])
            else:
                metric_value = 0
                if index_exists_err and col_exists_err:
                    metric_value = float(matrix_err_counts.loc[cat1, cat2])
            index_exists = cat1 in matrix_counts.index
            col_exists = cat2 in matrix_counts.columns
            total_count = 0
            if index_exists and col_exists:
                total_count = int(matrix_counts.loc[cat1, cat2])
            if metric == Metrics.ERROR_RATE:
                matrix_row.append({
                    FALSE_COUNT: false_count,
                    COUNT: total_count,
                    METRIC_NAME: metric_to_display_name[metric]
                })
            elif is_multi_agg:
                matrix_row.append({
                    METRIC_VALUE: metric_value,
                    TP: tp_sum,
                    FP: fp_sum,
                    FN: fn_sum,
                    TN: tn_sum,
                    ERROR: error,
                    METRIC_NAME: metric_to_display_name[metric],
                    COUNT: total_count
                })
            else:
                matrix_row.append({
                    METRIC_VALUE: metric_value,
                    METRIC_NAME: metric_to_display_name[metric],
                    COUNT: total_count
                })
        matrix.append(matrix_row)

    category2 = []
    category2_min_interval = []
    category2_max_interval = []
    for cat2 in categories2:
        if isinstance(categories2, pd.IntervalIndex):
            category2_min_interval.append(cat2.left)
            category2_max_interval.append(cat2.right)
            category2.append(str(cat2))
        else:
            category2.append(get_py_value(cat2))
    category1 = {
        VALUES: category1,
        INTERVAL_MIN: category1_min_interval,
        INTERVAL_MAX: category1_max_interval
    }
    category2 = {
        VALUES: category2,
        INTERVAL_MIN: category2_min_interval,
        INTERVAL_MAX: category2_max_interval
    }
    return {MATRIX: matrix, CATEGORY1: category1, CATEGORY2: category2}
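
Both matrix builders read interval edges off pandas interval categories via .left and .right. A minimal self-contained sketch of that access pattern:

import pandas as pd

# pd.cut produces categories backed by an IntervalIndex; each interval
# exposes its edges, which is how INTERVAL_MIN/INTERVAL_MAX get filled.
binned = pd.cut(pd.Series([1, 5, 9]), bins=2)
for interval in binned.cat.categories:
    print(str(interval), interval.left, interval.right)
# (0.992, 5.0] 0.992 5.0
# (5.0, 9.0] 5.0 9.0
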
Example #4
def validate_matrix_metric(matrix, exp_total_count, exp_total_error, metric,
                           num_cat1, num_cat2):
    is_precision = metric in precision_metrics
    is_recall = metric in recall_metrics
    is_accuracy = metric == Metrics.ACCURACY_SCORE
    is_f1_score = metric in f1_metrics
    is_multi_agg = is_multi_agg_metric(metric)
    if metric == Metrics.ERROR_RATE:
        # take sum of count, false count
        total_count = 0
        total_false_count = 0
        for i in range(num_cat1):
            for j in range(num_cat2):
                cell_count = matrix[MATRIX][i][j][COUNT]
                assert cell_count >= 0
                total_count += cell_count
                cell_false_count = matrix[MATRIX][i][j][FALSE_COUNT]
                assert cell_false_count >= 0
                total_false_count += cell_false_count
        assert exp_total_count == total_count
        assert exp_total_error == total_false_count
    elif (metric == Metrics.MEAN_SQUARED_ERROR
          or metric == Metrics.MEAN_ABSOLUTE_ERROR):
        # take sum of count, metric value
        total_count = 0
        total_metric_value = 0
        for i in range(num_cat1):
            for j in range(num_cat2):
                count = matrix[MATRIX][i][j][COUNT]
                assert count >= 0
                total_count += count
                cell_metric_value = matrix[MATRIX][i][j][METRIC_VALUE]
                assert cell_metric_value >= 0
                cell_value = cell_metric_value * count
                total_metric_value += cell_value
                metric_name = matrix[MATRIX][i][j][METRIC_NAME]
                assert metric_name == metric_to_display_name[metric]
        total_metric_value = total_metric_value / total_count
        assert exp_total_count == total_count
        assert abs(exp_total_error - total_metric_value) < TOLERANCE
    elif is_multi_agg:
        # compute the overall metric from the data in each of the cells
        total_count = 0
        total_metric_value = 0
        cell_tp_value = None
        cell_tn_value = None
        cell_fp_value = None
        cell_fn_value = None
        for i in range(num_cat1):
            for j in range(num_cat2):
                count = matrix[MATRIX][i][j][COUNT]
                assert count >= 0
                total_count += count
                cell_metric_value = matrix[MATRIX][i][j][METRIC_VALUE]
                assert cell_metric_value >= 0
                if cell_tp_value is None:
                    cell_tp_value = np.array(matrix[MATRIX][i][j][TP],
                                             dtype=FLOAT64)
                else:
                    cell_tp_value += np.array(matrix[MATRIX][i][j][TP],
                                              dtype=FLOAT64)
                if is_precision or is_accuracy or is_f1_score:
                    if cell_fp_value is None:
                        cell_fp_value = np.array(matrix[MATRIX][i][j][FP],
                                                 dtype=FLOAT64)
                    else:
                        cell_fp_value += np.array(matrix[MATRIX][i][j][FP],
                                                  dtype=FLOAT64)
                if is_recall or is_accuracy or is_f1_score:
                    if cell_fn_value is None:
                        cell_fn_value = np.array(matrix[MATRIX][i][j][FN],
                                                 dtype=FLOAT64)
                    else:
                        cell_fn_value += np.array(matrix[MATRIX][i][j][FN],
                                                  dtype=FLOAT64)
                if is_accuracy:
                    if cell_tn_value is None:
                        cell_tn_value = np.array(matrix[MATRIX][i][j][TN],
                                                 dtype=FLOAT64)
                    else:
                        cell_tn_value += np.array(matrix[MATRIX][i][j][TN],
                                                  dtype=FLOAT64)
                metric_name = matrix[MATRIX][i][j][METRIC_NAME]
                assert metric_name == metric_to_display_name[metric]
        tp_sum = cell_tp_value.sum()
        is_overall_precision = (metric == Metrics.MICRO_PRECISION_SCORE
                                or metric == Metrics.PRECISION_SCORE)
        is_overall_recall = (metric == Metrics.MICRO_RECALL_SCORE
                             or metric == Metrics.RECALL_SCORE)
        is_overall_f1_score = (metric == Metrics.MICRO_F1_SCORE
                               or metric == Metrics.F1_SCORE)
        if is_overall_precision:
            total_metric_value = tp_sum / (tp_sum + cell_fp_value.sum())
        elif is_overall_recall:
            total_metric_value = tp_sum / (tp_sum + cell_fn_value.sum())
        elif metric == Metrics.MACRO_PRECISION_SCORE:
            per_class_metrics = cell_tp_value / (cell_tp_value + cell_fp_value)
            num_classes = len(per_class_metrics)
            total_metric_value = per_class_metrics.sum() / num_classes
        elif metric == Metrics.MACRO_RECALL_SCORE:
            per_class_metrics = cell_tp_value / (cell_tp_value + cell_fn_value)
            num_classes = len(per_class_metrics)
            total_metric_value = per_class_metrics.sum() / num_classes
        elif metric == Metrics.ACCURACY_SCORE:
            if len(cell_tn_value) < 2:
                # binary case: pool the confusion-matrix counts
                tn_sum = cell_tn_value.sum()
                fn_sum = cell_fn_value.sum()
                fp_sum = cell_fp_value.sum()
                num_correct = tp_sum + tn_sum
                num_total = num_correct + fn_sum + fp_sum
                total_metric_value = num_correct / num_total
            else:
                # multiclass case: summed per-class true positives over the
                # total sample count (recovered from any one class's counts)
                num_total = (cell_tp_value[0] + cell_tn_value[0] +
                             cell_fn_value[0] + cell_fp_value[0])
                total_metric_value = tp_sum / num_total
        elif is_overall_f1_score:
            fn_sum = cell_fn_value.sum()
            fp_sum = cell_fp_value.sum()
            total_metric_value = tp_sum / (tp_sum + (fp_sum + fn_sum) / 2)
        elif metric == Metrics.MACRO_F1_SCORE:
            pc_precision = cell_tp_value / (cell_tp_value + cell_fp_value)
            pc_recall = cell_tp_value / (cell_tp_value + cell_fn_value)
            num_classes = len(pc_precision)
            pc_f1_score = (2 * (pc_precision * pc_recall) /
                           (pc_precision + pc_recall)).sum()
            total_metric_value = pc_f1_score / num_classes
        assert exp_total_count == total_count
        assert abs(exp_total_error - total_metric_value) < TOLERANCE
    else:
        raise NotImplementedError(
            "Metric {} validation not supported yet".format(metric))
Example #5
def matrix_1d(categories, values_err, counts, counts_err, metric):
    """Constructs a 1D matrix.

    The matrix is in a dictionary format which can then be saved to JSON.

    :param categories: The categories for the selected feature.
    :type categories: list or pandas.IntervalIndex
    :param values_err: The error values for the selected feature.
    :type values_err: list
    :param counts: The counts for the selected feature.
    :type counts: list
    :param counts_err: The error counts for the selected feature.
    :type counts_err: list
    :param metric: The calculated metric.
    :type metric: str
    :returns: The 1D matrix dictionary.
    :rtype: dict
    """
    matrix = []
    matrix_row = []
    for col_idx in reversed(range(len(categories))):
        cat = categories[col_idx]
        if metric == Metrics.ERROR_RATE:
            false_count = 0
            if cat in values_err:
                index_err = list(values_err).index(cat)
                false_count = int(counts_err[index_err])
            matrix_row.append({
                FALSE_COUNT: false_count,
                COUNT: int(counts[col_idx]),
                METRIC_NAME: metric_to_display_name[metric]
            })
        elif is_multi_agg_metric(metric):
            tp_sum = []
            fp_sum = []
            fn_sum = []
            tn_sum = []
            metric_value = 0
            error = 0
            if cat in values_err:
                metric_value = float(counts_err[col_idx][0])
                tp_sum = counts_err[col_idx][1]
                fp_sum = counts_err[col_idx][2]
                fn_sum = counts_err[col_idx][3]
                tn_sum = counts_err[col_idx][4]
                error = float(counts_err[col_idx][5])
                if math.isnan(metric_value):
                    metric_value = 0.0
            matrix_row.append({
                METRIC_VALUE: metric_value,
                TP: tp_sum,
                FP: fp_sum,
                FN: fn_sum,
                TN: tn_sum,
                ERROR: error,
                METRIC_NAME: metric_to_display_name[metric],
                COUNT: int(counts[col_idx])
            })
        else:
            metric_value = 0
            if cat in values_err:
                metric_value = float(counts_err[col_idx])
                if math.isnan(metric_value):
                    metric_value = 0.0
            matrix_row.append({
                METRIC_VALUE: metric_value,
                METRIC_NAME: metric_to_display_name[metric],
                COUNT: int(counts[col_idx])
            })
    matrix.append(matrix_row)
    category = []
    category_min_interval = []
    category_max_interval = []
    for cat in reversed(categories):
        if isinstance(categories, pd.IntervalIndex):
            category_min_interval.append(cat.left)
            category_max_interval.append(cat.right)
            category.append(str(cat))
        else:
            category.append(get_py_value(cat))
    category1 = {
        VALUES: category,
        INTERVAL_MIN: category_min_interval,
        INTERVAL_MAX: category_max_interval
    }
    return {MATRIX: matrix, CATEGORY1: category1}
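
In the ERROR_RATE case the values_err/counts_err inputs to this function come from np.unique(..., return_counts=True) over the error rows, so categories with no errors are simply absent. A short sketch of that alignment and of the default-to-zero lookup used above:

import numpy as np

# Error rows for a hypothetical categorical feature; 'b' never errs.
err_rows = np.array(['a', 'c', 'c', 'a', 'c'])
values_err, counts_err = np.unique(err_rows, return_counts=True)
# values_err -> ['a' 'c'], counts_err -> [2 3]

for cat in ['a', 'b', 'c']:
    false_count = 0
    if cat in values_err:
        false_count = int(counts_err[list(values_err).index(cat)])
    print(cat, false_count)  # a 2, b 0, c 3
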
Example #6
def compute_matrix(analyzer,
                   features,
                   filters,
                   composite_filters,
                   quantile_binning=False,
                   num_bins=BIN_THRESHOLD):
    """Compute a matrix of metrics for a given set of feature names.

    The filters and composite filters are used to filter the data
    prior to computing the matrix.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param features: A list of one or two feature names to compute metrics for.
    :type features: list
    :param filters: A list of filters to apply to the data.
    :type filters: list
    :param composite_filters: A list of composite filters to apply to the data.
    :type composite_filters: list
    :param quantile_binning: Whether to use quantile binning.
    :type quantile_binning: bool
    :param num_bins: The number of bins to use when binning
        high-cardinality, non-categorical features for the heatmap.
    :type num_bins: int
    :return: A dictionary representation of the computed matrix which can be
        saved to JSON.
    :rtype: dict

    :Example:

    An example of running compute_matrix with a filter and a composite
    filter:

    >>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
    >>> from erroranalysis._internal.matrix_filter import (
    ...     compute_matrix)
    >>> from erroranalysis._internal.constants import ModelTask
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn import svm
    >>> breast_cancer_data = load_breast_cancer()
    >>> feature_names = breast_cancer_data.feature_names
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     breast_cancer_data.data, breast_cancer_data.target,
    ...     test_size=0.5, random_state=0)
    >>> categorical_features = []
    >>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
    ...               random_state=777)
    >>> model = clf.fit(X_train, y_train)
    >>> model_task = ModelTask.CLASSIFICATION
    >>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
    ...                          categorical_features, model_task=model_task)
    >>> filters = [{'arg': [23.85], 'column': 'mean radius',
    ...             'method': 'less and equal'}]
    >>> composite_filters = [{'compositeFilters':
    ...                      [{'compositeFilters':
    ...                       [{'arg': [13.45, 22.27],
    ...                         'column': 'mean radius',
    ...                         'method': 'in the range of'},
    ...                        {'arg': [10.88, 24.46],
    ...                         'column': 'mean texture',
    ...                         'method': 'in the range of'}],
    ...                        'operation': 'and'}],
    ...                      'operation': 'or'}]
    >>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
    ...                         filters, composite_filters)
    """
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        pred_y = analyzer.model.predict(input_data)
    # pred_y now holds the predictions for both analyzer types, so the
    # diff can be computed once without extra calls to model.predict
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff = pred_y != true_y
    else:
        diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            tabdf1 = bin_data(df,
                              feat1,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err,
                                  feat1,
                                  categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy())
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df,
                              feat2,
                              num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err,
                                  feat2,
                                  categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy())
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2, matrix_total,
                           matrix_error, metric)
    else:
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df,
                             feat1,
                             num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(), return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx in range(len(cutdf.cat.categories)):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err,
                               feat1,
                               cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes, true_y,
                                                     pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_grouped}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts, counts_err, metric)
    return matrix
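
compute_matrix uses pd.crosstab twice: plain counts for matrix_total, and a values/aggfunc aggregation of (true, pred) pairs for matrix_error. A self-contained sketch of the second pattern, with a simple mean-absolute-error reducer standing in for the package's _AggFunc machinery:

import numpy as np
import pandas as pd

# Illustrative stand-in for _AggFunc: each cell receives the (true, pred)
# pairs that fall into it and reduces them to one metric value.
def mae_of_pairs(pairs):
    true_y, pred_y = zip(*pairs)
    return float(np.mean(np.abs(np.array(true_y) - np.array(pred_y))))

df = pd.DataFrame({'feat1': ['a', 'a', 'b', 'b'],
                   'feat2': ['x', 'y', 'x', 'y'],
                   'true': [1.0, 2.0, 3.0, 4.0],
                   'pred': [1.5, 2.0, 2.0, 5.0]})
matrix_error = pd.crosstab(df['feat1'],
                           df['feat2'],
                           values=list(zip(df['true'], df['pred'])),
                           aggfunc=mae_of_pairs)
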
Example #7
def matrix_2d(categories1, categories2, matrix_counts, matrix_err_counts,
              metric):
    """Constructs a 2D matrix.

    The matrix is in a dictionary format which can then be saved to JSON.

    :param categories1: The categories for the first selected feature.
    :type categories1: list or pandas.IntervalIndex
    :param categories2: The categories for the second selected feature.
    :type categories2: list or pandas.IntervalIndex
    :param matrix_counts: The crosstab of total counts per cell.
    :type matrix_counts: pandas.DataFrame
    :param matrix_err_counts: The crosstab of error counts or aggregated
        metric values per cell.
    :type matrix_err_counts: pandas.DataFrame
    :param metric: The calculated metric.
    :type metric: str
    :returns: The 2D matrix dictionary.
    :rtype: dict
    """
    matrix = []
    category1 = []
    category1_min_interval = []
    category1_max_interval = []
    is_multi_agg = is_multi_agg_metric(metric)
    for row_index in reversed(range(len(categories1))):
        matrix_row = []
        cat1 = categories1[row_index]
        if isinstance(categories1, pd.IntervalIndex):
            category1_min_interval.append(cat1.left)
            category1_max_interval.append(cat1.right)
            category1.append(str(cat1))
        else:
            category1.append(get_py_value(cat1))
        for col_index in range(len(categories2)):
            cat2 = categories2[col_index]
            index_exists_err = cat1 in matrix_err_counts.index
            col_exists_err = cat2 in matrix_err_counts.columns
            if metric == Metrics.ERROR_RATE:
                false_count = 0
                if index_exists_err and col_exists_err:
                    false_count = int(matrix_err_counts.loc[cat1, cat2])
            elif is_multi_agg:
                tp_sum = []
                fp_sum = []
                fn_sum = []
                tn_sum = []
                metric_value = 0
                # error must be initialized here since it is read
                # unconditionally below, even for empty cells
                error = 0
                if index_exists_err and col_exists_err:
                    metric_value = float(matrix_err_counts.loc[cat1, cat2][0])
                    tp_sum = matrix_err_counts.loc[cat1, cat2][1]
                    fp_sum = matrix_err_counts.loc[cat1, cat2][2]
                    fn_sum = matrix_err_counts.loc[cat1, cat2][3]
                    tn_sum = matrix_err_counts.loc[cat1, cat2][4]
                    error = float(matrix_err_counts.loc[cat1, cat2][5])
            else:
                metric_value = 0
                if index_exists_err and col_exists_err:
                    metric_value = float(matrix_err_counts.loc[cat1, cat2])
            index_exists = cat1 in matrix_counts.index
            col_exists = cat2 in matrix_counts.columns
            total_count = 0
            if index_exists and col_exists:
                total_count = int(matrix_counts.loc[cat1, cat2])
            if metric == Metrics.ERROR_RATE:
                matrix_row.append({
                    FALSE_COUNT: false_count,
                    COUNT: total_count,
                    METRIC_NAME: metric_to_display_name[metric]
                })
            elif is_multi_agg:
                matrix_row.append({
                    METRIC_VALUE: metric_value,
                    TP: tp_sum,
                    FP: fp_sum,
                    FN: fn_sum,
                    TN: tn_sum,
                    ERROR: error,
                    METRIC_NAME: metric_to_display_name[metric],
                    COUNT: total_count
                })
            else:
                matrix_row.append({
                    METRIC_VALUE: metric_value,
                    METRIC_NAME: metric_to_display_name[metric],
                    COUNT: total_count
                })
        matrix.append(matrix_row)

    category2 = []
    category2_min_interval = []
    category2_max_interval = []
    for cat2 in categories2:
        if isinstance(categories2, pd.IntervalIndex):
            category2_min_interval.append(cat2.left)
            category2_max_interval.append(cat2.right)
            category2.append(str(cat2))
        else:
            category2.append(get_py_value(cat2))
    category1 = {
        VALUES: category1,
        INTERVAL_MIN: category1_min_interval,
        INTERVAL_MAX: category1_max_interval
    }
    category2 = {
        VALUES: category2,
        INTERVAL_MIN: category2_min_interval,
        INTERVAL_MAX: category2_max_interval
    }
    return {MATRIX: matrix, CATEGORY1: category1, CATEGORY2: category2}
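
The docstrings note the returned dictionaries "can then be saved to JSON". One detail worth spelling out: numpy scalars are not JSON-serializable, which is presumably why the builders route cell values through int(), float(), and get_py_value() first. A minimal check (the dictionary keys here are illustrative):

import json

import numpy as np

# json.dumps rejects np.int64/np.float64, so the matrix builders convert
# cell values to plain Python numbers before assembling the dictionary.
cell = {'count': int(np.int64(10)), 'metricValue': float(np.float64(0.25))}
print(json.dumps(cell))  # {"count": 10, "metricValue": 0.25}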