def _add_filter_cols(self, df, filters):
    """Adds special columns to the dataset for filtering and postprocessing.

    :param df: The dataset as a pandas dataframe.
    :type df: pandas.DataFrame
    :param filters: The filters.
    :type filters: list[dict]
    """
    has_classification_outcome = self._filters_has_classification_outcome(
        filters)
    has_regression_error = self._filters_has_regression_error(filters)
    if isinstance(self.true_y, str):
        # DataFrame.rename returns a copy by default; inplace=True is
        # required so the caller's dataframe actually gains the TRUE_Y
        # column (the prior bare rename call was a no-op)
        df.rename(columns={self.true_y: TRUE_Y}, inplace=True)
    else:
        df[TRUE_Y] = self.true_y
    if self.model is None:
        df[PRED_Y] = self.pred_y
    if not is_spark(df):
        df[ROW_INDEX] = np.arange(0, len(self.true_y))
    if has_classification_outcome or has_regression_error:
        # predictions are needed for either special column; compute once
        if PRED_Y in df:
            pred_y = df[PRED_Y]
        else:
            # calculate directly via prediction on model
            pred_y = self.model.predict(
                df.drop(columns=[TRUE_Y, ROW_INDEX]))
    if has_classification_outcome:
        classes = get_ordered_classes(
            self.classes, self.true_y, pred_y)
        # calculate classification outcome and add to df
        if len(classes) == 2:
            df[CLASSIFICATION_OUTCOME] = \
                self._compute_binary_classification_outcome_data(
                    self.true_y, pred_y, classes)
        else:
            df[CLASSIFICATION_OUTCOME] = \
                self._compute_multiclass_classification_outcome_data(
                    self.true_y, pred_y)
    elif has_regression_error:
        # calculate regression error and add to df
        df[REGRESSION_ERROR] = self._compute_regression_error_data(
            self.true_y, pred_y)
def add_filter_cols(analyzer, df, filters, true_y):
    """Adds special columns to the dataset for filtering and postprocessing.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param df: The dataset as a pandas dataframe.
    :type df: pandas.DataFrame
    :param filters: The filters.
    :type filters: list[dict]
    :param true_y: The true labels.
    :type true_y: list
    """
    has_classification_outcome = filters_has_classification_outcome(
        analyzer, filters)
    if isinstance(true_y, str):
        # DataFrame.rename returns a copy by default; inplace=True is
        # required so the caller's dataframe actually gains the TRUE_Y
        # column (the prior bare rename call was a no-op)
        df.rename(columns={true_y: TRUE_Y}, inplace=True)
    else:
        df[TRUE_Y] = true_y
    is_model_analyzer = hasattr(analyzer, MODEL)
    if not is_model_analyzer:
        df[PRED_Y] = analyzer.pred_y
    if not is_spark(df):
        df[ROW_INDEX] = np.arange(0, len(true_y))
    if has_classification_outcome:
        if PRED_Y in df:
            pred_y = df[PRED_Y]
        else:
            # calculate directly via prediction on model
            pred_y = analyzer.model.predict(
                df.drop(columns=[TRUE_Y, ROW_INDEX]))
        classes = get_ordered_classes(analyzer.classes, true_y, pred_y)
        # calculate classification outcome and add to df
        classification_outcome = []
        if not isinstance(pred_y, np.ndarray):
            pred_y = np.array(pred_y)
        if not isinstance(true_y, np.ndarray):
            true_y = np.array(true_y)
        for i in range(len(true_y)):
            if true_y[i] == pred_y[i]:
                if true_y[i] == classes[0]:
                    # True negative == 0
                    # (fixed: previously appended 3, making true
                    # negatives indistinguishable from true positives)
                    classification_outcome.append(0)
                else:
                    # True positive == 3
                    classification_outcome.append(3)
            else:
                if true_y[i] == classes[0]:
                    # False negative == 2
                    classification_outcome.append(2)
                else:
                    # False positive == 1
                    classification_outcome.append(1)
        df[CLASSIFICATION_OUTCOME] = classification_outcome
def get_expected_metric_error(error_analyzer, metric, model, validation_data,
                              y_test):
    """Compute the expected value of the given metric on the validation data.

    :param error_analyzer: The error analyzer providing task and classes.
    :type error_analyzer: BaseAnalyzer
    :param metric: The metric to evaluate.
    :type metric: str
    :param model: The model used to make predictions.
    :type model: object
    :param validation_data: The data to predict on.
    :type validation_data: numpy.ndarray or pandas.DataFrame
    :param y_test: The true labels.
    :type y_test: numpy.ndarray
    :return: The expected metric value.
    :rtype: float
    :raises NotImplementedError: If the metric is not supported.
    """
    if metric == Metrics.ERROR_RATE:
        # error rate validation counts the number of mismatches
        return sum(model.predict(validation_data) != y_test)
    is_supported = (metric in (Metrics.MEAN_SQUARED_ERROR,
                               Metrics.MEAN_ABSOLUTE_ERROR,
                               Metrics.ACCURACY_SCORE) or
                    metric in precision_metrics or
                    metric in recall_metrics or
                    metric in f1_metrics)
    if not is_supported:
        raise NotImplementedError(
            "Metric {} validation not supported yet".format(metric))
    metric_func = metric_to_func[metric]
    predictions = model.predict(validation_data)
    is_classification = \
        error_analyzer.model_task == ModelTask.CLASSIFICATION
    if is_classification and metric != Metrics.ACCURACY_SCORE:
        # binary classification metrics need an explicit positive label
        labels = get_ordered_classes(error_analyzer.classes, y_test,
                                     predictions)
        if len(labels) == 2:
            return metric_func(y_test, predictions, pos_label=labels[1])
    return metric_func(y_test, predictions)
def compute_metric_value(func, classes, true_y, pred_y, metric):
    """Compute metric from the given function, true and predicted values.

    :param func: The metric function to evaluate.
    :type func: function
    :param classes: List of classes.
    :type classes: list
    :param true_y: True y values.
    :type true_y: numpy.ndarray
    :param pred_y: Predicted y values.
    :type pred_y: numpy.ndarray
    :param metric: Metric to compute.
    :type metric: str
    :return: The computed metric value.
    :rtype: float
    """
    pos_label_metrics = (Metrics.RECALL_SCORE,
                         Metrics.PRECISION_SCORE,
                         Metrics.F1_SCORE)
    if metric in pos_label_metrics:
        # these metrics take a pos_label argument in the binary case
        ordered_labels = get_ordered_classes(classes, true_y, pred_y)
        is_binary = (ordered_labels is not None and
                     len(ordered_labels) == 2)
        if is_binary:
            return func(true_y, pred_y, pos_label=ordered_labels[1])
    return func(true_y, pred_y)
def compute_matrix(analyzer, features, filters, composite_filters,
                   quantile_binning=False, num_bins=BIN_THRESHOLD):
    """Compute a heatmap matrix of metric values for one or two features.

    The filters and composite filters are used to filter the data prior
    to computing the matrix.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param features: A list of one or two feature names to compute
        metrics for.
    :type features: list
    :param filters: A list of filters to apply to the data.
    :type filters: list
    :param composite_filters: A list of composite filters to apply to
        the data.
    :type composite_filters: list
    :param quantile_binning: Whether to use quantile binning.
    :type quantile_binning: bool
    :param num_bins: The number of bins to use for binning.
    :type num_bins: int
    :return: A dictionary representation of the computed matrix which
        can be saved to JSON.
    :rtype: dict
    """
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        # predictions were precomputed and stored on the dataframe
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        # Predict once and reuse the result below; the original code
        # called model.predict again to compute diff, running up to two
        # redundant (and potentially expensive) inference passes.
        pred_y = analyzer.model.predict(input_data)
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff = pred_y != true_y
    else:
        diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        # keep only the rows that are errors
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        # two-feature case: build a 2D grid of cells
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            # bin continuous feature 1, reusing the same bin edges for
            # the error subset so the two crosstabs align
            tabdf1 = bin_data(df, feat1, num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err, feat1, categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(),
                                    return_counts=True)[0]
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df, feat2, num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err, feat2, categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(),
                                    return_counts=True)[0]
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes,
                                                     true_y, pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            # aggregate the metric over (true, pred) pairs per cell
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2,
                           matrix_total, matrix_error, metric)
    else:
        # single-feature case: build a 1D list of cells
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df, feat1, num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(),
                                     return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, catdf in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err, feat1, cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes,
                                                     true_y, pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_triplet}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts,
                           counts_err, metric)
    return matrix
def compute_matrix(analyzer, features, filters, composite_filters,
                   quantile_binning=False, num_bins=BIN_THRESHOLD):
    """Compute a matrix of metrics for a given set of feature names.

    The filters and composite filters are used to filter the data
    prior to computing the matrix.

    :param analyzer: The error analyzer.
    :type analyzer: BaseAnalyzer
    :param features: A list of one or two feature names to compute
        metrics for.
    :type features: list
    :param filters: A list of filters to apply to the data.
    :type filters: list
    :param composite_filters: A list of composite filters to apply to
        the data.
    :type composite_filters: list
    :param quantile_binning: Whether to use quantile binning.
    :type quantile_binning: bool
    :param num_bins: The number of bins to use for quantile binning.
    :type num_bins: int
    :return: A dictionary representation of the computed matrix which can
        be saved to JSON.
    :rtype: dict

    :Example:

    An example of running compute_matrix with a filter and a composite
    filter:

    >>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
    >>> from erroranalysis._internal.matrix_filter import (
    ...     compute_matrix)
    >>> from erroranalysis._internal.constants import ModelTask
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.model_selection import train_test_split
    >>> from sklearn import svm
    >>> breast_cancer_data = load_breast_cancer()
    >>> feature_names = breast_cancer_data.feature_names
    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     breast_cancer_data.data, breast_cancer_data.target,
    ...     test_size=0.5, random_state=0)
    >>> categorical_features = []
    >>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
    ...               random_state=777)
    >>> model = clf.fit(X_train, y_train)
    >>> model_task = ModelTask.CLASSIFICATION
    >>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
    ...                          categorical_features, model_task=model_task)
    >>> filters = [{'arg': [23.85], 'column': 'mean radius',
    ...             'method': 'less and equal'}]
    >>> composite_filters = [{'compositeFilters':
    ...                      [{'compositeFilters':
    ...                       [{'arg': [13.45, 22.27],
    ...                         'column': 'mean radius',
    ...                         'method': 'in the range of'},
    ...                        {'arg': [10.88, 24.46],
    ...                         'column': 'mean texture',
    ...                         'method': 'in the range of'}],
    ...                        'operation': 'and'}],
    ...                      'operation': 'or'}]
    >>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
    ...                         filters, composite_filters)
    """
    if num_bins <= 0:
        raise ValueError(
            'Number of bins parameter must be greater than 0 for the heatmap')
    if features[0] is None and features[1] is None:
        raise ValueError(
            'One or two features must be specified to compute the heat map')
    filtered_df = filter_from_cohort(analyzer, filters, composite_filters)
    true_y = filtered_df[TRUE_Y]
    dropped_cols = [TRUE_Y, ROW_INDEX]
    is_model_analyzer = hasattr(analyzer, 'model')
    if not is_model_analyzer:
        # predictions were precomputed and stored on the dataframe
        pred_y = filtered_df[PRED_Y]
        dropped_cols.append(PRED_Y)
    input_data = filtered_df.drop(columns=dropped_cols)
    is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
    metric = analyzer.metric
    if is_pandas:
        true_y = true_y.to_numpy()
    else:
        input_data = input_data.to_numpy()
    if is_model_analyzer:
        # Predict once and reuse the result below; the original code
        # called model.predict again to compute diff, running up to two
        # redundant (and potentially expensive) inference passes.
        pred_y = analyzer.model.predict(input_data)
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff = pred_y != true_y
    else:
        diff = pred_y - true_y
    if not isinstance(diff, np.ndarray):
        diff = np.array(diff)
    if not isinstance(pred_y, np.ndarray):
        pred_y = np.array(pred_y)
    if not isinstance(true_y, np.ndarray):
        true_y = np.array(true_y)
    indexes = []
    for feature in features:
        if feature is None:
            continue
        indexes.append(analyzer.feature_names.index(feature))
    if is_pandas:
        input_data = input_data.to_numpy()
    dataset_sub_features = input_data[:, indexes]
    dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
    df = pd.DataFrame(dataset_sub_features, columns=dataset_sub_names)
    df_err = df.copy()
    df_err[DIFF] = diff
    if metric == Metrics.ERROR_RATE:
        # keep only the rows that are errors
        df_err = df_err[df_err[DIFF]]
    else:
        df_err[TRUE_Y] = true_y
        df_err[PRED_Y] = pred_y
    # construct matrix
    matrix = []
    if len(dataset_sub_names) == 2:
        # two-feature case: build a 2D grid of cells
        feat1 = dataset_sub_names[0]
        feat2 = dataset_sub_names[1]
        unique_count1 = len(df[feat1].unique())
        unique_count2 = len(df[feat2].unique())
        f1_is_cat = False
        f2_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
            f2_is_cat = feat2 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            # bin continuous feature 1, reusing the same bin edges for
            # the error subset so the two crosstabs align
            tabdf1 = bin_data(df, feat1, num_bins,
                              quantile_binning=quantile_binning)
            categories1 = tabdf1.cat.categories
            if len(categories1) < num_bins:
                warn_duplicate_edges(feat1)
            tabdf1_err = bin_data(df_err, feat1, categories1,
                                  quantile_binning=quantile_binning)
        else:
            tabdf1 = df[feat1]
            tabdf1_err = df_err[feat1]
            categories1 = np.unique(tabdf1.to_numpy(),
                                    return_counts=True)[0]
        if unique_count2 > num_bins and not f2_is_cat:
            tabdf2 = bin_data(df, feat2, num_bins,
                              quantile_binning=quantile_binning)
            categories2 = tabdf2.cat.categories
            if len(categories2) < num_bins:
                warn_duplicate_edges(feat2)
            tabdf2_err = bin_data(df_err, feat2, categories2,
                                  quantile_binning=quantile_binning)
        else:
            tabdf2 = df[feat2]
            tabdf2_err = df_err[feat2]
            categories2 = np.unique(tabdf2.to_numpy(),
                                    return_counts=True)[0]
        if metric == Metrics.ERROR_RATE:
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2])
        else:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes,
                                                     true_y, pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            matrix_total = pd.crosstab(tabdf1,
                                       tabdf2,
                                       rownames=[feat1],
                                       colnames=[feat2])
            # aggregate the metric over (true, pred) pairs per cell
            matrix_error = pd.crosstab(tabdf1_err,
                                       tabdf2_err,
                                       rownames=[feat1],
                                       colnames=[feat2],
                                       values=list(
                                           zip(df_err[TRUE_Y],
                                               df_err[PRED_Y])),
                                       aggfunc=aggfunc._agg_func_pair)
            fill_matrix_nulls(matrix_total, aggfunc._fill_na_value())
            fill_matrix_nulls(matrix_error, aggfunc._fill_na_value())
        matrix = matrix_2d(categories1, categories2,
                           matrix_total, matrix_error, metric)
    else:
        # single-feature case: build a 1D list of cells
        feat1 = dataset_sub_names[0]
        unique_count1 = len(df[feat1].unique())
        f1_is_cat = False
        if analyzer.categorical_features is not None:
            f1_is_cat = feat1 in analyzer.categorical_features
        if unique_count1 > num_bins and not f1_is_cat:
            cutdf = bin_data(df, feat1, num_bins,
                             quantile_binning=quantile_binning)
            num_categories = len(cutdf.cat.categories)
            bin_range = range(num_categories)
            if len(cutdf.cat.categories) < num_bins:
                warn_duplicate_edges(feat1)
            catr = cutdf.cat.rename_categories(bin_range)
            catn, counts = np.unique(catr.to_numpy(),
                                     return_counts=True)
            # fix counts to include skipped categories
            fix_counts = []
            counts_idx = 0
            for idx, catdf in enumerate(cutdf.cat.categories):
                if idx not in catn:
                    fix_counts.append(0)
                else:
                    fix_counts.append(counts[counts_idx])
                    counts_idx += 1
            counts = fix_counts
            cut_err = bin_data(df_err, feat1, cutdf.cat.categories,
                               quantile_binning=quantile_binning)
            catr_err = cut_err.cat.rename_categories(bin_range)
            val_err, counts_err = np.unique(catr_err.to_numpy(),
                                            return_counts=True)
            val_err = cut_err.cat.categories[val_err]
            categories = cutdf.cat.categories
        else:
            categories, counts = np.unique(df[feat1].to_numpy(),
                                           return_counts=True)
            val_err, counts_err = np.unique(df_err[feat1].to_numpy(),
                                            return_counts=True)
            cut_err = df_err
        # Compute the given metric for each group, if not using error rate
        if metric != Metrics.ERROR_RATE:
            if is_multi_agg_metric(metric):
                ordered_labels = get_ordered_classes(analyzer.classes,
                                                     true_y, pred_y)
                aggfunc = _MultiMetricAggFunc(metric_to_func[metric],
                                              ordered_labels, metric)
            else:
                aggfunc = _AggFunc(metric_to_func[metric])
            cutdf_err = pd.DataFrame(cut_err)
            cutdf_err['metric_values'] = list(
                zip(df_err[TRUE_Y], df_err[PRED_Y]))
            grouped = cutdf_err.groupby([feat1])
            agg_func = {'metric_values': aggfunc._agg_func_grouped}
            counts_err = grouped.agg(agg_func)
            counts_err = counts_err.values.ravel()
        matrix = matrix_1d(categories, val_err, counts,
                           counts_err, metric)
    return matrix