def measure(key: str, ground_truth: list, data: list, uncertainty: str, bins: int):
    """ Measure miscalibration (batched mode) """
    print("Measure: %s" % key)

    try:
        confidence = [x[key] for x in data]
    except KeyError:
        return np.nan

    ece = ECE(bins=bins, detection=False)

    miscalibration = []
    for conf, gt in zip(confidence, ground_truth):
        if conf.ndim == 3:
            if uncertainty == 'mean':
                conf = np.mean(conf, axis=0)
            elif uncertainty == 'flatten':
                gt = np.tile(gt, conf.shape[0]).flatten()
                conf = conf.flatten()
            else:
                raise AttributeError("Unknown type of uncertainty handling: %s." % uncertainty)

        miscalibration.append(ece.measure(conf, gt))

    return np.mean(miscalibration)

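# A minimal usage sketch for measure(), not part of the original module: all arrays below
# are synthetic and purely illustrative. Each entry of `data` is assumed to be a dict whose
# value under `key` is an ensemble prediction of shape (n_members, n_samples, n_classes),
# which exercises the 'mean' uncertainty-handling branch above.
import numpy as np
from netcal.metrics import ECE

rng = np.random.default_rng(0)
n_frames, n_members, n_samples, n_classes = 3, 5, 200, 4
ground_truth = [rng.integers(0, n_classes, size=n_samples) for _ in range(n_frames)]
data = [{"confidence": rng.dirichlet(np.ones(n_classes), size=(n_members, n_samples))}
        for _ in range(n_frames)]

score = measure("confidence", ground_truth, data, uncertainty="mean", bins=15)
print("mean ECE over frames: %.5f" % score)
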
def evaluation_metrics(self, n_bins=10, verbose=True):
    """
    Calculates proper losses, calibration error metrics, and the macro-averaged F1-score
    (to evaluate predictive performance). Operates on the instance attributes
    `self.calibrated` (estimated confidences, shape=(n_samples, 3)), `self.labels`
    (NumPy 1-D array with ground-truth labels) and `self.method` (calibration method used).

    Expected Calibration Error:
        Discretize the probability interval into a fixed number of bins and assign the
        predicted probabilities to their bins. The calibration error is the difference
        between the fraction of correct predictions (accuracy) and the mean of all
        predicted probabilities (confidence) per bin.
    Classwise ECE:
        The ECE calculated for each class.
    Adaptive ECE:
        The Adaptive ECE focuses on those bins where predictions are actually made rather
        than weighing all bins equally. It spaces the bin intervals such that each bin
        contains an equal number of predictions.
    Brier Score:
        "The Brier score measures the mean squared difference between (1) the predicted
        probability assigned to the possible outcomes for item i, and (2) the actual
        outcome. Therefore, the lower the Brier score is for a set of predictions, the
        better the predictions are calibrated." (scikit-learn docs)
    Negative Log-Likelihood:
        The NLL likewise averages the error over every single instance.
    F1-Macro:
        Precision and recall are combined per label, and the per-label scores are
        averaged with equal weight (unweighted mean across labels).

    Parameters
    ----------
    n_bins : int, default: 10
        Discretize the probability interval into a fixed number of bins and assign
        predicted probabilities to each bin.
    verbose : bool, default: True
        Print metrics as output.

    Returns
    -------
    DataFrame with all evaluation metrics.
    """
    ece = ECE(n_bins)
    ece_score = ece.measure(self.calibrated, self.labels)
    classwise_ece = calc_classwise_ece(self.calibrated, self.labels)
    dd_ece = stats.ece(self.calibrated, one_hot(self.labels),
                       binning=binning.DataDependentBinning())
    brier = brier_multi(self.labels, self.calibrated)
    nll = log_loss(self.labels, self.calibrated)
    f1 = f1_score(self.labels, np.argmax(self.calibrated, axis=1), average="macro")

    df = pd.DataFrame(columns=['ECE', 'Classwise ECE', 'Adaptive ECE',
                               'Brier', 'Neg Log-Likelihood', 'F1-Macro'])
    df.loc[0] = ece_score, classwise_ece, dd_ece, brier, nll, f1

    if verbose:
        print(self.method + ' - Calibration Metrics')
        print('-' * 50)
        print('ECE: ', round(ece_score, 4))
        print('Classwise/Static ECE: ', round(classwise_ece, 4))
        print('Adaptive ECE: ', round(dd_ece, 4))
        print('Brier Multi Score: ', round(brier, 4))
        print('f1 - macro: ', round(f1, 4))
        print('Negative Log-Likelihood: ', round(nll, 4))

    return df

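# Standalone sketch of the core metrics described in the docstring above, computed on
# synthetic data. It deliberately uses only ECE, log_loss and f1_score; the class-specific
# helpers (calc_classwise_ece, brier_multi, DataDependentBinning) belong to the surrounding
# module and are not reproduced here. All names and values are illustrative.
import numpy as np
from sklearn.metrics import log_loss, f1_score
from netcal.metrics import ECE

rng = np.random.default_rng(42)
logits = rng.normal(size=(500, 3))
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)   # softmax scores
labels = rng.integers(0, 3, size=500)

print("ECE:     %.4f" % ECE(10).measure(probs, labels))
print("NLL:     %.4f" % log_loss(labels, probs))
print("F1-macro: %.4f" % f1_score(labels, probs.argmax(axis=1), average="macro"))
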
def predict_ece_logloss(self, X, y, bins=10, mode='map'):
    preds_probs = self.predict_proba(X, mode=mode)
    ece = ECE(bins)
    calibrated_score = ece.measure(preds_probs, y)
    return calibrated_score, log_loss(y, preds_probs, labels=[0, 1])

def eval_cal(y_preds, y_true, bins=15):
    # calibration metrics
    ece = ECE(bins)
    ace = ACE(bins)
    mce = MCE(bins)

    ece_score = ece.measure(y_preds, y_true)
    ace_score = ace.measure(y_preds, y_true)
    mce_score = mce.measure(y_preds, y_true)

    return ece_score, ace_score, mce_score

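# Illustrative call of eval_cal() on synthetic, over-confident binary scores; the data and
# all numbers below are made up for demonstration only.
import numpy as np
from netcal.metrics import ACE, ECE, MCE

rng = np.random.default_rng(0)
y_true = rng.integers(0, 2, size=1000)
y_preds = np.clip(y_true * 0.7 + rng.normal(0.15, 0.2, size=1000), 0.01, 0.99)

ece_s, ace_s, mce_s = eval_cal(y_preds, y_true, bins=15)
print("ECE %.4f | ACE %.4f | MCE %.4f" % (ece_s, ace_s, mce_s))
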
def script():
    ''' LOAD DATA '''
    data_path = '../data/'
    train_data = pd.read_csv(data_path + 'v3.2.2_train.csv')
    test_data = pd.read_csv(data_path + 'v3.2.2_test.csv')

    target_cols = ['TOT_fast', 'TOT_med_fast', 'TOT_med', 'TOT_med_slow', 'TOT_slow']
    y_train, y_test = train_data[target_cols], test_data[target_cols]
    X_train, X_test = train_data.drop(['Unnamed: 0'] + target_cols, axis=1), \
        test_data.drop(['Unnamed: 0'] + target_cols, axis=1)

    model = load_model("../network/models/v3.2.2/model.h5")

    print("# of train samples: ", len(y_train.index))
    print("# of test samples: ", len(y_test.index))

    # using the netcal package
    n_bins = 10
    confidences = model.predict(X_test.values)

    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(confidences, y_test.values.argmax(axis=1))
    print("Calibration Error before calibration: ", uncalibrated_score)

    temperature = TemperatureScaling()
    temperature.fit(confidences, y_test.values.argmax(axis=1))
    calibrated = temperature.transform(confidences)

    ece = ECE(n_bins)
    calibrated_score = ece.measure(calibrated, y_test.values.argmax(axis=1))
    print("Calibration Error after calibration: ", calibrated_score)

    diagram = ReliabilityDiagram(n_bins)
    diagram.plot(confidences, y_test.values.argmax(axis=1))  # visualize miscalibration of uncalibrated
    diagram.plot(calibrated, y_test.values.argmax(axis=1))   # visualize miscalibration of calibrated

    np.savetxt('./calibration-data/test_calibrated_v3.2.2.csv', calibrated, delimiter=',')

def measure_miscalibration(bins: Union[tuple, list, int], data: dict, methods0d: list, methods2d: list):
    """
    Measure miscalibration and write the results to stdout.

    Parameters
    ----------
    bins : iterable or int
        Number of bins used by ACE, ECE and MCE.
    data : dict
        Dictionary of calibration data.
    methods0d : list
        List with strings containing the keys for the calibration data (confidence-only methods).
    methods2d : list
        List with strings containing the keys for the calibration data (2D methods).
    """
    # iterate over 0D and 2D methods
    for i, methods in enumerate([methods0d, methods2d]):

        # insert the 'confidence' key at the first place in the list to keep track of the
        # default miscalibration
        if i == 1:
            methods = ['confidence'] + methods0d + methods2d
        else:
            methods = ['confidence'] + methods

        # on confidence only, use a single bin count (the first one); keep the full bin
        # specification for the 2D case by using a local variable instead of overwriting
        # the `bins` argument between iterations
        n_bins = bins[0] if i == 0 and isinstance(bins, (tuple, list)) else bins

        # create instances for measuring miscalibration
        ace = ACE(bins=n_bins, detection=True)
        ece = ECE(bins=n_bins, detection=True)
        mce = MCE(bins=n_bins, detection=True)

        # initialize empty lists
        ace_list = []
        ece_list = []
        mce_list = []

        # iterate over all methods
        for method in methods:
            data_input = data[method] if i == 0 else np.stack(
                (data[method], data['cx'], data['cy']), axis=1)

            ace_list.append(ace.measure(data_input, data['matched']))
            ece_list.append(ece.measure(data_input, data['matched']))
            mce_list.append(mce.measure(data_input, data['matched']))

        # output formatted ECE
        names = [len(x) for x in methods]
        buffer = max(names)

        # write out all miscalibration results in a 'pretty' manner
        for j, method in enumerate(methods):
            fill = (buffer - len(method)) * " "
            print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
                  (method, fill, ace_list[j], ece_list[j], mce_list[j]))

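# Synthetic invocation of measure_miscalibration(); the dictionary keys follow the access
# pattern inside the function ('confidence', 'cx', 'cy', 'matched' plus one entry per
# calibration method), but the arrays themselves are random placeholders.
import numpy as np
from netcal.metrics import ACE, ECE, MCE

rng = np.random.default_rng(3)
n = 5000
data = {
    "confidence": rng.uniform(0.3, 1.0, size=n),   # raw detector scores
    "histogram": rng.uniform(0.3, 1.0, size=n),    # scores after histogram binning
    "betacal": rng.uniform(0.3, 1.0, size=n),      # scores after beta calibration
    "cx": rng.uniform(0.0, 1.0, size=n),           # relative box center x
    "cy": rng.uniform(0.0, 1.0, size=n),           # relative box center y
    "matched": rng.integers(0, 2, size=n),         # 1 if detection matched a ground-truth box
}

measure_miscalibration(bins=[8, 8, 8], data=data,
                       methods0d=["histogram"], methods2d=["betacal"])
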
def calibrate(model, valid_loader, test_loader, n_bins=15):
    """ Calibrate the model via temperature scaling """
    confidence, labels = rollout_loader(model, valid_loader)
    test_confidence, test_labels = rollout_loader(model, test_loader)

    temperature = TemperatureScaling()
    temperature.fit(confidence, labels)
    calibrated = temperature.transform(test_confidence)

    ece = ECE(n_bins)
    calibrated_score = ece.measure(calibrated, test_labels)
    return calibrated_score

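# The same fit/transform pattern as calibrate(), written standalone on synthetic softmax
# outputs so it can be run without a model or data loaders. All arrays are illustrative;
# only the netcal calls mirror the function above.
import numpy as np
from netcal.scaling import TemperatureScaling
from netcal.metrics import ECE

rng = np.random.default_rng(1)
val_conf = rng.dirichlet(np.ones(5), size=2000)    # validation softmax outputs
val_y = rng.integers(0, 5, size=2000)
test_conf = rng.dirichlet(np.ones(5), size=2000)   # held-out softmax outputs
test_y = rng.integers(0, 5, size=2000)

ts = TemperatureScaling()
ts.fit(val_conf, val_y)                            # fit the temperature on the validation split
calibrated = ts.transform(test_conf)               # rescale the held-out split
print("test ECE after scaling: %.4f" % ECE(15).measure(calibrated, test_y))
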
def cross_validate_temp_scaling(model, data_loader, batch_size, k=5, seed=0,
                                num_workers=0, n_bins=15, pin_memory=False):
    """ Perform temperature scaling on the model with k-fold cross validation """
    print("Computing model calibration", flush=True)
    test_dataset = data_loader.dataset
    num_test = len(test_dataset)
    indices = list(range(num_test))
    np.random.seed(seed)
    np.random.shuffle(indices)
    idxs = torch.tensor(indices).split(int(len(indices) / k))[:k]

    # get the uncalibrated ECE
    confidence, labels = rollout_loader(model, data_loader)
    ece = ECE(n_bins)
    unscaled_ece = ece.measure(confidence, labels)
    print(f'ECE: {unscaled_ece:.3f}')

    # compute the calibrated ECE
    scaled_eces = []
    # for each of the k folds
    for i in range(k):
        valid_idx = idxs[i]
        # note: use '!=' here, not 'is not' (identity checks on ints are unreliable)
        before = torch.cat(idxs[:i]) if i != 0 else torch.tensor([], dtype=torch.long)
        after = torch.cat(idxs[i + 1:]) if i + 1 != k else torch.tensor([], dtype=torch.long)
        test_idx = torch.cat([before, after])

        # create data loaders
        test_sampler = SubsetRandomSampler(test_idx)
        valid_sampler = SubsetRandomSampler(valid_idx)
        test_loader = DataLoader(
            test_dataset, batch_size=batch_size, sampler=test_sampler,
            num_workers=num_workers, pin_memory=pin_memory
        )
        valid_loader = DataLoader(
            test_dataset, batch_size=batch_size, sampler=valid_sampler,
            num_workers=num_workers, pin_memory=pin_memory
        )

        scaled_ece = calibrate(model, valid_loader, test_loader, n_bins)
        print(f'Cross validation fold {i}, temperature scaled ECE: {scaled_ece:.3f}')
        scaled_eces.append(scaled_ece)

    mean_scaled_ece = np.mean(scaled_eces)
    return unscaled_ece, mean_scaled_ece

def single_example(models: list, datafile: str, bins: int, diagram: str = None,
                   validation_split: float = 0.7, save_models: bool = False,
                   domain: str = ".") -> int:
    """
    Measure miscalibration of the given methods on the specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    diagram : str, optional, default: None
        Type of diagram which should be plotted. This could be 'diagram', 'curve', 'inference' or None.
    validation_split : float
        Split ratio between build set and validation set.
    save_models : bool
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise.
    """
    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print("Key \'ground_truth\' or \'predictions\' not found in file \'%s\'" % datafile)
        return -1

    # split data set into build set and validation set
    build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
        ground_truth, predictions, test_size=validation_split,
        stratify=ground_truth, random_state=None)

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    predictions = []
    all_ace = [ace.measure(validation_set_sm, validation_set_gt)]
    all_ece = [ece.measure(validation_set_sm, validation_set_gt)]
    all_mce = [mce.measure(validation_set_sm, validation_set_gt)]

    # ------------------------------------------
    # build and save models
    for model in models:
        name, instance = model
        print("Build %s model" % name)
        instance.fit(build_set_sm, build_set_gt)

        if save_models:
            instance.save_model("%s/models/%s.pkl" % (domain, name))

    # ------------------------------------------
    # perform predictions
    for model in models:
        _, instance = model
        prediction = instance.transform(validation_set_sm)
        predictions.append(prediction)

        all_ace.append(ace.measure(prediction, validation_set_gt))
        all_ece.append(ece.measure(prediction, validation_set_gt))
        all_mce.append(mce.measure(prediction, validation_set_gt))

    # ------------------------------------------
    # output formatted ECE
    names = [len(x[0]) for x in models]
    buffer = max(names)

    fill = (buffer - len("Default")) * " "
    print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_ace[0], all_ece[0], all_mce[0]))

    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_ace[i], all_ece[i], all_mce[i]))

    # ------------------------------------------
    if diagram == 'diagram':
        diagram = ReliabilityDiagram(bins=bins, title_suffix="default")
        diagram.plot(validation_set_sm, validation_set_gt, filename="test.png")

        for i, prediction in enumerate(predictions):
            diagram = ReliabilityDiagram(bins=bins, title_suffix=models[i][0])
            diagram.plot(prediction, validation_set_gt)
    elif diagram is None:
        pass
    else:
        print("Unknown diagram type \'%s\'" % diagram)
        return -1

    return 0

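# Illustrative invocation of single_example(). The datafile path is hypothetical; it would
# have to point to an .npz file containing 'ground_truth' and 'predictions' arrays as
# described in the docstring (if the file is missing, the function prints a message and
# returns -1). The method names follow the conventions used elsewhere in this collection.
from netcal.binning import HistogramBinning
from netcal.scaling import LogisticCalibration, TemperatureScaling

models = [("histogram", HistogramBinning(bins=15)),
          ("lr", LogisticCalibration()),
          ("temperature", TemperatureScaling())]

single_example(models, datafile="records/cifar10_resnet.npz", bins=15, diagram=None)
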
def transform(frames: List[Dict], dataset: str, network: str, subset: List,
              ious: List, test_ids: List[int]):
    """
    After calibration training, evaluate the trained models with several miscalibration
    metrics: D-ECE, Brier score and NLL. Also capture the area under the precision-recall
    curve (AUPRC). All results are stored at "./output/<network>".

    Parameters
    ----------
    frames : List[Dict]
        List of dictionaries holding the input data for each image frame.
    dataset : str
        String of the used dataset (see detectron2 registered datasets).
    network : str
        String describing the base neural network.
    subset : List[str]
        List with additional features used for calibration. Options are:
        - 'cx'
        - 'cy'
        - 'w'
        - 'h'
    ious : List[float]
        List with IoU scores used for evaluation.
    test_ids : List
        List of data frame ids used for calibration testing.
    """
    # get meta information and specify all relevant paths
    meta = MetadataCatalog.get(dataset)
    model_dir = os.path.join("calibration", network, "models")
    output_dir = os.path.join("output", network)
    diagram_path = os.path.join(output_dir, "diagrams",
                                ''.join(subset) if len(subset) > 0 else "confidence")
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(diagram_path, exist_ok=True)

    # calibration methods that have also been used for calibration training
    methods = [("histogram", HistogramBinning),
               ("lr", LogisticCalibration),
               ("lr_dependent", LogisticCalibrationDependent),
               ("betacal", BetaCalibration),
               ("betacal_dependent", BetaCalibrationDependent)]

    # reverse mapping of category ids to network class ids (e.g. for the COCO dataset)
    if hasattr(meta, "thing_dataset_id_to_contiguous_id"):
        reverse_dictionary = {v: k for k, v in meta.thing_dataset_id_to_contiguous_id.items()}
    else:
        reverse_dictionary = None

    # lists and placeholders for evaluation metrics
    n_samples_total = 0
    n_samples_per_class = []
    dece_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    brier_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    nll_per_class = [[[] for _ in ious] for _ in range(len(methods) + 1)]
    average_precision = [[[] for _ in ious] for _ in range(len(methods) + 1)]

    # -----------------------------------------------------
    # visualization routine
    diagram0d = ReliabilityDiagram(bins=20, detection=True, sample_threshold=8)
    diagram1d = ReliabilityDiagram(bins=[5, 15], detection=True, sample_threshold=3, fmin=0, fmax=0.3)
    diagram2d = ReliabilityDiagram(bins=[6, 9, 9], detection=True, sample_threshold=2, fmin=0, fmax=0.3)

    def plot(f: np.ndarray, m: np.ndarray, title: str, formatter: str):
        # define function for diagram output

        # plot baseline miscalibration
        figures = [diagram0d.plot(f[:, :1], m, tikz=False, title_suffix=title,
                                  filename=formatter % "0d")]

        # plot all additional features in 1D miscalibration plots
        for i, fname in enumerate(['cx', 'cy', 'w', 'h']):
            figures.append(
                diagram1d.plot(f[:, (0, i + 1)], m, tikz=False, feature_names=[fname],
                               title_suffix=title, filename=formatter % ("1d_%s" % fname)))

        # finally, plot all feature combinations of size 2
        for (i, fname1), (j, fname2) in itertools.combinations(enumerate(['cx', 'cy', 'w', 'h']), 2):
            figures.append(
                diagram2d.plot(f[:, (0, i + 1, j + 1)], m, tikz=False,
                               feature_names=[fname1, fname2], title_suffix=title,
                               filename=formatter % ("2d_%s_%s" % (fname1, fname2))))

        # free memory space
        for fig in figures:
            plt.close(fig)

    # -----------------------------------------------------
    # iterate over all classes that are present in the current dataset
    for i, classname in enumerate(meta.thing_classes):

        # get calibration features for the selected class
        category_id = reverse_dictionary[i] if reverse_dictionary is not None else i
        features, matched, img_ids = get_features(frames, category_id, subset, ious, test_ids)
        all_features, _, _ = get_features(frames, category_id, ['cx', 'cy', 'w', 'h'], ious, test_ids)

        if features.size == 0:
            print("No samples for category %s found" % classname)
            continue

        # different binning schemes for different feature dimensions
        if features.shape[1] == 1:
            bins = 20
        elif features.shape[1] == 3:
            bins = 8
        elif features.shape[1] == 5:
            bins = 5
        else:
            raise ValueError("Unknown dimension: %d" % features.shape[1])

        # define D-ECE metric
        dece = ECE(bins=bins, detection=True, sample_threshold=8)

        n_samples_per_class.append(features.shape[0])
        n_samples_total += features.shape[0]

        # the failed flag is required to optionally blank failed or non-present classes
        # during evaluation, i.e. if a metric returns NaN
        failed = False

        # perform evaluation for each category separately
        print("Inference: category %d: %d samples" % (category_id, features.shape[0]))
        for j, (iou, m) in enumerate(zip(ious, matched)):
            score = average_precision_score(m, features[:, 0])
            if not np.isfinite(score) or np.isnan(score):
                brier_per_class[0][j].append(0.)
                nll_per_class[0][j].append(0.)
                dece_per_class[0][j].append(0.)
                average_precision[0][j].append(0.)
                failed = True

            # compute average precision, Brier, NLL and ECE
            else:
                brier_per_class[0][j].append(np.mean(np.square(features[:, 0] - m)))
                nll_per_class[0][j].append(
                    -np.mean(m * np.log(features[:, 0]) + (1. - m) * np.log(1. - features[:, 0])))
                dece_per_class[0][j].append(dece.measure(features, m))
                average_precision[0][j].append(score)

                diagramname = os.path.join(diagram_path,
                                           "default_cls-%02d_iou%.2f" % (i, iou) + "_%s.tex")
                plot(all_features, m, title="default", formatter=diagramname)

            # start calibration evaluation for each method separately
            for k, (name, method) in enumerate(methods, start=1):
                instance = method()
                try:
                    print("Load %s and transform" % name)
                    instance.load_model(os.path.join(
                        model_dir, "%s_%s_iou%.2f_cls-%02d.pkl" % (name, ''.join(subset), iou, i)))

                    calibrated = instance.transform(features)

                    # perform clipping
                    np.clip(calibrated, np.finfo(np.float32).eps,
                            1. - np.finfo(np.float32).eps, out=calibrated)

                    score = average_precision_score(m, calibrated)
                    if not np.isfinite(score) or np.isnan(score):
                        raise ValueError("Couldn't compute AUPRC score")

                    average_precision[k][j].append(score)
                    brier_per_class[k][j].append(np.mean(np.square(calibrated - m)))
                    nll_per_class[k][j].append(
                        -np.mean(m * np.log(calibrated) + (1. - m) * np.log(1. - calibrated)))

                    input = np.concatenate((np.reshape(calibrated, (-1, 1)), features[:, 1:]), axis=1)
                    dece_per_class[k][j].append(dece.measure(input, m))

                    diagramname = os.path.join(diagram_path,
                                               "%s_cls-%02d_iou%.2f" % (name, i, iou) + "_%s.tex")
                    input = np.concatenate((np.reshape(calibrated, (-1, 1)), all_features[:, 1:]), axis=1)
                    plot(input, m, title=name, formatter=diagramname)

                except (FileNotFoundError, ValueError):
                    print("Could not find weight file ",
                          os.path.join(model_dir,
                                       "%s_%s_iou%.2f_cls-%02d.pkl" % (name, ''.join(subset), iou, i)))
                    print("Disable evaluation for class %d" % i)

                    brier_per_class[k][j].append(0.)
                    nll_per_class[k][j].append(0.)
                    dece_per_class[k][j].append(0.)
                    average_precision[k][j].append(0.)
                    failed = True

        if failed:
            n_samples_total -= n_samples_per_class[-1]
            n_samples_per_class[-1] = 0

    # convert all lists to NumPy arrays
    weights = np.array(n_samples_per_class) / n_samples_total
    brier_per_class = np.array(brier_per_class)
    nll_per_class = np.array(nll_per_class)
    dece_per_class = np.array(dece_per_class)
    average_precision = np.array(average_precision)

    # compute a macro (unweighted) average and a weighted counterpart
    brier_global = np.mean(brier_per_class, axis=2)
    weighted_brier_global = np.average(brier_per_class, weights=weights, axis=2)
    nll_global = np.mean(nll_per_class, axis=2)
    weighted_nll_global = np.average(nll_per_class, weights=weights, axis=2)
    dece_global = np.mean(dece_per_class, axis=2)
    weighted_dece_global = np.average(dece_per_class, weights=weights, axis=2)
    average_precision_macro = np.mean(average_precision, axis=2)
    average_precision_weighted = np.average(average_precision, weights=weights, axis=2)

    # use the tabulate library to visualize the evaluation results
    header = []
    body = [['default']]
    body.extend([[name] for name, method in methods])

    for i, iou in enumerate(ious):
        header.extend([
            'D-ECE(w) @ IoU %.2f' % iou, 'D-ECE @ IoU %.2f' % iou,
            'Brier(w) @ IoU %.2f' % iou, 'Brier @ IoU %.2f' % iou,
            'NLL(w) @ IoU %.2f' % iou, 'NLL @ IoU %.2f' % iou,
            'AP(w) @ IoU %.2f' % iou, 'AP @ IoU %.2f' % iou
        ])

        body[0].extend([
            weighted_dece_global[0][i], dece_global[0][i],
            weighted_brier_global[0][i], brier_global[0][i],
            weighted_nll_global[0][i], nll_global[0][i],
            average_precision_weighted[0][i], average_precision_macro[0][i]
        ])

        for k, (name, method) in enumerate(methods):
            body[k + 1].extend([
                weighted_dece_global[k + 1][i], dece_global[k + 1][i],
                weighted_brier_global[k + 1][i], brier_global[k + 1][i],
                weighted_nll_global[k + 1][i], nll_global[k + 1][i],
                average_precision_weighted[k + 1][i], average_precision_macro[k + 1][i]
            ])

    results = [header, *body]

    # print the table and also write the evaluation results to CSV format
    print("\nEvaluation Results:")
    print(tabulate(results, headers="firstrow"))

    with open(os.path.join(output_dir, "results_%s.csv" % ''.join(subset)), "w") as open_file:
        writer = csv.writer(open_file)
        writer.writerow(["method", ] + results[0])
        writer.writerows(results[1:])

def predict_ece(self, X, y, mode='map', bins=10):
    ece = ECE(bins)
    calibrated_score = ece.measure(self.predict_proba(X, mode=mode), y)
    return calibrated_score

def predict_ece_logloss(self, X, y, bins=10):
    preds_probs = self.predict_proba(X)
    ece = ECE(bins)
    calibrated_score = ece.measure(preds_probs, y)
    return calibrated_score, log_loss(y, preds_probs, labels=[0, 1])

         lw=1, color='red')
ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="upper left")
ax1.set_title('Calibration plots (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

plt.tight_layout()

# Temperature scaling for probability calibration using the netcal package
from netcal.scaling import TemperatureScaling

temperature = TemperatureScaling()
temperature.fit(y_prob, y_all)
calibrated = temperature.transform(y_prob)

# Computing the expected calibration error
from netcal.metrics import ECE
from netcal.presentation import ReliabilityDiagram

n_bins = 10
ece = ECE(n_bins)
uncalibrated_score = ece.measure(y_new, y_test)
calibrated_score = ece.measure(calibrated, y_test)

diagram = ReliabilityDiagram(n_bins)
diagram.plot(y_new, y_test)       # visualize miscalibration of uncalibrated
diagram.plot(calibrated, y_test)  # visualize miscalibration of calibrated

from netcal.metrics import ECE

from utils_constants import CORRECTNESS, A, B, C, D, LABEL
from utils_data import create_calibrated_df

random_state = 42
split = 'test'
n_bins = 10

for random_seed in [1, 2, 3, 4, 5]:
    df = create_calibrated_df(['output_xlnet_seed_%d_%s.csv' % (random_seed, split)])
    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(df[[A, B, C, D]].values, df[LABEL].values)
    print('XLNET %d: ECE = %.4f' % (random_seed, float(uncalibrated_score)))

for random_seed in [0, 1, 2, 3, 42]:
    df = create_calibrated_df(['output_distilbert_seed%d_%s.csv' % (random_seed, split)])
    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(df[[A, B, C, D]].values, df[LABEL].values)
    print('DistilBERT %d: ECE = %.4f' % (random_seed, float(uncalibrated_score)))

for random_seed in [0, 1, 2, 3, 42]:
    df = create_calibrated_df(['output_bert_seed%d_%s.csv' % (random_seed, split)])
    ece = ECE(n_bins)
    uncalibrated_score = ece.measure(df[[A, B, C, D]].values, df[LABEL].values)
    print('BERT %d: ECE = %.4f' % (random_seed, float(uncalibrated_score)))

            accs[k].append(0)

xs = []
ys = []
for k in bins.keys():
    xs.append(np.mean(bins[k]))
    ys.append(np.mean(accs[k]))
    ax1.annotate(str(len(accs[k])), (np.mean(bins[k]), np.mean(accs[k])), color=color)
ax1.plot(xs, ys, label=title, color=color)

if args.mutual_info:
    with open(title + '.pkl', 'rb') as fp:
        ys2 = pickle.load(fp)
    ax2.plot(list(range(1, len(ys2) * 3 + 1, 3)), ys2, color=color)

ece = ECE(args.ece_bins)
ece_score = ece.measure(X, y)

acc = 100. * correct / total
print('Testing LOG_LOSS:', np.mean(test_loss))
print('Testing ACCURACY:', acc)
print('Testing ECE:', ece_score)

results_dict['logloss'] = np.mean(test_loss)
results_dict['acc'] = acc
results_dict['ece'] = ece_score
results_df = results_df.append(results_dict, ignore_index=True)

f = open(main_path + "evaluation_logs.txt", "a")
f.write('###################################################################\n' + 'P:' + p + '\n')
f.write('Mean Testing LOG_LOSS:' + str(np.mean(test_loss)) + '\n' +
        'Testing ACCURACY:' + str(acc) + '\n' +
        'Testing ECE:' + str(ece_score) + '\n')

def ece_score(y_true, y_prob, n_bins=10):
    ece = ECE(n_bins)
    ece_val = ece.measure(y_prob, y_true)
    return ece_val

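# Quick illustrative call of ece_score() on a hand-made toy example (values are arbitrary
# and only serve to show the expected array shapes: 1-D confidences and binary labels).
import numpy as np

toy_prob = np.array([0.9, 0.8, 0.3, 0.2, 0.75, 0.1])
toy_true = np.array([1, 1, 0, 0, 1, 0])
print("ECE on toy data: %.4f" % ece_score(toy_true, toy_prob, n_bins=5))
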
    'min_beta': min_beta
})

predictions = unc_model.predict([test_mu_predictions, stl10_x_test_resized])
logger.error("Compute predictions")

sess = K.get_session()
logger.error("Compute mu pred. entropy")
probs = sess.run(predict_probs(predictions))

accuracy = accuracy_score(stl10_y_test, np.argmax(predictions, axis=1))
logger.error("Resulting accuracy: {}".format(accuracy))

n_bins = 10
ground_truth = stl10_y_test
confidences = test_mu_predictions

temperature = TemperatureScaling()
temperature.fit(confidences, ground_truth)
calibrated = temperature.transform(confidences)

ece = ECE(n_bins)
uncalibrated_score = ece.measure(confidences, ground_truth)
calibrated_score = ece.measure(calibrated, ground_truth)
wrapper_score = ece.measure(probs, ground_truth)

logger.error("ECE scores: {}, {}, {}".format(uncalibrated_score, calibrated_score, wrapper_score))
logger.error("Done")

def cross_validation_5_2(models: list, datafile: str, bins: int,
                         save_models: bool = False, domain: str = '.') -> int:
    """
    5x2 cross validation of the given methods on the specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    save_models : bool, optional, default: False
        True if instances of calibration methods should be stored.
    domain : str, optional, default: "."
        Domain/directory where to store the results.

    Returns
    -------
    int
        0 on success, -1 otherwise.
    """
    network = datafile[datafile.rfind("/") + 1:datafile.rfind(".npz")]
    seeds = [60932, 29571058, 127519, 23519410, 74198274]

    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print("Key \'ground_truth\' or \'predictions\' not found in file \'%s\'" % datafile)
        return -1

    if len(predictions.shape) == 2:
        n_classes = predictions.shape[1]
    else:
        n_classes = 2

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    all_accuracy = []
    all_ace = []
    all_ece = []
    all_mce = []

    it = 0
    for i, seed in enumerate(seeds):
        np.random.seed(seed)

        # split data set into build set and validation set
        build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
            ground_truth, predictions, random_state=seed, test_size=0.5, stratify=ground_truth)

        for j in range(2):
            calibrated_data = {}

            # 5x2 cross validation - flip build/val set after each iteration
            build_set_gt, validation_set_gt = validation_set_gt, build_set_gt
            build_set_sm, validation_set_sm = validation_set_sm, build_set_sm

            # lists for error metrics of the current iteration (it)
            it_all_accuracy = []
            it_all_ace = []
            it_all_ece = []
            it_all_mce = []

            if n_classes > 2:
                labels = np.argmax(validation_set_sm, axis=1)
            else:
                labels = np.where(validation_set_sm > 0.5,
                                  np.ones_like(validation_set_gt),
                                  np.zeros_like(validation_set_gt))

            accuracy = np.mean(np.where(labels == validation_set_gt,
                                        np.ones_like(labels), np.zeros_like(labels)))

            it_all_accuracy.append(accuracy)
            it_all_ace.append(ace.measure(validation_set_sm, validation_set_gt))
            it_all_ece.append(ece.measure(validation_set_sm, validation_set_gt))
            it_all_mce.append(mce.measure(validation_set_sm, validation_set_gt))

            # ------------------------------------------
            # build and save models
            for model in models:
                name, instance = model
                print("Build %s model" % name)
                instance.fit(build_set_sm, build_set_gt)

                if save_models:
                    instance.save_model("%s/models/%s-%s-%d.pkl" % (domain, network, name, i))

                prediction = instance.transform(validation_set_sm)
                calibrated_data[name] = prediction

                if n_classes > 2:
                    if prediction.ndim == 3:
                        prediction = np.mean(prediction, axis=0)
                    labels = np.argmax(prediction, axis=1)
                else:
                    if prediction.ndim == 2:
                        prediction = np.mean(prediction, axis=0)
                    labels = np.where(prediction > 0.5,
                                      np.ones_like(validation_set_gt),
                                      np.zeros_like(validation_set_gt))

                accuracy = np.mean(np.where(labels == validation_set_gt,
                                            np.ones_like(labels), np.zeros_like(labels)))

                it_all_accuracy.append(accuracy)
                it_all_ace.append(ace.measure(prediction, validation_set_gt))
                it_all_ece.append(ece.measure(prediction, validation_set_gt))
                it_all_mce.append(mce.measure(prediction, validation_set_gt))

            # append lists of the current iteration
            all_accuracy.append(it_all_accuracy)
            all_ace.append(it_all_ace)
            all_ece.append(it_all_ece)
            all_mce.append(it_all_mce)

            filename = "%s/results/%s_%02d.npz" % (domain, network, it)
            with open(filename, "wb") as open_file:
                np.savez_compressed(open_file,
                                    train_gt=build_set_gt,
                                    test_gt=validation_set_gt,
                                    train_scores=build_set_sm,
                                    test_scores=validation_set_sm,
                                    **calibrated_data)

            it += 1

    # convert to NumPy arrays and reduce mean afterwards
    all_accuracy = np.array(all_accuracy)
    all_ace = np.array(all_ace)
    all_ece = np.array(all_ece)
    all_mce = np.array(all_mce)

    all_accuracy = np.mean(all_accuracy, axis=0)
    all_ace = np.mean(all_ace, axis=0)
    all_ece = np.mean(all_ece, axis=0)
    all_mce = np.mean(all_mce, axis=0)

    names = [len(x[0]) for x in models]
    buffer = max(names)

    # ---------------------------------------------------------
    # output formatted ECE
    fill = (buffer - len("Default")) * " "
    print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_accuracy[0], all_ace[0], all_ece[0], all_mce[0]))

    # ---------------------------------------------------------
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_accuracy[i], all_ace[i], all_ece[i], all_mce[i]))

    return 0

def cross_validation_5_2(models: list, datafile: str, bins: int, save_models: bool = False) -> int:
    """
    5x2 cross validation of the given methods on the specified dataset.

    Parameters
    ----------
    models : list
        List of tuples with [('<name>', <instance of CalibrationMethod>), ...].
    datafile : str
        Path to datafile which contains two NumPy arrays with keys 'ground_truth' and 'predictions'.
    bins : int
        Number of bins used by ECE, MCE and ReliabilityDiagram.
    save_models : bool, optional, default: False
        True if instances of calibration methods should be stored.

    Returns
    -------
    int
        0 on success, -1 otherwise.
    """
    if not os.path.exists(datafile):
        print("Dataset \'%s\' does not exist" % datafile)
        return -1

    # read NumPy input files
    try:
        with open(datafile, "rb") as open_file:
            npzfile = np.load(open_file)
            ground_truth = npzfile['ground_truth'].squeeze()
            predictions = npzfile['predictions'].squeeze()
    except KeyError:
        print("Key \'ground_truth\' or \'predictions\' not found in file \'%s\'" % datafile)
        return -1

    if len(predictions.shape) == 2:
        n_classes = predictions.shape[1]
    else:
        n_classes = 2

    # initialize error metrics
    ace = ACE(bins)
    ece = ECE(bins)
    mce = MCE(bins)

    all_accuracy = []
    all_ace = []
    all_ece = []
    all_mce = []

    for i in range(5):

        # split data set into build set and validation set
        build_set_gt, validation_set_gt, build_set_sm, validation_set_sm = train_test_split(
            ground_truth, predictions, test_size=0.5, stratify=ground_truth)

        for _ in range(2):

            # 5x2 cross validation - flip build/val set after each iteration
            build_set_gt, validation_set_gt = validation_set_gt, build_set_gt
            build_set_sm, validation_set_sm = validation_set_sm, build_set_sm

            # lists for error metrics of the current iteration (it)
            it_all_accuracy = []
            it_all_ace = []
            it_all_ece = []
            it_all_mce = []

            if n_classes > 2:
                labels = np.argmax(validation_set_sm, axis=1)
            else:
                labels = np.where(validation_set_sm > 0.5,
                                  np.ones_like(validation_set_gt),
                                  np.zeros_like(validation_set_gt))

            accuracy = np.mean(np.where(labels == validation_set_gt,
                                        np.ones_like(labels), np.zeros_like(labels)))

            it_all_accuracy.append(accuracy)
            it_all_ace.append(ace.measure(validation_set_sm, validation_set_gt))
            it_all_ece.append(ece.measure(validation_set_sm, validation_set_gt))
            it_all_mce.append(mce.measure(validation_set_sm, validation_set_gt))

            # ------------------------------------------
            # build and save models
            for model in models:
                name, instance = model
                print("Build %s model" % name)
                instance.fit(build_set_sm, build_set_gt)

                if save_models:
                    instance.save_model("./models/%s_run_%d.pkl" % (name, i))

            # ------------------------------------------
            # perform predictions
            for model in models:
                _, instance = model
                prediction = instance.transform(validation_set_sm)

                if n_classes > 2:
                    labels = np.argmax(prediction, axis=1)
                else:
                    labels = np.where(prediction > 0.5,
                                      np.ones_like(validation_set_gt),
                                      np.zeros_like(validation_set_gt))

                accuracy = np.mean(np.where(labels == validation_set_gt,
                                            np.ones_like(labels), np.zeros_like(labels)))

                it_all_accuracy.append(accuracy)
                it_all_ace.append(ace.measure(prediction, validation_set_gt))
                it_all_ece.append(ece.measure(prediction, validation_set_gt))
                it_all_mce.append(mce.measure(prediction, validation_set_gt))

            # append lists of the current iteration
            all_accuracy.append(it_all_accuracy)
            all_ace.append(it_all_ace)
            all_ece.append(it_all_ece)
            all_mce.append(it_all_mce)

    # convert to NumPy arrays and reduce mean afterwards
    all_accuracy = np.array(all_accuracy)
    all_ace = np.array(all_ace)
    all_ece = np.array(all_ece)
    all_mce = np.array(all_mce)

    all_accuracy = np.mean(all_accuracy, axis=0)
    all_ace = np.mean(all_ace, axis=0)
    all_ece = np.mean(all_ece, axis=0)
    all_mce = np.mean(all_mce, axis=0)

    names = [len(x[0]) for x in models]
    buffer = max(names)

    # ---------------------------------------------------------
    # output formatted ECE
    fill = (buffer - len("Default")) * " "
    print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
          ("Default", fill, all_accuracy[0], all_ace[0], all_ece[0], all_mce[0]))

    # ---------------------------------------------------------
    for i, model in enumerate(models, start=1):
        name, instance = model
        fill = (buffer - len(name)) * " "
        print("%s%s Accuracy: %.5f - ACE: %.5f - ECE: %.5f - MCE: %.5f" %
              (name, fill, all_accuracy[i], all_ace[i], all_ece[i], all_mce[i]))

    return 0

def evaluate(
    annotations,
    results,
    iou=0.75,
    iou_type="segm",
    dataset="lvis",
    n_bins=10,
    commercial_only=False,
    subset=1.0,
    seed=0.0,
    min_score=0.0,
    vis_dir=None,
    vis_per_class=False,
    max_dets=300,
    max_dets_per_class=-1,
):
    """
    Args:
        annotations (str, Path, or dict): Path to COCO/LVIS-style annotations, or dict
            containing the annotations.
        results (str, Path, or dict): Path to COCO/LVIS-style results, or dict containing
            the results.
        iou (float): IoU threshold to evaluate calibration at.
        iou_type (str): segm or bbox
        dataset (str): lvis or coco
        n_bins (int): Number of bins for calibration eval.
        commercial_only (bool): Use only commercial images for COCO. Used to match the
            Küppers et al. setting.
        subset (float): If <1.0, use a random subset of this portion for eval.
        seed (float): Used to seed the rng for subset selection.
        min_score (float): If specified, ignore detections below this threshold for
            calibration evaluation. This flag does not affect the AP calculation. This
            should generally be left at 0, but can be set to 0.3 to match the Küppers
            et al. setting.
        vis_dir (str, Path, or None): If specified, output reliability diagrams to this
            directory.
        vis_per_class (bool): If vis_dir is specified and vis_per_class is True, output a
            reliability diagram for each class.
        max_dets (int): Limit the number of detections per image.
        max_dets_per_class (int): Limit the number of detections per class.
    """
    if vis_dir is not None:
        vis_dir = Path(vis_dir)
        plotter = ReliabilityDiagram(bins=n_bins, detection=True, metric="ECE")
    else:
        plotter = None

    rng = random.Random(seed)
    eval_wrapper = EvalWrapper(
        annotations,
        results,
        dataset_type=dataset,
        ious=[iou],
        iou_type=iou_type,
        max_dets=max_dets,
        max_dets_per_class=max_dets_per_class,
    )
    eval_obj = eval_wrapper.construct_eval(use_cats=True)
    is_lvis = eval_wrapper.is_lvis()
    params = eval_obj.params
    gt = eval_obj.lvis_gt if is_lvis else eval_obj.cocoGt

    if commercial_only:
        # Licenses 1, 2, 3 are NonCommercial
        valid_licenses = {4, 5, 6, 7, 8}
        orig_img_ids = params.img_ids if is_lvis else params.imgIds
        img_ids = [i for i in orig_img_ids if gt.imgs[i]["license"] in valid_licenses]
        logging.info(f"Selecting {len(img_ids)}/{len(orig_img_ids)} commercial images.")
        if is_lvis:
            params.img_ids = img_ids
        else:
            params.imgIds = img_ids

    if subset < 1.0:
        img_ids = params.img_ids if is_lvis else params.imgIds
        k = int(round(len(img_ids) * subset))
        logging.info(f"Selecting {k}/{len(img_ids)} images randomly.")
        rng.shuffle(img_ids)
        if is_lvis:
            params.img_ids = img_ids[:k]
        else:
            params.imgIds = img_ids[:k]

    eval_obj.evaluate()
    # True positive set
    true_positives, false_positives, missed_gt = load_tp_fp_fn(eval_obj)
    eval_obj.accumulate()
    eval_obj.summarize()

    # Map class id to list of (detection: dict, is_matched: bool)
    class_dets = defaultdict(list)
    for dt_id in true_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, True))
    for dt_id in false_positives:
        ann = eval_wrapper.results.anns[dt_id]
        class_dets[ann["category_id"]].append((ann, False))

    if min_score > 0.0:
        class_dets = {
            c: [x for x in dets if x[0]["score"] > min_score]
            for c, dets in class_dets.items()
        }
        # Remove empty classes.
        class_dets = {c: v for c, v in class_dets.items() if v}

    # Map class id to tuple of (scores, is_matched)
    scores_matched = {
        c: (
            np.array([d["score"] for d, _ in dets])[:, np.newaxis],  # scores, (n, 1)
            np.array([m for _, m in dets])[:, np.newaxis],  # is_matched, (n, 1)
        )
        for c, dets in class_dets.items()
    }
    classes = sorted(scores_matched.keys())
    all_scores = np.vstack([scores_matched[c][0] for c in classes])
    all_is_matched = np.vstack([scores_matched[c][1] for c in classes])

    ece = ECE([n_bins], detection=True)
    output_metrics = {}
    output_metrics["AP"] = eval_obj.results["AP"]
    if is_lvis:
        for f in ("f", "c", "r"):
            output_metrics[f"AP{f}"] = eval_obj.results[f"AP{f}"]

    output_metrics["ece-overall"] = ece.measure(all_scores, all_is_matched)
    if plotter:
        fig = plotter.plot(all_scores, all_is_matched, filename=vis_dir / "overall.pdf")
        plt.close(fig)

    # NOTE: Classes with no predictions are skipped from the per-class calibration error,
    # regardless of whether they have groundtruth.
    per_class_eces = {}
    predicted_classes = set(scores_matched.keys())
    missed_classes = {gt.anns[g]["category_id"] for g in missed_gt}
    for cid in missed_classes | predicted_classes:
        if cid not in predicted_classes:  # present in groundtruth but not predicted
            # Skip class from calibration error.
            continue
        else:
            scores, is_matched = scores_matched[cid]
            per_class_eces[cid] = ece.measure(scores, is_matched)
            if plotter and vis_per_class:
                cname = gt.cats[cid].get("synset", gt.cats[cid]["name"])
                fig = plotter.plot(scores, is_matched,
                                   filename=vis_dir / f"class-{cid}-{cname}.pdf")
                plt.close(fig)

    output_metrics["ece-per-class"] = np.mean(list(per_class_eces.values()))

    if eval_wrapper.is_lvis():
        # Map frequency to category ids (eval_obj.freq_groups maps to indices)
        for f, indices in enumerate(eval_obj.freq_groups):
            freq = eval_obj.params.img_count_lbl[f]
            cat_ids = [eval_obj.params.cat_ids[i] for i in indices]
            cat_ids = [c for c in cat_ids if c in scores_matched]
            freq_scores = np.vstack([scores_matched[c][0] for c in cat_ids])
            freq_matched = np.vstack([scores_matched[c][1] for c in cat_ids])
            output_metrics[f"ece-freq-{freq}"] = ece.measure(freq_scores, freq_matched)
            output_metrics[f"ece-per-class-{freq}"] = np.mean(
                [per_class_eces[c] for c in cat_ids if c in per_class_eces])
            if plotter:
                fig = plotter.plot(freq_scores, freq_matched,
                                   filename=vis_dir / f"freq-{freq}.pdf")
                plt.close(fig)

    return output_metrics

def get_model_diagnosis(df, strategy='quantile', rps_col_prefix='model', add_baseline=False):
    """
    Diagnosis plots:
    Accepts a DataFrame containing the columns
        ordinal_result_1, ordinal_result_2, ordinal_result_3, 1, 2, 3
    The columns are paired as follows: "ordinal_result_1" is a binary column defining
    whether a home-win event occurred, and the column named "1" contains the corresponding
    model probabilities. The same holds for "ordinal_result_2" and "2", and for
    "ordinal_result_3" and "3".

    strategy : {'uniform', 'quantile'}, default: 'quantile'
        Strategy used to define the widths of the bins.
        uniform: all bins have identical widths.
        quantile: all bins have the same number of points.

    RPS plots:
    Accepts a DataFrame containing the columns
        ordinal_result, "<rps_col_prefix>_rps"
    and optional baseline columns named rps_baseline_1, rps_baseline_2, rps_baseline_3.
    """
    fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))

    ax1, ax2, ax3 = axes[:, 0]
    n_bins = 10
    mapper = {1: 'Home Win', 2: 'Draw', 3: 'Away Win'}
    for col, ax in zip([1, 2, 3], (ax1, ax2, ax3)):
        fop, mpv = calibration_curve(df['ordinal_result_' + str(col)], df[col],
                                     n_bins=n_bins, strategy=strategy)
        # plot perfectly calibrated
        ax.plot([0, 1], [0, 1], linestyle='--')
        # plot model reliability
        ax.plot(mpv, fop, marker='.')
        ax.set_title(mapper[col])

    ax4, ax5, ax6 = axes[:, 1]
    n_bins = 10
    mapper = {1: 'Home Win RPS', 2: 'Draw RPS', 3: 'Away Win RPS'}
    for col, ax in zip([1, 2, 3], (ax4, ax5, ax6)):
        rpss = df[df['ordinal_result'] == col][rps_col_prefix + '_rps']
        ax.hist(rpss, bins=n_bins)
        ax.set_xlim(0, 1.0)

        baseline_col_name = 'rps_baseline_' + str(col)
        if add_baseline and baseline_col_name in df.columns:
            ax.axvline(df[baseline_col_name].unique(), color='r')

        median = rpss.median()
        ax.axvline(median, color='r', linestyle='dashed', label=f'Median: {median:.3f}')
        ax.set_title(mapper[col])
        ax.legend()
        ax.grid()

    pred_arr, act_arr = df[[1, 2, 3]].values, df['ordinal_result'].values

    ace = ACE(bins=n_bins)
    ace_val = ace.measure(pred_arr, act_arr)
    ece = ECE(bins=n_bins)
    ece_val = ece.measure(pred_arr, act_arr)
    mce = MCE(bins=n_bins)
    mce_val = mce.measure(pred_arr, act_arr)

    print(f'Average Calibration Error: {ace_val:.3f}\n'
          f'Expected Calibration Error: {ece_val:.3f}\n'
          f'Maximum Calibration Error: {mce_val:.3f}')
    print(f"Number of Instances: {len(df)}")

    return fig, (ax1, ax2, ax3, ax4, ax5, ax6)
