def _derive_coco_results(self, coco_eval, iou_type):
    """
    Derive the desired score numbers from summarized COCOeval.

    Args:
        coco_eval (None or COCOEval): None represents no predictions from model.
        iou_type (str): specific evaluation task.

    Returns:
        a dict of {metric name: score}
    """
    metrics = ["AP", "mMR", "Recall"]

    if coco_eval is None:
        self._logger.warning("No predictions from the model! Set scores to -1")
        return {metric: -1 for metric in metrics}

    # the standard metrics
    results = {
        metric: coco_eval[idx] for idx, metric in enumerate(metrics)
    }
    small_table = create_small_table(results)
    self._logger.info("Evaluation results for {}: \n".format(iou_type) + small_table)
    return results
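# NOTE: `create_small_table` is used by all of the evaluators in this file but is
# defined elsewhere in the repo. The following is only a minimal sketch of such a
# helper, assuming it renders a flat {metric: score} dict as a one-row pipe table
# with `tabulate`; the real signature and formatting may differ.
from tabulate import tabulate

def create_small_table_sketch(small_dict):
    """Render a small {metric: score} dict as a one-row, pipe-style table."""
    keys, values = tuple(zip(*small_dict.items()))
    return tabulate(
        [values], headers=keys, tablefmt="pipe", floatfmt=".3f", numalign="center"
    )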
def _eval_classification_accuracy(self):
    """
    Evaluate self._predictions on the classification task.
    Fill self._results with the metrics of the tasks.
    """
    batch_size = len(self._targets)
    pred = torch.cat(self._predictions, dim=1)
    target = torch.cat(self._targets)
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    results = {}
    for k in self._topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        results[f"Top_{k} Acc"] = correct_k.mul_(100.0 / batch_size).item()
    self._results["Accuracy"] = results

    small_table = create_small_table(results)
    self._logger.info("Evaluation results for classification: \n" + small_table)

    if self._dump:
        dump_info_one_task = {
            "task": "classification",
            "tables": [small_table],
        }
        self._dump_infos.append(dump_info_one_task)
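# For reference, a hypothetical `process()` showing how self._predictions and
# self._targets could be accumulated so that the accuracy code above works:
# each element of self._predictions is a (maxk, batch) tensor of top-k class
# indices and each element of self._targets holds per-sample labels. The output
# layout and the "label" field name are assumptions, not this repo's actual API.
def process_sketch(self, inputs, outputs):
    maxk = max(self._topk)
    # outputs: (batch, num_classes) classification scores (assumed layout)
    _, pred = outputs.topk(maxk, dim=1, largest=True, sorted=True)
    self._predictions.append(pred.t().cpu())     # shape (maxk, batch)
    self._targets.append(inputs["label"].cpu())  # shape (batch,)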
def _derive_lvis_results(self, lvis_eval, iou_type, summary):
    """
    Derive the desired score numbers from summarized LVISEval.

    Args:
        lvis_eval (None or LVISEval): None represents no predictions from model.
        iou_type (str): specific evaluation task, optional values are:
            "bbox", "segm".

    Returns:
        a dict of {metric name: score}
    """
    metrics = {
        "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
        "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl", "APr", "APc", "APf"],
    }[iou_type]

    if lvis_eval is None:
        logger.warning("No predictions from the model!")
        return {metric: float("nan") for metric in metrics}

    # Pull the standard metrics from the LVIS results
    results = lvis_eval.get_results()
    results = {metric: float(results[metric] * 100) for metric in metrics}

    small_table = create_small_table(results)
    logger.info("Evaluation results for {}: \n".format(iou_type) + small_table)

    if self._dump:
        dump_info_one_task = {
            "task": iou_type,
            "summary": summary.getvalue(),
            "tables": [small_table],
        }
        self._dump_infos.append(dump_info_one_task)
    return results
def evaluate(self):
    """
    Returns:
        dict: has a key "bbox", whose value is a dict of "AP", "AP50", and "AP75".
    """
    all_predictions = comm.gather(self._predictions, dst=0)
    if not comm.is_main_process():
        return
    predictions = defaultdict(list)
    for predictions_per_rank in all_predictions:
        for clsid, lines in predictions_per_rank.items():
            predictions[clsid].extend(lines)
    del all_predictions

    self._logger.info(
        "Evaluating {} using {} metric. "
        "Note that results do not use the official Matlab API.".format(
            self._dataset_name, 2007 if self._is_2007 else 2012))

    with tempfile.TemporaryDirectory(prefix="pascal_voc_eval_") as dirname:
        res_file_template = os.path.join(dirname, "{}.txt")

        aps = defaultdict(list)  # iou -> ap per class
        for cls_id, cls_name in enumerate(self._class_names):
            lines = predictions.get(cls_id, [""])

            with open(res_file_template.format(cls_name), "w") as f:
                f.write("\n".join(lines))

            for thresh in range(50, 100, 5):
                rec, prec, ap = voc_eval(
                    res_file_template,
                    self._anno_file_template,
                    self._image_set_path,
                    cls_name,
                    ovthresh=thresh / 100.0,
                    use_07_metric=self._is_2007,
                )
                aps[thresh].append(ap * 100)

    ret = OrderedDict()
    mAP = {iou: np.mean(x) for iou, x in aps.items()}
    ret["bbox"] = {
        "AP": np.mean(list(mAP.values())),
        "AP50": mAP[50],
        "AP75": mAP[75],
    }

    small_table = create_small_table(ret["bbox"])
    self._logger.info("Evaluation results for bbox: \n" + small_table)
    if self._dump:
        dump_info_one_task = {
            "task": "bbox",
            "tables": [small_table],
        }
        _dump_to_markdown([dump_info_one_task])
    return ret
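# Each entry in `predictions[cls_id]` above is expected to be one detection line
# in the plain-text format consumed by `voc_eval`:
#     "<image_id> <score> <xmin> <ymin> <xmax> <ymax>"
# A hedged sketch of building such a line; the +1 shift to VOC's 1-based pixel
# coordinates is an assumption about this repo's convention.
def format_voc_detection_sketch(image_id, score, box_xyxy):
    xmin, ymin, xmax, ymax = box_xyxy
    return "{} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}".format(
        image_id, score, xmin + 1, ymin + 1, xmax + 1, ymax + 1
    )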
def dump(self, results):
    self._logger.info("Dump metric to {}".format(self._output_file))
    small_table = create_small_table(results)
    self._logger.info("Evaluation results for mse:\n" + small_table)
    with open(self._output_file, "w") as f:
        f.write("MSE Evaluator:\n" + small_table)
        f.write("\n\n")
        for k, v in results.items():
            f.write(str(k) + "\t\t" + str(v) + "\n")
def _eval_box_proposals(self):
    """
    Evaluate the box proposals in self._predictions.
    Fill self._results with the metrics for "box_proposals" task.
    """
    if self._output_dir:
        # Saving generated box proposals to file.
        # Predicted box_proposals are in XYXY_ABS mode.
        bbox_mode = BoxMode.XYXY_ABS.value
        ids, boxes, objectness_logits = [], [], []
        for prediction in self._predictions:
            ids.append(prediction["image_id"])
            boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy())
            objectness_logits.append(prediction["proposals"].objectness_logits.numpy())

        proposal_data = {
            "boxes": boxes,
            "objectness_logits": objectness_logits,
            "ids": ids,
            "bbox_mode": bbox_mode,
        }
        with megfile.smart_open(
                os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f:
            pickle.dump(proposal_data, f)

    if not self._do_evaluation:
        logger.info("Annotations are not available for evaluation.")
        return

    logger.info("Evaluating bbox proposals ...")
    res = {}
    areas = {"all": "", "small": "s", "medium": "m", "large": "l"}
    for limit in [100, 1000]:
        for area, suffix in areas.items():
            stats = _evaluate_box_proposals(
                self._predictions, self._coco_api, area=area, limit=limit)
            key = "AR{}@{:d}".format(suffix, limit)
            res[key] = float(stats["ar"].item() * 100)
    logger.info("Proposal metrics: \n" + create_small_table(res))
    self._results["box_proposals"] = res
def _eval_classification_accuracy(self):
    """
    Evaluate self._predictions on the classification task.
    Fill self._results with the metrics of the tasks.
    """
    batch_size = len(self._targets)
    pred = torch.cat(self._predictions, dim=1)
    target = torch.cat(self._targets)
    correct = pred.eq(target.reshape(1, -1).expand_as(pred))

    results = {}
    macro_f1_score = f1_score(target.detach().cpu().numpy(),
                              pred[0].detach().cpu().numpy(),
                              average='macro')
    results["Macro_F1"] = macro_f1_score

    # Update with accuracy of the sub-groups
    sub_group_accuracy = self._eval_longtail_subgroup_accuracy(pred, target)
    keys = ['Many', 'Medium', 'Few']
    for iidx, key in enumerate(self._topk):
        correct_k = correct[:key].reshape(-1).float().sum(0, keepdim=True)
        results[f"Top_{key} Acc"] = correct_k.mul_(100.0 / batch_size).item()
        for idx, subgroup in enumerate(keys):
            results[f'Top_{key} {subgroup} Acc'] = sub_group_accuracy[idx][iidx]
    self._results["Accuracy"] = results

    small_table = create_small_table(results)
    logger.info("Evaluation results for classification: \n" + small_table)
    if self._dump:
        dump_info_one_task = {
            "task": "classification",
            "tables": [small_table],
            "dataset": self.dataset_name,
        }
        self._dump_infos.append(dump_info_one_task)
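# `_eval_longtail_subgroup_accuracy` is referenced above but defined elsewhere.
# A rough, hypothetical sketch of the computation it is assumed to perform:
# split classes into "Many"/"Medium"/"Few" frequency buckets and compute top-k
# accuracy over the samples whose ground-truth class falls in each bucket.
# `self._subgroup_class_ids` is a made-up attribute; the returned layout is
# inferred from its use above: result[subgroup_idx][topk_idx].
def _eval_longtail_subgroup_accuracy_sketch(self, pred, target):
    sub_group_accuracy = []
    for class_ids in self._subgroup_class_ids:  # e.g. [many_ids, medium_ids, few_ids]
        mask = torch.isin(target, torch.as_tensor(class_ids, device=target.device))
        num = int(mask.sum().item())
        accs = []
        for k in self._topk:
            topk_pred = pred[:k][:, mask]  # (k, num) top-k predictions for this bucket
            hit = topk_pred.eq(target[mask].reshape(1, -1).expand_as(topk_pred))
            accs.append(hit.any(dim=0).float().sum().item() * 100.0 / max(num, 1))
        sub_group_accuracy.append(accs)
    return sub_group_accuracy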
def _derive_coco_results(self, coco_eval, iou_type):
    """
    Derive the desired score numbers from summarized COCOeval.

    Args:
        coco_eval (None or COCOEval): None represents no predictions from model.
        iou_type (str): specific evaluation task.

    Returns:
        a dict of {metric name: score}
    """
    metrics = ["Reasonable", "Reasonable_small", "Reasonable_occ=heavy", "All"]

    if coco_eval is None:
        logger.warning("No predictions from the model! Set scores to -1")
        return {metric: -1 for metric in metrics}

    # the standard metrics
    results = {
        metric: coco_eval[idx] for idx, metric in enumerate(metrics)
    }
    small_table = create_small_table(results)
    logger.info("Evaluation results for {}: \n".format(iou_type) + small_table)
    if self._dump:
        dump_info_one_task = {
            "task": iou_type,
            "tables": [small_table],
        }
        self._dump_infos.append(dump_info_one_task)
    return results
def _derive_coco_results(self, coco_eval, iou_type, summary, class_names=None):
    """
    Derive the desired score numbers from summarized COCOeval.

    Args:
        coco_eval (None or COCOEval): None represents no predictions from model.
        iou_type (str): specific evaluation task, optional values are:
            "bbox", "segm", "keypoints".
        class_names (None or list[str]): if provided, will use it to compute
            per-category AP.

    Returns:
        a dict of {metric name: score}
    """
    metrics = {
        "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
        "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"],
        "keypoints": ["AP", "AP50", "AP75", "APm", "APl"],
    }[iou_type]

    if coco_eval is None:
        logger.warning("No predictions from the model!")
        return {metric: float("nan") for metric in metrics}

    # the standard metrics
    results = {
        metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan")
        for idx, metric in enumerate(metrics)
    }
    small_table = create_small_table(results)
    logger.info("Evaluation results for {}: \n".format(iou_type) + small_table)
    if not np.isfinite(sum(results.values())):
        logger.info("Note that some metrics cannot be computed.")

    if class_names is None:
        return results

    # Compute per-category AP
    # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252  # noqa
    precisions = coco_eval.eval["precision"]
    # precision has dims (iou, recall, cls, area range, max dets)
    assert len(class_names) == precisions.shape[2]

    results_per_category = {}
    for idx, name in enumerate(class_names):
        # area range index 0: all area ranges
        # max dets index -1: typically 100 per image
        precision = precisions[:, :, idx, 0, -1]
        precision = precision[precision > -1]
        ap = np.mean(precision) if precision.size else float("nan")
        results_per_category[name] = float(ap * 100)

    # tabulate it
    table = create_table_with_header(results_per_category, headers=["category", "AP"])
    logger.info("Per-category {} AP: \n".format(iou_type) + table)
    results.update({"AP-" + name: ap for name, ap in results_per_category.items()})

    if self._dump:
        dump_info_one_task = {
            "task": iou_type,
            "summary": summary.getvalue(),
            "tables": [small_table, table],
        }
        self._dump_infos.append(dump_info_one_task)
    return results
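# `create_table_with_header` is another shared helper used above. A minimal
# sketch of what it is assumed to do: spread a {category: AP} dict across up to
# three "category | AP" column pairs so long per-category lists stay readable
# (a common Detectron-style layout); the real helper's behavior may differ.
import itertools
from tabulate import tabulate

def create_table_with_header_sketch(row_dict, headers=("category", "AP")):
    n_cols = min(6, len(row_dict) * 2)
    flat = list(itertools.chain(*row_dict.items()))
    rows = itertools.zip_longest(*[flat[i::n_cols] for i in range(n_cols)])
    return tabulate(
        rows, tablefmt="pipe", floatfmt=".3f",
        headers=list(headers) * (n_cols // 2), numalign="left",
    )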
def evaluate(self):
    """
    Returns:
        dict: has a key "segm", whose value is a dict of "AP" and "AP50".
    """
    comm.synchronize()
    if comm.get_rank() > 0:
        return
    os.environ["CITYSCAPES_DATASET"] = os.path.abspath(
        os.path.join(self._metadata.gt_dir, "..", "..")
    )
    # Load the Cityscapes eval script *after* setting the required env var,
    # since the script reads CITYSCAPES_DATASET into global variables at load time.
    import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval

    self._logger.info("Evaluating results under {} ...".format(self._temp_dir))

    # set some global states in cityscapes evaluation API, before evaluating
    cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
    cityscapes_eval.args.predictionWalk = None
    cityscapes_eval.args.JSONOutput = False
    cityscapes_eval.args.colorized = False
    cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")

    # These lines are adopted from
    # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py  # noqa
    groundTruthImgList = glob.glob(cityscapes_eval.args.groundTruthSearch)
    assert len(
        groundTruthImgList
    ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
        cityscapes_eval.args.groundTruthSearch
    )
    predictionImgList = []
    for gt in groundTruthImgList:
        predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args))
    results = cityscapes_eval.evaluateImgLists(
        predictionImgList, groundTruthImgList, cityscapes_eval.args
    )["averages"]

    ret = OrderedDict()
    ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
    self._working_dir.cleanup()

    small_table = create_small_table(ret["segm"])
    self._logger.info("Evaluation results for segm: \n" + small_table)

    results_per_category = []
    for cat, ap in results["classes"].items():
        ap = [ap_i * 100 for ap_i in ap.values()]
        results_per_category.append([cat, *ap])
    table = tabulate(
        results_per_category,
        headers=["category", "AP", "AP50"],
        tablefmt="pipe",
        floatfmt=".3f",
        numalign="left",
    )
    self._logger.info("Per-category segm AP: \n" + table)

    if self._dump:
        dump_info_one_task = {
            "task": "segm",
            "tables": [small_table, table],
        }
        _dump_to_markdown([dump_info_one_task])
    return ret
def evaluate(self):
    """
    Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

    * Mean intersection-over-union averaged across classes (mIoU)
    * Frequency Weighted IoU (fwIoU)
    * Mean pixel accuracy averaged across classes (mACC)
    * Pixel Accuracy (pACC)
    """
    if self._distributed:
        comm.synchronize()
        conf_matrix_list = comm.all_gather(self._conf_matrix)
        self._predictions = comm.all_gather(self._predictions)
        self._predictions = list(itertools.chain(*self._predictions))
        if not comm.is_main_process():
            return

        self._conf_matrix = np.zeros_like(self._conf_matrix)
        for conf_matrix in conf_matrix_list:
            self._conf_matrix += conf_matrix

    if self._output_dir:
        PathManager.mkdirs(self._output_dir)
        file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
        with PathManager.open(file_path, "w") as f:
            f.write(json.dumps(self._predictions))

    acc = np.zeros(self._num_classes, dtype=np.float64)
    iou = np.zeros(self._num_classes, dtype=np.float64)
    tp = self._conf_matrix.diagonal()[:-1].astype(np.float64)
    pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(np.float64)
    class_weights = pos_gt / np.sum(pos_gt)
    pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(np.float64)
    acc_valid = pos_gt > 0
    acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
    iou_valid = (pos_gt + pos_pred) > 0
    union = pos_gt + pos_pred - tp
    iou[acc_valid] = tp[acc_valid] / union[acc_valid]
    macc = np.sum(acc) / np.sum(acc_valid)
    miou = np.sum(iou) / np.sum(iou_valid)
    fiou = np.sum(iou * class_weights)
    pacc = np.sum(tp) / np.sum(pos_gt)

    res = {}
    res["mIoU"] = 100 * miou
    res["fwIoU"] = 100 * fiou
    res["mACC"] = 100 * macc
    res["pACC"] = 100 * pacc

    if self._output_dir:
        file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
        with PathManager.open(file_path, "wb") as f:
            torch.save(res, f)
    results = OrderedDict({"sem_seg": res})

    small_table = create_small_table(res)
    self._logger.info("Evaluation results for sem_seg: \n" + small_table)
    if self._dump:
        dump_info_one_task = {
            "task": "sem_seg",
            "tables": [small_table],
        }
        _dump_to_markdown([dump_info_one_task])
    return results
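# For context, a hedged sketch of how self._conf_matrix could be accumulated so
# that the arithmetic above works out: a (num_classes + 1) x (num_classes + 1)
# matrix whose rows index the prediction and columns the ground truth, with the
# last row/column reserved for the ignore label (hence the [:-1, :-1] slicing
# above). The ignore-label convention here is an assumption, not this repo's API.
import numpy as np

def accumulate_conf_matrix_sketch(conf_matrix, pred, gt, num_classes):
    """pred, gt: integer arrays of equal shape; ignore label == num_classes."""
    conf_matrix += np.bincount(
        (num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
        minlength=conf_matrix.size,
    ).reshape(conf_matrix.shape)
    return conf_matrix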