def test_greedy_assignment(self):
    gt_big_mask = self.square_mask.copy()
    gt_small_mask = np.zeros(gt_big_mask.shape, dtype=gt_big_mask.dtype)
    gt_small_mask[900:1101, 900:1101] = True
    gt_big = GroundTruthInstance(gt_big_mask, 0)
    gt_small = GroundTruthInstance(gt_small_mask, 0)
    gts = [gt_big, gt_small]
    # Big Det
    det1 = PBoxDetInst(
        self.square_label_list, [700, 700, 1300, 1300],
        [[[10000, 0], [0, 10000]], [[10000, 0], [0, 10000]]])
    # Small Det
    det2 = PBoxDetInst(
        self.square_label_list, [800, 800, 1200, 1200],
        [[[10000, 0], [0, 10000]], [[10000, 0], [0, 10000]]])
    dets = [det1, det2]
    evaluator = PDQ(greedy_mode=True)
    evaluator.score([(gts, dets)])
    det_evals = evaluator._det_evals
    for img_det_evals in det_evals:
        for det_eval in img_det_evals:
            # Under greedy assignment the big detection should be matched to the
            # small gt and the small detection to the big gt
            self.assertNotEqual(det_eval['det_id'], det_eval['gt_id'])
def test_detection_no_gt(self):
    detections = [BBoxDetInst(self.default_label_list, self.gt_box)]
    gts = []
    evaluator = PDQ()
    score = evaluator.score([(gts, detections)])
    self.assertEqual(score, 0)
def test_half_position_confidence(self):
    detections = [
        BBoxDetInst(self.square_label_list, self.square_gt_box, 0.5)
    ]
    evaluator = PDQ()
    score = evaluator.score([(self.square_gt, detections)])
    self.assertAlmostEqual(np.sqrt(0.5), score, 4)
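# A hedged worked check of the value asserted in test_half_position_confidence
# above. It assumes, as the assertion implies, that the pairwise PDQ is the
# geometric mean of spatial and label quality, with a perfectly placed box at
# 0.5 positional confidence giving a spatial quality of 0.5 and a label quality
# of 1.0. The helper name is illustrative and not part of the test suite.
def _expected_half_position_confidence_score():
    spatial_quality = 0.5  # assumed: exp(mean(log(0.5))) over the GT pixels
    label_quality = 1.0    # correct class reported with full confidence
    return np.sqrt(spatial_quality * label_quality)  # ~0.7071, i.e. np.sqrt(0.5)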
def test_correct_second_detection(self):
    gts = [val for val in self.square_gt]
    detections = [BBoxDetInst(self.default_label_list, [0, 0, 10, 10]),
                  BBoxDetInst(self.default_label_list, self.gt_box)]
    evaluator = PDQ()
    score = evaluator.score([(gts, detections)])
    self.assertAlmostEqual(score, 0.5)
def test_multiple_detections(self):
    ten_detections = [
        BBoxDetInst(self.square_label_list, self.square_gt_box)
        for _ in range(10)
    ]
    evaluator = PDQ()
    score = evaluator.score([(self.square_gt, ten_detections)])
    self.assertAlmostEqual(score, 0.1)
def test_no_detections_for_image(self):
    gts1 = [val for val in self.square_gt]
    gts2 = [GroundTruthInstance(self.square_mask, 0, 1, 1)]
    dets1 = [BBoxDetInst(self.default_label_list, self.gt_box)]
    dets2 = []
    evaluator = PDQ()
    score = evaluator.score([(gts1, dets1), (gts2, dets2)])
    self.assertAlmostEqual(score, 0.5)
def test_run_score_multiple_times_on_single_pdq_instance(self):
    two_detections = [BBoxDetInst(self.default_label_list, self.gt_box),
                      BBoxDetInst(self.default_label_list, self.gt_box)]
    one_detection = [BBoxDetInst(self.default_label_list, self.gt_box)]
    evaluator = PDQ()
    score_two = evaluator.score([(self.square_gt, two_detections)])
    score_one = evaluator.score([(self.square_gt, one_detection)])
    self.assertAlmostEqual(score_two, 0.5)
    self.assertAlmostEqual(score_one, 1.0)
def test_no_detections_for_image_with_too_small_gt(self):
    gts1 = [val for val in self.square_gt]
    small_mask = np.zeros(self.img_size, dtype=bool)  # np.bool is removed in recent NumPy
    small_mask[500:504, 500:501] = True
    gts2 = [GroundTruthInstance(small_mask, 0, 1, 1)]
    dets1 = [BBoxDetInst(self.default_label_list, self.gt_box)]
    dets2 = []
    evaluator = PDQ()
    score = evaluator.score([(gts1, dets1), (gts2, dets2)])
    self.assertAlmostEqual(score, 1.0)
def test_detect_500_extra_pixels(self):
    det_box = [val for val in self.gt_box]
    det_box[2] += 1
    detections = [BBoxDetInst(self.default_label_list, det_box)]
    evaluator = PDQ()
    score = evaluator.score([(self.square_gt, detections)])
    expected_spatial_quality = np.exp((_MAX_LOSS * 500) / (500 * 500))
    expected_gmean = np.sqrt(expected_spatial_quality)
    self.assertAlmostEqual(expected_gmean, score, 4)
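# A hedged sketch of the arithmetic behind test_detect_500_extra_pixels above.
# Widening the box by one column adds roughly 500 background pixels, each
# penalised at _MAX_LOSS; that loss is averaged over the 500x500 ground-truth
# pixels, and with label quality 1.0 the final score is the square root
# (geometric mean) of the resulting spatial quality. The helper name and the
# max_loss argument are illustrative only.
def _expected_extra_column_score(max_loss):
    extra_pixels = 500          # one extra column along a 500-pixel-high box
    num_gt_pixels = 500 * 500   # area of the square ground truth
    spatial_quality = np.exp((max_loss * extra_pixels) / num_gt_pixels)
    return np.sqrt(spatial_quality * 1.0)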
def main(method, n_classes):
    if method == 1:
        print("Extracting GT and Detections")
        param_sequence, len_sequences = gen_param_sequence()

        print("Calculating PDQ")
        # Get summary statistics (PDQ, avg_qualities)
        evaluator = PDQ(filter_gts=True, segment_mode=False, greedy_mode=False)
        pdq = evaluator.score(param_sequence)
        TP, FP, FN = evaluator.get_assignment_counts()
        avg_spatial_quality = evaluator.get_avg_spatial_score()
        avg_label_quality = evaluator.get_avg_label_score()
        avg_overall_quality = evaluator.get_avg_overall_quality_score()
        avg_fg_quality = evaluator.get_avg_fg_quality_score()
        avg_bg_quality = evaluator.get_avg_bg_quality_score()

        # Get the detection-wise and ground-truth-wise qualities and matches for PDQ
        all_gt_eval_dicts = evaluator._gt_evals
        all_det_eval_dicts = evaluator._det_evals

        # Compile the PDQ summary statistics into a single dictionary
        result = {
            "PDQ": pdq,
            "avg_pPDQ": avg_overall_quality,
            "avg_spatial": avg_spatial_quality,
            "avg_fg": avg_fg_quality,
            "avg_bg": avg_bg_quality,
            "avg_label": avg_label_quality,
            "TP": TP,
            "FP": FP,
            "FN": FN
        }
        return result

    # Calculate mAP
    if method == 0:
        print("Calculating mAP")
        print("Extracting GT and Detections")
        param_sequence, len_sequences = gen_param_sequence()
        mAP = coco_mAP(param_sequence, n_classes, use_heatmap=False)
        return mAP
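# A hedged usage sketch for main(method, n_classes) as defined above: method 1
# runs the PDQ evaluation and returns a dictionary of summary statistics, while
# method 0 returns the COCO mAP. The class count below is an arbitrary
# placeholder rather than a value taken from the project.
if __name__ == '__main__':
    pdq_result = main(method=1, n_classes=80)
    print("PDQ: {0}, TP: {1}, FP: {2}, FN: {3}".format(
        pdq_result["PDQ"], pdq_result["TP"], pdq_result["FP"], pdq_result["FN"]))
    map_score = main(method=0, n_classes=80)
    print("mAP: {0}".format(map_score))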
def test_multiple_missed_gts_too_small(self):
    gts = [val for val in self.square_gt]
    for i in range(9):
        # Create small 2x2 boxes which are missed around an edge of the image (buffer of 2 pixels)
        new_gt_mask = np.zeros(gts[0].segmentation_mask.shape,
                               gts[0].segmentation_mask.dtype)
        new_gt_mask[2:4, 2 + i * 4:4 + i * 4] = np.amax(gts[0].segmentation_mask)
        gts.append(GroundTruthInstance(new_gt_mask, 0, 0, i + 1))
    detections = [BBoxDetInst(self.default_label_list, self.gt_box)]
    evaluator = PDQ()
    score = evaluator.score([(gts, detections)])
    self.assertAlmostEqual(score, 1.0)
def test_cross_gt_detected_by_perfect_box_in_segment_mode(self):
    detections = [BBoxDetInst(self.cross_label_list, self.cross_gt_box)]
    evaluator = PDQ(segment_mode=True)
    score = evaluator.score([(self.cross_gt, detections)])
    expected_num_missed_pixels = np.sum(
        np.logical_xor(self.square_mask, self.cross_mask))
    expected_spatial_quality = np.exp(
        (_MAX_LOSS * expected_num_missed_pixels) / np.sum(self.cross_mask))
    expected_gmean = np.sqrt(expected_spatial_quality)
    self.assertAlmostEqual(score, expected_gmean, 4)
def test_no_detections_for_image_with_small_and_big_gt_with_filtering(self):
    gts1 = [val for val in self.square_gt]
    small_mask = np.zeros(self.img_size, dtype=bool)  # np.bool is removed in recent NumPy
    small_mask[500:504, 500:501] = True
    gts2 = [
        GroundTruthInstance(self.square_mask, 0),
        GroundTruthInstance(small_mask, 0)
    ]
    dets1 = [BBoxDetInst(self.square_label_list, self.square_gt_box)]
    dets2 = []
    evaluator = PDQ(filter_gts=True)
    score = evaluator.score([(gts1, dets1), (gts2, dets2)])
    self.assertAlmostEqual(score, 0.5)
def test_no_overlap_box(self):
    det_box = [val for val in self.gt_box]
    box_width = (self.gt_box[2] + 1) - self.gt_box[0]
    det_box[0] += box_width
    det_box[2] += box_width
    detections = [BBoxDetInst(self.default_label_list, det_box)]
    evaluator = PDQ()
    score = evaluator.score([(self.square_gt, detections)])
    expected_spatial_quality = 0
    expected_gmean = np.sqrt(expected_spatial_quality)
    self.assertAlmostEqual(expected_gmean, score, 4)
def do_evaluation(submission_dir, ground_truth_dir):
    """
    Evaluate a particular image sequence
    :param submission_dir: location of the detections .json files (one for each sequence)
    :param ground_truth_dir: location of the ground-truth folders (one for each sequence)
    :return: Dictionary containing a summary of all metrics used in the competition leaderboard
    """
    ground_truth = gt_loader.read_ground_truth(ground_truth_dir)
    detections = submission_loader.read_submission(
        submission_dir, expected_sequence_names=set(ground_truth.keys()))
    matches = gt_loader.match_sequences(ground_truth, detections)
    evaluator = PDQ()
    score = evaluator.score(matches)
    TP, FP, FN = evaluator.get_assignment_counts()
    avg_spatial_quality = evaluator.get_avg_spatial_score()
    avg_label_quality = evaluator.get_avg_label_score()
    avg_overall_quality = evaluator.get_avg_overall_quality_score()
    return {
        'score': score * 100,
        'avg_spatial': avg_spatial_quality,
        'avg_label': avg_label_quality,
        'avg_pPDQ': avg_overall_quality,
        'TPs': TP,
        'FPs': FP,
        'FNs': FN
    }
def test_missed_gts_and_unmatched_detections(self):
    gts = [val for val in self.square_gt]
    for i in range(10):
        # Create small 12x12 boxes which are missed around an edge of the image (buffer of 2 pixels)
        new_gt_mask = np.zeros(gts[0].segmentation_mask.shape,
                               gts[0].segmentation_mask.dtype)
        new_gt_mask[2:14, 2 + i * 14:14 + i * 14] = np.amax(
            gts[0].segmentation_mask)
        gts.append(GroundTruthInstance(new_gt_mask, 0))
    detections = [
        BBoxDetInst(self.square_label_list, self.square_gt_box)
        for _ in range(10)
    ]
    evaluator = PDQ()
    score = evaluator.score([(gts, detections)])
    self.assertAlmostEqual(score, 1 / 20.)
def do_evaluation(submission_dir, ground_truth_dir, sequences=None, num_frames=-1, start_frame=0):
    """
    Evaluate a particular image sequence
    :param submission_dir: location of the detections .json files (one for each sequence)
    :param ground_truth_dir: location of the ground-truth folders (one for each sequence).
    Each ground-truth folder must contain mask images (.png format) and a matching labels.json file.
    :param sequences: A whitelist of sequence ids to include, as integers
    :param num_frames: The number of frames to read from each sequence, default is all available.
    :param start_frame: The index of the first frame to read
    :return: Dictionary containing summary of all metrics used in competition leaderboard
    (score, average spatial quality, average label quality, average overall quality (avg_pPDQ),
    true positives, false positives, and false negatives)
    """
    ground_truth = gt_loader.read_ground_truth(ground_truth_dir, sequences,
                                               start_index=start_frame,
                                               end_index=start_frame + num_frames)
    detections = submission_loader.read_submission(
        submission_dir,
        expected_sequence_names=set(ground_truth.keys()),
        start_index=start_frame,
        end_index=start_frame + num_frames)
    matches = gt_loader.match_sequences(ground_truth, detections)
    evaluator = PDQ()
    score = evaluator.score(matches)
    TP, FP, FN = evaluator.get_assignment_counts()
    avg_spatial_quality = evaluator.get_avg_spatial_score()
    avg_label_quality = evaluator.get_avg_label_score()
    avg_overall_quality = evaluator.get_avg_overall_quality_score()
    avg_fp_quality = evaluator.get_avg_fp_score()
    return {
        'score': score * 100,
        'avg_spatial': avg_spatial_quality,
        'avg_label': avg_label_quality,
        'avg_pPDQ': avg_overall_quality,
        'avg_fp_quality': avg_fp_quality,
        'TPs': TP,
        'FPs': FP,
        'FNs': FN
    }
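# A hedged usage sketch for do_evaluation above. The directory paths and the
# sequence whitelist are hypothetical placeholders; the dictionary keys match
# those constructed in the function body.
results = do_evaluation('/path/to/submission', '/path/to/ground_truth',
                        sequences=[0, 1, 2], num_frames=-1, start_frame=0)
print('score: {0:.4f}'.format(results['score']))
print('TPs: {TPs}, FPs: {FPs}, FNs: {FNs}'.format(**results))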
def main():
    if not os.path.isdir(args.save_folder):
        os.makedirs(args.save_folder)

    print("Extracting GT and Detections")
    param_sequence, len_sequences = gen_param_sequence()

    print("Calculating PDQ")
    # Get summary statistics (PDQ, avg_qualities)
    evaluator = PDQ(filter_gts=(args.test_set == 'rvc1'),
                    segment_mode=args.segment_mode,
                    greedy_mode=args.greedy_mode)
    pdq = evaluator.score(param_sequence)
    TP, FP, FN = evaluator.get_assignment_counts()
    avg_spatial_quality = evaluator.get_avg_spatial_score()
    avg_label_quality = evaluator.get_avg_label_score()
    avg_overall_quality = evaluator.get_avg_overall_quality_score()
    avg_fg_quality = evaluator.get_avg_fg_quality_score()
    avg_bg_quality = evaluator.get_avg_bg_quality_score()

    # Get the detection-wise and ground-truth-wise qualities and matches for PDQ and save them to file
    all_gt_eval_dicts = evaluator._gt_evals
    all_det_eval_dicts = evaluator._det_evals

    # Calculate mAP
    print("Calculating mAP")
    # generate the parameter sequence again for new tests (generator does not hold onto data once used)
    print("Extracting GT and Detections")
    param_sequence, len_sequences = gen_param_sequence()
    if args.mAP_heatmap:
        mAP = coco_mAP(param_sequence, use_heatmap=True)
    else:
        mAP = coco_mAP(param_sequence, use_heatmap=False)
    print('mAP: {0}'.format(mAP))

    # Calculate LRP
    print("Calculating LRP")
    # generate the parameter sequence again for new tests (generator does not hold onto data once used)
    print("Extracting GT and Detections")
    param_sequence, len_sequences = gen_param_sequence()
    # Use the same BBox definition as would be used for mAP
    # Extract all moLRP statistics
    if args.mAP_heatmap:
        LRP_dict = coco_LRP(param_sequence, use_heatmap=True, full=True)
    else:
        LRP_dict = coco_LRP(param_sequence, use_heatmap=False, full=True)

    # Compile evaluation statistics into a single dictionary
    result = {
        "PDQ": pdq,
        "avg_pPDQ": avg_overall_quality,
        "avg_spatial": avg_spatial_quality,
        "avg_fg": avg_fg_quality,
        "avg_bg": avg_bg_quality,
        "avg_label": avg_label_quality,
        "TP": TP,
        "FP": FP,
        "FN": FN,
        "mAP": mAP,
        "moLRP": LRP_dict['moLRP'],
        "moLRPLoc": LRP_dict['moLRPLoc'],
        "moLRPFP": LRP_dict['moLRPFP'],
        "moLRPFN": LRP_dict['moLRPFN']
    }

    print("PDQ: {0:4f}\n"
          "mAP: {1:4f}\n"
          "avg_pPDQ:{2:4f}\n"
          "avg_spatial:{3:4f}\n"
          "avg_label:{4:4f}\n"
          "avg_foreground:{5:4f}\n"
          "avg_background:{6:4f}\n"
          "TP:{7}\nFP:{8}\nFN:{9}\n"
          "moLRP:{10:4f}\n"
          "moLRPLoc:{11:4f}\n"
          "moLRPFP:{12:4f}\n"
          "moLRPFN:{13:4f}\n".format(pdq, mAP, avg_overall_quality, avg_spatial_quality,
                                     avg_label_quality, avg_fg_quality, avg_bg_quality,
                                     TP, FP, FN, LRP_dict['moLRP'], LRP_dict['moLRPLoc'],
                                     LRP_dict['moLRPFP'], LRP_dict['moLRPFN']))

    # Save evaluation statistics to file
    with open(os.path.join(args.save_folder, 'scores.txt'), 'w') as output_file:
        output_file.write("\n".join("{0}:{1}".format(k, v)
                                    for k, v in sorted(result.items())))

    # Save pairwise PDQ statistics to file for use in visualisation code (separate file for each sequence)
    prev_idx = 0
    for idx, len_sequence in enumerate(len_sequences):
        seq_gt_eval_dicts = all_gt_eval_dicts[prev_idx:prev_idx + len_sequence]
        seq_det_eval_dicts = all_det_eval_dicts[prev_idx:prev_idx + len_sequence]
        prev_idx += len_sequence
        with open(os.path.join(args.save_folder,
                               'gt_eval_stats_{:02d}.json'.format(idx)), 'w') as f:
            json.dump(seq_gt_eval_dicts, f)
        with open(os.path.join(args.save_folder,
                               'det_eval_stats_{:02d}.json'.format(idx)), 'w') as f:
            json.dump(seq_det_eval_dicts, f)
def test_half_position_and_label_confidences(self):
    detections = [BBoxDetInst([0.5, 0.5], self.gt_box, 0.5)]
    evaluator = PDQ()
    score = evaluator.score([(self.square_gt, detections)])
    self.assertAlmostEqual(0.5, score, 4)
def test_no_detection(self):
    detections = []
    evaluator = PDQ()
    score = evaluator.score([(self.square_gt, detections)])
    self.assertEqual(score, 0)
def test_cross_gt_detected_by_perfect_box_in_non_segment_mode(self):
    detections = [BBoxDetInst(self.cross_label_list, self.cross_gt_box)]
    evaluator = PDQ()
    score = evaluator.score([(self.cross_gt, detections)])
    self.assertAlmostEqual(score, 1, 4)
def test_perfect_bbox(self):
    detections = [BBoxDetInst(self.default_label_list, self.gt_box)]
    evaluator = PDQ()
    score = evaluator.score([(self.square_gt, detections)])
    self.assertAlmostEqual(score, 1, 4)