import itertools
import os
import string

import cv2
import editdistance
import numpy as np
import pagexml
import torch
import torch.optim as optim


def eval_cs2satt(path="cs2satt_good.pt"):
    # Restore the attention seq2seq recognizer and its optimizer state.
    net = ConvSeq2SeqAtt(nclass=100).float()
    optimizer = optim.Adam(net.parameters())
    checkpoint = torch.load(path)
    net.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print(f"model current epoch: {epoch} with loss: {loss}")
    net.eval()
    with torch.no_grad():
        # Loop until the module-level `dataset` iterator is exhausted
        # (StopIteration ends the loop).
        while True:
            data = next(dataset)
            images = data["the_inputs"]
            labels = data["the_labels"]
            input_length = data["input_length"]
            label_length = data["label_length"]
            # Teacher-forcing ratio 0: decode purely from the model's own
            # predictions; permute to (batch, timesteps, classes).
            preds = net(images.float(), labels, 0).detach().permute(1, 0, 2)
            for i in range(len(preds)):
                print("labels", labels[i])
                print("preds", preds[i].argmax(1))
                print(labels_to_text(preds[i, :, :].argmax(1),
                                     string.printable))
                cv2.imshow("im", images[i].permute(1, 2, 0).numpy())
                cv2.waitKey(0)
                cv2.destroyAllWindows()
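# `labels_to_text` and `ALPHABET` are defined elsewhere in this repo. The
# definition below is only a sketch of the helper's assumed behavior: map
# integer class indices back to characters over `alphabet`, dropping any
# index outside it (which covers the CTC blank and padding values).
def labels_to_text(labels, alphabet):
    return "".join(alphabet[int(c)] for c in labels
                   if 0 <= int(c) < len(alphabet))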
def decode_batch(test_func, word_batch):
    out = test_func([word_batch])[0]
    ret = []
    for j in range(out.shape[0]):
        # Best-path CTC decoding: take the argmax class per timestep
        # (discarding the first two timesteps), then collapse repeats.
        out_best = list(np.argmax(out[j, 2:], 1))
        out_best = [k for k, g in itertools.groupby(out_best)]
        outstr = labels_to_text(out_best, ALPHABET)
        ret.append(outstr)
    return ret
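# Usage sketch for decode_batch (everything named here is hypothetical):
# `fake_test_func` stands in for a compiled forward pass returning per-timestep
# class scores of shape (batch, timesteps, n_classes). The collapse above
# turns e.g. argmax labels [5, 5, 0, 5] into [5, 0, 5] before blank removal.
def _demo_decode_batch():
    rng = np.random.default_rng(0)

    def fake_test_func(inputs):  # stands in for the real recognizer
        return [rng.random((2, 16, len(ALPHABET) + 1))]

    return decode_batch(fake_test_func, np.zeros((2, 32, 128, 1)))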
def show_edit_distance(self, num):
    num_left = num
    mean_norm_ed = 0.0
    mean_ed = 0.0
    while num_left > 0:
        word_batch = next(self.text_img_gen)[0]
        num_proc = min(word_batch["the_input"].shape[0], num_left)
        decoded_res = decode_batch(self.test_func,
                                   word_batch["the_input"][0:num_proc])
        for j in range(num_proc):
            pred = decoded_res[j].strip()
            truth = labels_to_text(word_batch["the_labels"][j], ALPHABET)
            edit_dist = editdistance.eval(pred, truth)
            mean_ed += float(edit_dist)
            # Normalize by the longer string so the measure stays in [0, 1];
            # the extra 1 guards against both strings being empty.
            mean_norm_ed += float(edit_dist) / max(len(truth), len(pred), 1)
        num_left -= num_proc
    mean_norm_ed = mean_norm_ed / num
    mean_ed = mean_ed / num
    print("\nOut of %d samples: Mean edit distance: "
          "%.3f / Mean normalized edit distance: %0.3f"
          % (num, mean_ed, mean_norm_ed))
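# Worked example of the normalization above (illustration only): "hello" vs.
# "helo" is one deletion, so the edit distance is 1 and the normalized
# distance is 1 / max(5, 4) = 0.2.
def _demo_norm_edit_distance():
    pred, truth = "helo", "hello"
    dist = editdistance.eval(pred, truth)
    return dist / max(len(truth), len(pred), 1)  # -> 0.2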
def calculate_map(all_detections, all_text_preds, all_annotations,
                  all_text_annots, binary, generator, retinanet,
                  iou_threshold=0.5, score_threshold=0.05,
                  max_detections=400, save_path=None):
    average_precisions = {}
    cers = []
    # In binary mode every class is merged into a single 'Text' class.
    n_classes = 1 if binary else generator.num_classes()
    for label in range(n_classes):
        false_positives = np.zeros((0,))
        true_positives = np.zeros((0,))
        scores = np.zeros((0,))
        num_annotations = 0.0
        for i in range(len(generator)):
            if binary:
                detections = np.concatenate(all_detections[i][:])
                annotations = np.concatenate(all_annotations[i][:])
                text_dets = np.concatenate([
                    all_text_preds.get(i, {}).get(lab, {})
                    for lab in range(generator.num_classes())
                ])
                text_annots = np.concatenate([
                    all_text_annots.get(i, {}).get(lab, {})
                    for lab in range(generator.num_classes())
                ])
            else:
                detections = all_detections[i][label]
                annotations = all_annotations[i][label]
                text_dets = all_text_preds.get(i, {}).get(label,
                                                          torch.zeros(1))
                text_annots = all_text_annots[i][label]
            num_annotations += annotations.shape[0]
            detected_annotations = []
            for j in range(len(detections)):
                d = detections[j]
                scores = np.append(scores, d[4])
                if annotations.shape[0] == 0:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)
                    continue
                # Match the detection to the ground-truth box with highest IoU.
                overlaps = compute_overlap(np.expand_dims(d, axis=0),
                                           annotations)
                assigned_annotation = np.argmax(overlaps, axis=1)
                max_overlap = overlaps[0, assigned_annotation]
                if (max_overlap >= iou_threshold
                        and assigned_annotation not in detected_annotations):
                    false_positives = np.append(false_positives, 0)
                    true_positives = np.append(true_positives, 1)
                    # Decode both transcriptions; the original if/else on
                    # retinanet.module.binary_classifier had identical
                    # branches, so it is collapsed here.
                    pred_str = labels_to_text(text_dets[j].astype('int'),
                                              retinanet.module.alphabet)
                    lab_str = labels_to_text(
                        text_annots[assigned_annotation[0]].astype('int'),
                        retinanet.module.alphabet)
                    # Per-box character error rate; an empty ground truth
                    # counts as a full error.
                    if len(lab_str) > 0:
                        cer = float(editdistance.eval(pred_str, lab_str)
                                    / len(lab_str))
                    else:
                        cer = 1.
                    cers.append(cer)
                    detected_annotations.append(assigned_annotation)
                else:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)

        # No annotations for this class -> its AP is defined as 0.
        if num_annotations == 0:
            average_precisions[label] = 0, 0
            continue

        # Sort by score, then accumulate true/false positives.
        indices = np.argsort(-scores)
        false_positives = np.cumsum(false_positives[indices])
        true_positives = np.cumsum(true_positives[indices])

        # Compute recall and precision.
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(
            true_positives + false_positives, np.finfo(np.float64).eps)

        # Compute average precision.
        average_precision = _compute_ap(recall, precision)
        average_precisions[label] = average_precision, num_annotations

    if binary:
        print('{}: {}'.format('Text', average_precisions[0][0]))
    else:
        for label in range(generator.num_classes()):
            label_name = generator.label_to_name(label)
            print('{}: {}'.format(label_name, average_precisions[label][0]))

    mAP = np.mean([v[0] for v in average_precisions.values()])
    mean_cer = np.mean(cers)
    print('mAP', mAP)
    print("Per box CER", mean_cer)
    return mAP, mean_cer
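# `_compute_ap` is defined elsewhere in this repo; for reference, this is a
# sketch of the standard all-point interpolated AP (as used in PASCAL VOC and
# keras-retinanet), which matches the recall/precision arrays built above.
def _compute_ap(recall, precision):
    # Append sentinel values at both ends of the curve.
    mrec = np.concatenate(([0.], recall, [1.]))
    mpre = np.concatenate(([0.], precision, [0.]))
    # Make the precision envelope monotonically decreasing, right to left.
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    # Sum the rectangle areas wherever recall changes.
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])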
def generate_pagexml(image_id, data, retinanet, score_threshold,
                     nms_threshold, dataset_val):
    image_name = image_id + '.jpg'
    im_file_out = 'pagexmls/' + image_name
    alphabet = retinanet.alphabet
    colors = get_n_random_colors(len(dataset_val.labels))
    gtxml_name = image_name.split('/')[-1].split('.')[-2]
    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        im = data['img'].cuda().float()
        if retinanet.htr_gt_box:
            # Run recognition on the ground-truth boxes instead of detections.
            scores, classification, transformed_anchors, transcriptions = \
                retinanet([im, data['annot']])
            score_threshold = 0
        else:
            scores, classification, transformed_anchors, transcriptions = \
                retinanet(im)
        n_boxes_predicted = transformed_anchors.shape[0]
        print(n_boxes_predicted, "boxes predicted")

        # Undo the input normalization and convert back to a uint8 image.
        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()
        img = np.clip(img, 0, 255)
        img = np.transpose(img, (1, 2, 0))
        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        height, width = img.shape[:2]
        cv2.imwrite(im_file_out, img)

        # One page-wide region and line; word elements are attached below.
        conf = pagexml.ptr_double()
        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height, conf)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height, conf)
        words = []
        for k in range(len(dataset_val.labels)):
            cv2.putText(img, dataset_val.labels[k], (25, 25 + k * 15),
                        cv2.FONT_HERSHEY_PLAIN, 1, colors[k], 2)
        transcriptions = np.argmax(transcriptions.cpu(), axis=-1)
        for box_id in range(n_boxes_predicted):
            bbox = transformed_anchors[box_id, :]
            transcription = transcriptions[box_id, :]
            x1, y1, x2, y2 = (int(bbox[0]), int(bbox[1]),
                              int(bbox[2]), int(bbox[3]))
            label_name = dataset_val.labels[int(classification[box_id])]
            cv2.rectangle(img, (x1, y1), (x2, y2),
                          color=colors[int(classification[box_id])],
                          thickness=2)
            # Add a word element with its bounding box, transcription, and
            # predicted category.
            word = pxml.addWord(line, "ID" + str(box_id))
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1, conf)
            transcription = labels_to_text(transcription, alphabet)
            draw_caption(img, (x1, y1, x2, y2), transcription)
            conf.assign(0.9)
            pxml.setTextEquiv(word, transcription, conf)
            pxml.setProperty(word, "category", label_name)
            words.append(word)

        # Sort words into reading order and group them into lines.
        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine')
        group_idx = 0
        idx_in_group = 0
        for n in order:
            word_idx = order.index(n)
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0
            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        # Write the Page-XML and the annotated image to disk.
        pxml.write('pagexmls/' + gtxml_name + ".xml")
        cv2.imwrite(os.path.join('pred_sample_ims', str(image_id) + '.jpg'),
                    img)
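# Hypothetical driver for generate_pagexml (loader and thresholds assumed):
# the output folders are created up front since the function writes into them.
def _demo_generate_pagexml(dataloader_val, dataset_val, retinanet):
    os.makedirs('pagexmls', exist_ok=True)
    os.makedirs('pred_sample_ims', exist_ok=True)
    for idx, data in enumerate(dataloader_val):
        generate_pagexml(str(idx), data, retinanet, score_threshold=0.5,
                         nms_threshold=0.4, dataset_val=dataset_val)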
def evaluate(generator, retinanet, iou_threshold=0.5, score_threshold=0.05,
             max_detections=400, save_path=None):
    """Evaluate a given dataset using a given retinanet.

    # Arguments
        generator       : The generator that represents the dataset to evaluate.
        retinanet       : The retinanet to evaluate.
        iou_threshold   : The IoU threshold above which a detection counts as a true positive.
        score_threshold : The score confidence threshold to use for detections.
        max_detections  : The maximum number of detections to use per image.
        save_path       : The path to save images with visualized detections to.
    # Returns
        A dict mapping each class label to an (average precision, num annotations) pair.
    """
    # Gather all detections and annotations up front.
    all_detections, all_text_preds = _get_detections(
        generator, retinanet, score_threshold=score_threshold,
        max_detections=max_detections, save_path=save_path)
    all_annotations, all_text_annots = _get_annotations(generator)
    average_precisions = {}
    cers = []
    for label in range(generator.num_classes()):
        false_positives = np.zeros((0,))
        true_positives = np.zeros((0,))
        scores = np.zeros((0,))
        num_annotations = 0.0
        for i in range(len(generator)):
            detections = all_detections[i][label]
            annotations = all_annotations[i][label]
            text_dets = all_text_preds[i][label]
            text_annots = all_text_annots[i][label]
            num_annotations += annotations.shape[0]
            detected_annotations = []
            for j in range(len(detections)):
                d = detections[j]
                scores = np.append(scores, d[4])
                if annotations.shape[0] == 0:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)
                    continue
                # Match the detection to the ground-truth box with highest IoU.
                overlaps = compute_overlap(np.expand_dims(d, axis=0),
                                           annotations)
                assigned_annotation = np.argmax(overlaps, axis=1)
                max_overlap = overlaps[0, assigned_annotation]
                if (max_overlap >= iou_threshold
                        and assigned_annotation not in detected_annotations):
                    false_positives = np.append(false_positives, 0)
                    true_positives = np.append(true_positives, 1)
                    # Character error rate between the predicted and
                    # ground-truth transcriptions of the matched box.
                    pred_str = labels_to_text(text_dets[j].astype('int'),
                                              retinanet.module.alphabet)
                    lab_str = labels_to_text(
                        text_annots[assigned_annotation[0]].astype('int'),
                        retinanet.module.alphabet)
                    if len(lab_str) > 0:
                        cer = float(editdistance.eval(pred_str, lab_str)
                                    / len(lab_str))
                        cers.append(cer)
                        print("CER", cer)
                    detected_annotations.append(assigned_annotation)
                else:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)

        # No annotations for this class -> its AP is defined as 0.
        if num_annotations == 0:
            average_precisions[label] = 0, 0
            continue

        # Sort by score, then accumulate true/false positives.
        indices = np.argsort(-scores)
        false_positives = np.cumsum(false_positives[indices])
        true_positives = np.cumsum(true_positives[indices])

        # Compute recall and precision.
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(
            true_positives + false_positives, np.finfo(np.float64).eps)

        # Compute average precision.
        average_precision = _compute_ap(recall, precision)
        average_precisions[label] = average_precision, num_annotations

    print('\nmAP:')
    for label in range(generator.num_classes()):
        label_name = generator.label_to_name(label)
        print('{}: {}'.format(label_name, average_precisions[label][0]))
    print("Per box CER", np.mean(cers))
    return average_precisions
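# Minimal usage sketch (all setup hypothetical): `evaluate` expects a dataset
# generator exposing num_classes()/label_to_name() and a DataParallel-wrapped
# model whose .module carries the recognition alphabet.
def _demo_evaluate(generator, model):
    retinanet = torch.nn.DataParallel(model).cuda().eval()
    aps = evaluate(generator, retinanet, iou_threshold=0.5)
    return np.mean([ap for ap, _ in aps.values()])  # per-class AP -> mAP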