import itertools
import os
import string
import time

import cv2
import editdistance
import numpy as np
import pagexml
import torch
import torch.optim as optim

# Project-local names used below (ConvSeq2SeqAtt, dataset, ALPHABET,
# UnNormalizer, get_n_random_colors, _get_detections, _get_annotations) are
# assumed to come from the surrounding codebase.


def eval_cs2satt(path="cs2satt_good.pt"):
    """Restore a ConvSeq2SeqAtt checkpoint and inspect its predictions."""
    net = ConvSeq2SeqAtt(nclass=100).float()
    optimizer = optim.Adam(net.parameters())

    checkpoint = torch.load(path)
    net.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    epoch = checkpoint["epoch"]
    loss = checkpoint["loss"]
    print(f"model current epoch: {epoch} with loss: {loss}")

    net.eval()

    with torch.no_grad():
        # `dataset` is a module-level batch generator.
        while True:
            data = next(dataset)
            images = data["the_inputs"]
            labels = data["the_labels"]
            # input_length / label_length are part of the batch dict but unused here.
            input_length = data["input_length"]
            label_length = data["label_length"]

            # Teacher-forcing ratio 0: decode from the model's own outputs,
            # then move the batch dimension first.
            preds = net(images.float(), labels, 0).detach().permute(1, 0, 2)
            for i in range(len(preds)):
                print("labels", labels[i])
                print("preds", preds[i].argmax(1))
                print(labels_to_text(preds[i].argmax(1), string.printable))
                cv2.imshow("im", images[i].permute(1, 2, 0).numpy())
                cv2.waitKey(0)
                cv2.destroyAllWindows()
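
# The snippets in this listing rely on a project-local labels_to_text helper
# that is not shown. A minimal sketch, assuming each label is an integer index
# into `alphabet` and out-of-range values (blank / padding) are dropped:
def labels_to_text(labels, alphabet):
    # Keep only indices that fall inside the alphabet and join their symbols.
    return "".join(alphabet[int(i)] for i in labels if 0 <= int(i) < len(alphabet))
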
def decode_batch(test_func, word_batch):
    """Run the test function on a batch and best-path-decode each output."""
    out = test_func([word_batch])[0]
    ret = []
    for j in range(out.shape[0]):
        # Greedy best path; the first two timesteps are skipped, as in the
        # Keras OCR example, where the RNN's first outputs tend to be garbage.
        out_best = list(np.argmax(out[j, 2:], 1))
        # Collapse consecutive repeats (CTC decoding rule).
        out_best = [k for k, g in itertools.groupby(out_best)]
        outstr = labels_to_text(out_best, ALPHABET)
        ret.append(outstr)
    return ret
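
# The collapse step in decode_batch, in isolation: itertools.groupby merges
# consecutive duplicates, which is CTC best-path decoding once blank tokens
# are stripped (assumed to happen inside labels_to_text).
#
#     >>> [k for k, g in itertools.groupby([3, 3, 7, 7, 7, 2, 2])]
#     [3, 7, 2]
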
# Method of a Keras-OCR-style visualization callback: self.text_img_gen yields
# batches and self.test_func maps inputs to softmax outputs.
def show_edit_distance(self, num):
    num_left = num
    mean_norm_ed = 0.0
    mean_ed = 0.0
    while num_left > 0:
        word_batch = next(self.text_img_gen)[0]
        num_proc = min(word_batch["the_input"].shape[0], num_left)
        decoded_res = decode_batch(
            self.test_func, word_batch["the_input"][0:num_proc]
        )
        for j in range(num_proc):
            pred = decoded_res[j].strip()
            truth = labels_to_text(word_batch["the_labels"][j], ALPHABET)
            edit_dist = editdistance.eval(pred, truth)
            mean_ed += float(edit_dist)
            # Normalize by the longer string so the value stays in [0, 1].
            mean_norm_ed += float(edit_dist) / max(len(truth), len(pred))
        num_left -= num_proc
    mean_norm_ed = mean_norm_ed / num
    mean_ed = mean_ed / num
    print(
        "\nOut of %d samples:  Mean edit distance: "
        "%.3f / Mean normalized edit distance: %0.3f" % (num, mean_ed, mean_norm_ed)
    )
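
# The metric used above, in miniature: editdistance.eval returns the
# Levenshtein distance, and dividing by the longer string bounds the
# normalized value to [0, 1].
#
#     >>> editdistance.eval("hello", "helo")
#     1
#     >>> editdistance.eval("hello", "helo") / max(len("hello"), len("helo"))
#     0.2
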
def calculate_map(all_detections,
                  all_text_preds,
                  all_annotations,
                  all_text_annots,
                  binary,
                  generator,
                  retinanet,
                  iou_threshold=0.5,
                  score_threshold=0.05,
                  max_detections=400,
                  save_path=None):
    """Compute per-class AP, mAP, and per-box CER from precomputed detections.

    With binary=True, all classes are collapsed into a single 'Text' class.
    """
    average_precisions = {}
    cers = []
    if binary:
        n_classes = 1
    else:
        n_classes = generator.num_classes()
    for label in range(n_classes):
        false_positives = np.zeros((0, ))
        true_positives = np.zeros((0, ))
        scores = np.zeros((0, ))
        num_annotations = 0.0

        for i in range(len(generator)):
            if binary:
                # Merge detections and annotations across all classes.
                detections = np.concatenate(all_detections[i][:])
                annotations = np.concatenate(all_annotations[i][:])
                text_dets = np.concatenate([
                    all_text_preds.get(i, {}).get(lab, {})
                    for lab in range(generator.num_classes())
                ])
                text_annots = np.concatenate([
                    all_text_annots.get(i, {}).get(lab, {})
                    for lab in range(generator.num_classes())
                ])
            else:
                detections = all_detections[i][label]
                annotations = all_annotations[i][label]
                text_dets = all_text_preds.get(i,
                                               {}).get(label, torch.zeros(1))
                text_annots = all_text_annots[i][label]

            num_annotations += annotations.shape[0]
            detected_annotations = []
            for j in range(len(detections)):
                d = detections[j]
                scores = np.append(scores, d[4])

                if annotations.shape[0] == 0:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)
                    continue

                overlaps = compute_overlap(np.expand_dims(d, axis=0),
                                           annotations)
                assigned_annotation = np.argmax(overlaps, axis=1)
                max_overlap = overlaps[0, assigned_annotation]

                if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                    false_positives = np.append(false_positives, 0)
                    true_positives = np.append(true_positives, 1)
                    # Both the binary and multi-class cases decode with the
                    # same alphabet, so no branch is needed here.
                    pred_str = labels_to_text(text_dets[j].astype('int'),
                                              retinanet.module.alphabet)
                    lab_str = labels_to_text(
                        text_annots[assigned_annotation[0]].astype('int'),
                        retinanet.module.alphabet)
                    if len(lab_str) > 0:
                        cer = float(
                            editdistance.eval(pred_str, lab_str) /
                            len(lab_str))
                    else:
                        cer = 1.
                    cers.append(cer)

                    detected_annotations.append(assigned_annotation)
                else:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)

        # no annotations -> AP for this class is 0 (is this correct?)
        if num_annotations == 0:
            average_precisions[label] = 0, 0
            continue
        # sort by score
        indices = np.argsort(-scores)
        false_positives = false_positives[indices]
        true_positives = true_positives[indices]

        # compute false positives and true positives
        false_positives = np.cumsum(false_positives)
        true_positives = np.cumsum(true_positives)

        # compute recall and precision
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(
            true_positives + false_positives,
            np.finfo(np.float64).eps)

        # compute average precision
        average_precision = _compute_ap(recall, precision)
        average_precisions[label] = average_precision, num_annotations

    if binary:
        # Binary mode reports a single merged 'Text' class.
        print('Text: {}'.format(average_precisions[0][0]))
    else:
        for label in range(generator.num_classes()):
            label_name = generator.label_to_name(label)
            print('{}: {}'.format(label_name, average_precisions[label][0]))

    # Mean of the per-class APs, plus the aggregate CER over matched boxes.
    mAP = np.mean([ap for ap, _ in average_precisions.values()])
    print('mAP', mAP)
    mean_cer = np.mean(cers)
    print("Per box CER", mean_cer)
    return mAP, mean_cer
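
# _compute_ap is not defined in this listing. A sketch of the standard
# VOC-style computation used in keras-retinanet-derived codebases (an
# assumption about this repo, not its verbatim code): build the precision
# envelope and integrate it over the points where recall changes.
def _compute_ap(recall, precision):
    # Sentinel values so the envelope starts at recall 0 and ends at 1.
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([0.0], precision, [0.0]))
    # Make precision monotonically non-increasing from right to left.
    for i in range(mpre.size - 1, 0, -1):
        mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
    # Sum the areas under the envelope where recall changes.
    idx = np.where(mrec[1:] != mrec[:-1])[0]
    return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx])
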
def generate_pagexml(image_id, data, retinanet, score_threshold, nms_threshold,
                     dataset_val):
    """Run detection and transcription on one image and write a PAGE-XML file."""
    image_name = image_id + '.jpg'
    im_file_out = 'pagexmls/' + image_name
    alphabet = retinanet.alphabet
    #retinanet.score_threshold = torch.tensor(score_threshold).cuda().float()
    colors = get_n_random_colors(len(dataset_val.labels))
    # Base name (no directory, no extension) for the output XML.
    gtxml_name = os.path.splitext(os.path.basename(image_name))[0]

    pxml = pagexml.PageXML()
    unnormalize = UnNormalizer()
    with torch.no_grad():
        st = time.time()
        im = data['img']

        im = im.cuda().float()
        print(retinanet.htr_gt_box)
        if retinanet.htr_gt_box:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                [im, data['annot']])
            score_threshold = 0
        else:
            scores, classification, transformed_anchors, transcriptions = retinanet(
                im)

        n_boxes_predicted = transformed_anchors.shape[0]
        print(n_boxes_predicted, "BOXES PREDICTED")

        img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy()

        img[img < 0] = 0
        img[img > 255] = 255

        img = np.transpose(img, (1, 2, 0))

        img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB)
        width = img.shape[1]
        height = img.shape[0]
        cv2.imwrite(im_file_out, img)

        conf = pagexml.ptr_double()
        pxml.newXml('retinanet_dets', image_name, width, height)
        page = pxml.selectNth("//_:Page", 0)
        reg = pxml.addTextRegion(page)
        pxml.setCoordsBBox(reg, 0, 0, width, height, conf)
        line = pxml.addTextLine(reg)
        pxml.setCoordsBBox(line, 0, 0, width, height, conf)
        words = []
        for k in range(len(dataset_val.labels)):
            cv2.putText(img, dataset_val.labels[k], (25, 25 + k * 15),
                        cv2.FONT_HERSHEY_PLAIN, 1, colors[k], 2)
        transcriptions = np.argmax(transcriptions.cpu(), axis=-1)
        for box_id in range(n_boxes_predicted):

            # Initialize object for setting confidence values
            box = {}
            bbox = transformed_anchors[box_id, :]
            transcription = transcriptions[box_id, :]
            x1 = int(bbox[0])
            y1 = int(bbox[1])
            x2 = int(bbox[2])
            y2 = int(bbox[3])
            label_name = dataset_val.labels[int(classification[box_id])]

            cv2.rectangle(img, (x1, y1), (x2, y2),
                          color=colors[int(classification[box_id])],
                          thickness=2)

            # Add a text region to the Page
            word = pxml.addWord(line, "ID" + str(box_id))

            # Set text region bounding box with a confidence
            pxml.setCoordsBBox(word, x1, y1, x2 - x1, y2 - y1, conf)

            #pxml.setCoordsBBox( reg,x1, y1, x2-x1, y2-y1, conf )
            #transcription = transcripts[j]
            transcription = labels_to_text(transcription, alphabet)
            draw_caption(img, (x1, y1, x2, y2), transcription)

            # Set the text for the text region
            conf.assign(0.9)
            pxml.setTextEquiv(word, transcription, conf)

            # Add property to text region
            pxml.setProperty(word, "category", label_name)

            # Add a second page with a text region and specific id
            #page = pxml.addPage("example_image_2.jpg", 300, 300)
            #reg = pxml.addTextRegion( page, "regA" )
            #pxml.setCoordsBBox( reg, 15, 12, 76, 128 )
            words.append(word)
        words = pxml.select('//_:Word')
        order, groups = pxml.getLeftRightTopBottomReadingOrder(
            words, fake_baseline=True, max_horiz_iou=1, prolong_alpha=0.0)
        line = pxml.selectNth('//_:TextLine')
        group_idx = 0
        idx_in_group = 0
        for word_idx, n in enumerate(order):
            if idx_in_group >= groups[group_idx]:
                group_idx += 1
                idx_in_group = 0

            # Record the reading-order position and line grouping, then move
            # the word element under its text line.
            pxml.setProperty(words[n], 'word_idx', str(word_idx))
            pxml.setProperty(words[n], "line", str(group_idx))
            pxml.moveElem(words[n], line)
            idx_in_group += 1

        # Write XML to file
        pxml.write('pagexmls/' + gtxml_name + ".xml")
        cv2.imwrite(os.path.join('pred_sample_ims',
                                 str(image_id) + '.jpg'), img)
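
# draw_caption is not shown; a plausible sketch matching pytorch-retinanet's
# visualization helper (an assumption, not this repo's verbatim code):
def draw_caption(image, box, caption):
    # Draw the text twice, black under white, so it stays legible on any background.
    b = np.array(box).astype(int)
    cv2.putText(image, caption, (b[0], b[1] - 10),
                cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2)
    cv2.putText(image, caption, (b[0], b[1] - 10),
                cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1)
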
def evaluate(generator,
             retinanet,
             iou_threshold=0.5,
             score_threshold=0.05,
             max_detections=400,
             save_path=None):
    """ Evaluate a given dataset using a given retinanet.
    # Arguments
        generator       : The generator that represents the dataset to evaluate.
        retinanet           : The retinanet to evaluate.
        iou_threshold   : The threshold used to consider when a detection is positive or negative.
        score_threshold : The score confidence threshold to use for detections.
        max_detections  : The maximum number of detections to use per image.
        save_path       : The path to save images with visualized detections to.
    # Returns
        A dict mapping class names to mAP scores.
    """

    # gather all detections and annotations
    all_detections, all_text_preds = _get_detections(
        generator,
        retinanet,
        score_threshold=score_threshold,
        max_detections=max_detections,
        save_path=save_path)
    all_annotations, all_text_annots = _get_annotations(generator)
    average_precisions = {}
    cers = []
    for label in range(generator.num_classes()):
        false_positives = np.zeros((0, ))
        true_positives = np.zeros((0, ))
        scores = np.zeros((0, ))
        num_annotations = 0.0

        for i in range(len(generator)):
            detections = all_detections[i][label]
            annotations = all_annotations[i][label]
            text_dets = all_text_preds[i][label]
            text_annots = all_text_annots[i][label]
            num_annotations += annotations.shape[0]
            detected_annotations = []
            for j in range(len(detections)):
                d = detections[j]
                scores = np.append(scores, d[4])

                if annotations.shape[0] == 0:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)
                    continue

                overlaps = compute_overlap(np.expand_dims(d, axis=0),
                                           annotations)
                assigned_annotation = np.argmax(overlaps, axis=1)
                max_overlap = overlaps[0, assigned_annotation]

                if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
                    false_positives = np.append(false_positives, 0)
                    true_positives = np.append(true_positives, 1)
                    pred_str = labels_to_text(text_dets[j].astype('int'),
                                              retinanet.module.alphabet)
                    lab_str = labels_to_text(
                        text_annots[assigned_annotation[0]].astype('int'),
                        retinanet.module.alphabet)
                    if len(lab_str) > 0:
                        cer = float(
                            editdistance.eval(pred_str, lab_str) /
                            len(lab_str))
                    else:
                        # Empty ground-truth string: count as fully wrong
                        # instead of reusing a stale `cer` from a previous box.
                        cer = 1.

                    cers.append(cer)
                    print("CER", cer)
                    detected_annotations.append(assigned_annotation)
                else:
                    false_positives = np.append(false_positives, 1)
                    true_positives = np.append(true_positives, 0)

        # no annotations -> AP for this class is 0 (is this correct?)
        if num_annotations == 0:
            average_precisions[label] = 0, 0
            continue
        # sort by score
        indices = np.argsort(-scores)
        false_positives = false_positives[indices]
        true_positives = true_positives[indices]

        # compute false positives and true positives
        false_positives = np.cumsum(false_positives)
        true_positives = np.cumsum(true_positives)

        # compute recall and precision
        recall = true_positives / num_annotations
        precision = true_positives / np.maximum(
            true_positives + false_positives,
            np.finfo(np.float64).eps)

        # compute average precision
        average_precision = _compute_ap(recall, precision)
        average_precisions[label] = average_precision, num_annotations

    print('\nmAP:')
    for label in range(generator.num_classes()):
        label_name = generator.label_to_name(label)
        print('{}: {}'.format(label_name, average_precisions[label][0]))
    print("Per box CER", np.mean(cers))
    return average_precisions
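
# compute_overlap is referenced but not defined here; the reference
# keras-retinanet version is a compiled kernel with the same contract. A plain
# NumPy sketch returning the (N, K) IoU matrix between detection boxes
# (x1, y1, x2, y2[, score]) and annotation boxes:
def compute_overlap(boxes, query_boxes):
    n, k = boxes.shape[0], query_boxes.shape[0]
    overlaps = np.zeros((n, k), dtype=np.float64)
    for i in range(n):
        for j in range(k):
            # Intersection width/height; non-positive means no overlap.
            iw = min(boxes[i, 2], query_boxes[j, 2]) - max(boxes[i, 0], query_boxes[j, 0])
            ih = min(boxes[i, 3], query_boxes[j, 3]) - max(boxes[i, 1], query_boxes[j, 1])
            if iw > 0 and ih > 0:
                inter = iw * ih
                union = ((boxes[i, 2] - boxes[i, 0]) * (boxes[i, 3] - boxes[i, 1])
                         + (query_boxes[j, 2] - query_boxes[j, 0])
                         * (query_boxes[j, 3] - query_boxes[j, 1])
                         - inter)
                overlaps[i, j] = inter / union
    return overlaps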