def get_words_in_bounding_boxes(extracted_bboxes, gt_bboxes, chars, page_num):
    """Find which characters fall inside extracted and ground-truth boxes.

    Despite the name, the function works on the individual items of
    ``chars`` and reports their positions (indices), not the items.

    Args:
        extracted_bboxes: Mapping from page number to an iterable of
            extracted bounding boxes. A missing page means "no boxes".
        gt_bboxes: Mapping from page number to an iterable of ground-truth
            bounding boxes. A missing page means "no boxes".
        chars: Iterable of objects exposing ``y0``, ``x0``, ``y1`` and
            ``x1`` attributes. Items lacking any of them are skipped.
        page_num: Key used to look up the boxes in both mappings.

    Returns:
        A tuple ``(extracted_chars, gt_chars)`` of index lists: positions in
        ``chars`` contained in at least one extracted box, and positions
        contained in at least one ground-truth box, respectively.
    """
    extracted_bbox = extracted_bboxes.get(page_num, [])
    gt_bbox = gt_bboxes.get(page_num, [])

    extracted_chars = []
    gt_chars = []
    for index, char in enumerate(chars):
        # Only the attribute access is guarded: an item without coordinates
        # cannot be placed in any box, whereas an error raised by the
        # containment check itself should not be hidden.
        try:
            char_box = (char.y0, char.x0, char.y1, char.x1)
        except AttributeError:
            continue

        # Generator expressions let any() stop at the first matching box.
        if any(isContained(char_box, box) for box in extracted_bbox):
            extracted_chars.append(index)
        if any(isContained(char_box, box) for box in gt_bbox):
            gt_chars.append(index)
    return extracted_chars, gt_chars
Esempio n. 2
0
def compute_overlap_matrix(pdf_bboxes, iou_thresh):
    """Build a square 0/1 matrix flagging pairs of overlapping boxes.

    Each entry of ``pdf_bboxes`` is indexed with ``[0]`` and sliced with
    ``[-4:]``; the first element must match between two entries for them to
    be compared (presumably a page identifier -- confirm against callers),
    and the last four elements are handed to the geometry helpers.

    Args:
        pdf_bboxes: Sequence of box records as described above.
        iou_thresh: Value the IoU of a pair must strictly exceed to be
            flagged, unless one box is contained in the other.

    Returns:
        A ``(len(pdf_bboxes), len(pdf_bboxes))`` float array where entry
        ``[i, j]`` is ``1.0`` when records ``i`` and ``j`` are distinct,
        share the same first element, overlap according to ``doOverlap``
        and either exceed ``iou_thresh`` in IoU or contain one another;
        every other entry (including the diagonal) is ``0.0``.
    """
    count = len(pdf_bboxes)
    overlap = np.zeros((count, count))
    for row, first in enumerate(pdf_bboxes):
        for col, second in enumerate(pdf_bboxes):
            # A record is never compared with itself or with a record whose
            # leading element differs.
            if row == col or first[0] != second[0]:
                continue
            first_box, second_box = first[-4:], second[-4:]
            if not doOverlap(first_box, second_box):
                continue
            if (compute_iou(first_box, second_box) > iou_thresh
                    or isContained(first_box, second_box)
                    or isContained(second_box, first_box)):
                overlap[row, col] = 1.
    return overlap
Esempio n. 3
0
def get_lines_within_bbox(bbox, segments):
    """Return the segments that lie inside ``bbox``.

    Args:
        bbox: Sequence whose last four elements describe the enclosing box
            passed to ``isContained``.
        segments: Iterable of objects exposing ``y0``, ``x0``, ``y1`` and
            ``x1`` attributes convertible with ``int()``.

    Returns:
        A new list with the segments, in their original order, whose
        integer-truncated ``(y0, x0, y1, x1)`` box is reported as contained
        in the last four elements of ``bbox``.
    """
    return [
        segment
        for segment in segments
        if isContained(
            (int(segment.y0), int(segment.x0),
             int(segment.y1), int(segment.x1)),
            bbox[-4:],
        )
    ]
Esempio n. 4
0
def get_mentions_within_bbox(bbox, mentions):
    """Return the mentions that lie inside ``bbox``.

    Args:
        bbox: Sequence whose last four elements describe the enclosing box
            passed to ``isContained``.
        mentions: Iterable of objects exposing ``y0``, ``x0``, ``y1`` and
            ``x1`` attributes convertible with ``int()``.

    Returns:
        A new list with the mentions, in their original order, whose
        integer-truncated ``(y0, x0, y1, x1)`` box is reported as contained
        in the last four elements of ``bbox``.
    """
    selected = []
    for candidate in mentions:
        # Coordinates are truncated to integers before the containment test.
        y0, x0 = int(candidate.y0), int(candidate.x0)
        y1, x1 = int(candidate.y1), int(candidate.x1)
        if not isContained((y0, x0, y1, x1), bbox[-4:]):
            continue
        selected.append(candidate)
    return selected