Exemple #1
0
    def display_words(
        self,
        sentences: List[Sentence],
        target: Optional[str] = None,
        pdf_file: Optional[str] = None,
    ) -> DisplayHandle:
        """Draw the bounding boxes of words on top of the PDF.

        Args:
            sentences: Sentences whose words are drawn. Each one supplies
                per-word ``page``, ``top``, ``bottom``, ``left`` and ``right``
                sequences indexed in parallel with ``words``.
            target: When given, only words equal to this value are drawn;
                when ``None`` every word is drawn.
            pdf_file: Path of the PDF to draw on. When falsy, the path is
                built from ``self.pdf_path`` and the document name of the
                first sentence, with a ``.pdf`` suffix.

        Returns:
            The result of calling ``display`` on the images produced by
            ``self.display_boxes``.
        """
        # Fall back to the PDF named after the first sentence's document.
        pdf_file = pdf_file or os.path.join(
            self.pdf_path, sentences[0].document.name + ".pdf"
        )
        word_boxes = [
            Bbox(
                sent.page[idx],
                sent.top[idx],
                sent.bottom[idx],
                sent.left[idx],
                sent.right[idx],
            )
            for sent in sentences
            for idx, token in enumerate(sent.words)
            if target is None or token == target
        ]
        rendered = self.display_boxes(pdf_file, word_boxes)
        return display(*rendered)
Exemple #2
0
 def get_bbox(self) -> Bbox:
     """Return a single bounding box enclosing all tokens of this span.

     The box uses the page of the first token, the smallest ``top`` and
     ``left`` values and the largest ``bottom`` and ``right`` values among
     the span's tokens. ``None`` is returned when the span's sentence is
     not visual.
     """
     if not self.sentence.is_visual():
         return None
     first_page = self.get_attrib_tokens("page")[0]
     top_edge = min(self.get_attrib_tokens("top"))
     bottom_edge = max(self.get_attrib_tokens("bottom"))
     left_edge = min(self.get_attrib_tokens("left"))
     right_edge = max(self.get_attrib_tokens("right"))
     return Bbox(first_page, top_edge, bottom_edge, left_edge, right_edge)
Exemple #3
0
def get_box(span: SpanMention) -> Bbox:
    """Get the bounding box of a span (deprecated).

    Deprecated in favour of ``span.get_bbox()``; a ``DeprecationWarning`` is
    emitted through ``warnings.warn`` each time this function is called.

    Args:
        span: Mention exposing per-token ``page``, ``top``, ``bottom``,
            ``left`` and ``right`` values through ``get_attrib_tokens``.

    Returns:
        A ``Bbox`` built from the smallest page, top and left values and the
        largest bottom and right values among the span's tokens.
    """
    warnings.warn(
        "get_box(span) is deprecated. Use span.get_bbox() instead.",
        DeprecationWarning,
        # Attribute the warning to the caller: that is the code that has to
        # migrate, and it is what the default warning filters key on.
        stacklevel=2,
    )
    return Bbox(
        min(span.get_attrib_tokens("page")),
        min(span.get_attrib_tokens("top")),
        max(span.get_attrib_tokens("bottom")),
        min(span.get_attrib_tokens("left")),
        max(span.get_attrib_tokens("right")),
    )
Exemple #4
0
 def get_bbox(self) -> Bbox:
     """Return a single bounding box enclosing the whole sentence.

     The box uses the first entry of ``self.page``, the smallest ``top`` and
     ``left`` values and the largest ``bottom`` and ``right`` values.
     ``None`` is returned when the sentence is not visual.
     """
     # TODO: this may have issues where a sentence is linked to words on different
     # pages
     if not self.is_visual():
         return None
     first_page = self.page[0]
     top_edge, bottom_edge = min(self.top), max(self.bottom)
     left_edge, right_edge = min(self.left), max(self.right)
     return Bbox(first_page, top_edge, bottom_edge, left_edge, right_edge)
Exemple #5
0
 def _coordinates_from_HTML(
     self, page: Tag, page_num: int
 ) -> Tuple[List[Tuple[Tuple[int, int], str]], Dict[Tuple[int, int], Bbox]]:
     """Extract the words of one page together with their bounding boxes.

     Walks the ``block`` > ``line`` > ``word`` elements of ``page``. The text
     of every word element is split with ``self.separators`` and each
     non-empty piece becomes its own entry. All pieces of a word share the
     horizontal extent (``xmin``/``xmax``) of the word element and the
     vertical extent (``ymin``/``ymax``) of the enclosing line. Coordinates
     are truncated to integers.

     Args:
         page: Parsed markup of a single page containing ``block`` elements.
         page_num: Page number stored in every word id and bounding box.

     Returns:
         A pair ``(pdf_word_list, coordinate_map)``. ``pdf_word_list`` holds
         ``(word_id, text)`` tuples where ``word_id`` is
         ``(page_num, index)`` and ``index`` counts the pieces in extraction
         order. The list is sorted by block top, block left, line top, then
         word left. ``coordinate_map`` maps each ``word_id`` to its ``Bbox``.
     """
     pdf_word_list: List[Tuple[Tuple[int, int], str]] = []
     coordinate_map: Dict[Tuple[int, int], Bbox] = {}
     # Sort key per word: (block top, block left, line top, word left).
     # It is built from the raw coordinates rather than by slicing the Bbox:
     # a [1:3] slice of (page, top, bottom, left, right) yields (top, bottom),
     # which leaves the left coordinate out of the ordering.
     sort_keys: Dict[Tuple[int, int], Tuple[int, int, int, int]] = {}
     word_index = 0  # counter for word_id in page_num
     for block in page.find_all("block"):
         x_min_block = int(float(block.get("xmin")))
         y_min_block = int(float(block.get("ymin")))
         for line in block.find_all("line"):
             y_min_line = int(float(line.get("ymin")))
             y_max_line = int(float(line.get("ymax")))
             for word in line.find_all("word"):
                 xmin = int(float(word.get("xmin")))
                 xmax = int(float(word.get("xmax")))
                 for content in self.separators.split(word.getText()):
                     if not content:  # Ignore empty pieces left by the split
                         continue
                     word_id = (page_num, word_index)
                     pdf_word_list.append((word_id, content))
                     coordinate_map[word_id] = Bbox(
                         page_num,
                         y_min_line,
                         y_max_line,
                         xmin,
                         xmax,
                     )
                     sort_keys[word_id] = (
                         y_min_block,
                         x_min_block,
                         y_min_line,
                         xmin,
                     )
                     word_index += 1
     # Sort by block top, block left, line top, then word left. The sort is
     # stable, so pieces split from the same word keep their original order.
     pdf_word_list.sort(key=lambda entry: sort_keys[entry[0]])
     return pdf_word_list, coordinate_map