def display_words(
    self,
    sentences: List[Sentence],
    target: Optional[str] = None,
    pdf_file: Optional[str] = None,
) -> DisplayHandle:
    """Draw the bounding boxes of words on top of the PDF.

    One box is collected per word, built from the sentence's per-word
    ``page``, ``top``, ``bottom``, ``left`` and ``right`` entries. The boxes
    are handed to ``self.display_boxes`` and the images it returns are passed
    to ``display``.

    :param sentences: Sentences whose words should be drawn.
    :param target: When given, only words equal to this string are drawn;
        when ``None``, every word is drawn.
    :param pdf_file: Path of the PDF to draw on. When falsy, the path is
        ``<self.pdf_path>/<name of the first sentence's document>.pdf``.
    :return: Whatever ``display`` returns for the rendered images.
    """
    if not pdf_file:
        document_name = sentences[0].document.name
        pdf_file = os.path.join(self.pdf_path, document_name + ".pdf")

    word_boxes = []
    for sent in sentences:
        # Index-based access (rather than zip) so that a coordinate list
        # shorter than the word list still raises instead of truncating.
        word_boxes.extend(
            Bbox(
                sent.page[idx],
                sent.top[idx],
                sent.bottom[idx],
                sent.left[idx],
                sent.right[idx],
            )
            for idx, token in enumerate(sent.words)
            if target is None or token == target
        )

    rendered = self.display_boxes(pdf_file, word_boxes)
    return display(*rendered)
def get_bbox(self) -> Bbox:
    """Return the box enclosing the tokens of this span.

    The page is taken from the first token; the vertical and horizontal
    extents are the extremes over all tokens.

    :return: The enclosing ``Bbox``, or ``None`` when the underlying
        sentence is not visual.
    """
    if not self.sentence.is_visual():
        return None

    first_page = self.get_attrib_tokens("page")[0]
    upper = min(self.get_attrib_tokens("top"))
    lower = max(self.get_attrib_tokens("bottom"))
    leftmost = min(self.get_attrib_tokens("left"))
    rightmost = max(self.get_attrib_tokens("right"))
    return Bbox(first_page, upper, lower, leftmost, rightmost)
def get_box(span: SpanMention) -> Bbox:
    """Get the bounding box of a span (deprecated).

    Emits a ``DeprecationWarning`` pointing to ``span.get_bbox()`` and then
    builds the box from the span's token attributes: the smallest page, top
    and left values and the largest bottom and right values.

    :param span: The span whose tokens are enclosed.
    :return: The ``Bbox`` enclosing all tokens of ``span``.
    """
    warnings.warn(
        "get_box(span) is deprecated. Use span.get_bbox() instead.",
        DeprecationWarning,
        # Attribute the warning to the caller's line, not to this helper, so
        # the code that needs migrating is the one reported.
        stacklevel=2,
    )
    return Bbox(
        min(span.get_attrib_tokens("page")),
        min(span.get_attrib_tokens("top")),
        max(span.get_attrib_tokens("bottom")),
        min(span.get_attrib_tokens("left")),
        max(span.get_attrib_tokens("right")),
    )
def get_bbox(self) -> Bbox:
    """Return the box enclosing all words of this sentence.

    :return: A ``Bbox`` made of the first word's page and the extreme
        top/bottom/left/right values over all words, or ``None`` when the
        sentence is not visual.
    """
    if not self.is_visual():
        return None

    # TODO: this may have issues where a sentence is linked to words on
    # different pages (only the first word's page is used).
    page_of_first_word = self.page[0]
    y_top = min(self.top)
    y_bottom = max(self.bottom)
    x_left = min(self.left)
    x_right = max(self.right)
    return Bbox(page_of_first_word, y_top, y_bottom, x_left, x_right)
def _coordinates_from_HTML( self, page: Tag, page_num: int ) -> Tuple[List[Tuple[Tuple[int, int], str]], Dict[Tuple[int, int], Bbox], ]: pdf_word_list: List[Tuple[Tuple[int, int], str]] = [] coordinate_map: Dict[Tuple[int, int], Bbox] = {} block_coordinates = {} blocks = page.find_all("block") i = 0 # counter for word_id in page_num for block in blocks: x_min_block = int(float(block.get("xmin"))) y_min_block = int(float(block.get("ymin"))) lines = block.find_all("line") for line in lines: y_min_line = int(float(line.get("ymin"))) y_max_line = int(float(line.get("ymax"))) words = line.find_all("word") for word in words: xmin = int(float(word.get("xmin"))) xmax = int(float(word.get("xmax"))) for content in self.separators.split(word.getText()): if len(content) > 0: # Ignore empty characters word_id = (page_num, i) pdf_word_list.append((word_id, content)) coordinate_map[word_id] = Bbox( page_num, y_min_line, y_max_line, xmin, xmax, ) block_coordinates[word_id] = (y_min_block, x_min_block) i += 1 # sort pdf_word_list by page, block top then block left, top, then left pdf_word_list = sorted( pdf_word_list, key=lambda word_id__: block_coordinates[word_id__[0]] + coordinate_map[word_id__[0]][1:3], ) return pdf_word_list, coordinate_map