Ejemplo n.º 1
0
    def __handle_lt_figure(self, element: LTFigure):
        """
        Recover text that was wrongly detected as an LTFigure.

        Sometimes pieces of text are reported as LTFigure, e.g. in slide-sets with border lines.
        As a workaround the LTChar children of the figure are grouped into lines by their y0
        coordinate and every line is put into its own LTTextBoxHorizontal.

        @param element: the figure whose children are scanned for LTChar objects
        @return: generator of analyzed LTTextBoxHorizontal objects, one per detected line;
                 nothing is yielded when the figure holds no LTChar at all
        """
        # NOTE(review): as in the original, the line is attached to the box while it is
        # still empty; confirm whether consumers expect the box's bounding box to reflect
        # the characters that are added to the line afterwards.
        line = LTTextLineHorizontal(0)
        wrapper = LTTextBoxHorizontal()
        wrapper.add(line)

        # y0 of the first character of the line being collected; None until a character
        # has been seen, so empty figures and leading non-character children are harmless
        y_prior = None

        for letter in element:
            if not isinstance(letter, LTChar):
                continue

            if y_prior is None:
                y_prior = letter.y0
            elif abs(letter.y0 - y_prior) > 0.05:
                # new line, yield wrapper
                wrapper.analyze(self.la_params)
                yield wrapper

                wrapper = LTTextBoxHorizontal()
                line = LTTextLineHorizontal(0)
                wrapper.add(line)
                y_prior = letter.y0

            line.add(letter)

        # the last line is never followed by a line break, so flush it explicitly
        if y_prior is not None:
            wrapper.analyze(self.la_params)
            yield wrapper
 def create_container(self, text):
     """Build a horizontal text box holding a single line made from ``text``.

     Every element of ``text`` is converted with ``self.create_char`` and appended
     to the line; the filled line is then added to the box that is returned.
     """
     text_line = LTTextLineHorizontal(0)
     for symbol in text:
         glyph = self.create_char(symbol)
         text_line.add(glyph)

     # the line is attached only after it has been filled, as in the original
     container = LTTextBoxHorizontal()
     container.add(text_line)
     return container
Ejemplo n.º 3
0
    def test_bad_investor_info(self):
        """parse_investor_info([], 0, 0) must raise CASParseError about unparseable investor data."""
        from casparser.extract.pdfminer import parse_investor_info

        # NOTE(review): this box is not passed to the call under test; it is kept from the
        # original test but moved out of the ``raises`` block so that only the call that is
        # expected to fail can satisfy the expectation.
        box = LTTextBoxHorizontal()
        box.get_text()

        with pytest.raises(CASParseError) as exc_info:
            parse_investor_info([], 0, 0)
        # inspect the raised exception itself rather than the ExceptionInfo wrapper
        assert "Unable to parse investor data" in str(exc_info.value)
Ejemplo n.º 4
0
 def split_boxes_by_style(
     self, container: LTTextContainer
 ) -> Generator[LTTextContainer, LTTextContainer, None]:
     """
     Split a paragraph wherever the font size changes noticeably between consecutive lines.

     pdfminer's paragraphs are sometimes too broad and contain lines that should be split
     into header and content. The size of a line is the largest LTChar size among its
     first 10 objects; a new box is started when the sizes of two consecutive lines differ
     by more than 15 %.

     @param container: the extracted original paragraph
     @return: generator of LTTextBoxHorizontal objects holding consecutive lines of similar
              size; every yielded box carries ``container.page`` as its ``page`` attribute
     """
     line: LTTextLineHorizontal
     wrapper = LTTextBoxHorizontal()
     wrapper.page = container.page
     # size of the most recent line that contained characters
     prior = None
     for line in container:
         size = max(
             (
                 obj.size for obj in itertools.islice(line, 10)
                 if isinstance(obj, LTChar)
             ),
             default=None,
         )
         if size is None:
             # no character among the sampled objects: there is nothing to compare,
             # so the line stays in the current paragraph
             wrapper.add(line)
             continue

         if prior is not None and size != prior:
             smaller, larger = sorted((prior, size))
             # a zero size cannot be expressed as a ratio; treat it as a style change
             if smaller == 0 or larger / smaller > 1.15:
                 # break paragraph
                 yield wrapper
                 wrapper = LTTextBoxHorizontal()
                 # boxes created after a split must know their page as well
                 wrapper.page = container.page
         wrapper.add(line)
         prior = size
     yield wrapper
Ejemplo n.º 5
0
def group_textlines(self, laparams, lines):
    """Group neighbouring text lines into text boxes.

    Patched variant: lines whose text is only whitespace are never aggregated,
    and the line margin used for the final neighbour search is tightened per
    line at run time, starting from ``laparams.line_margin``.

    Yields each non-empty box once, in the order in which its lines appear
    in ``lines``.
    """
    index = Plane(self.bbox)
    index.extend(lines)
    box_of = {}
    for current in lines:
        nearby = current.find_neighbors(index, laparams.line_margin)
        if current not in nearby or not current.get_text().strip():
            continue

        # Tighten the margin to the closest neighbour: vertical gap to each
        # other line, padded by 5 % and expressed relative to this line's height.
        scaled_gaps = [
            min(abs(other.y0 - current.y1), abs(other.y1 - current.y0))
            * 1.05 / current.height
            for other in nearby
            if other is not current
        ]
        tight_margin = min([laparams.line_margin] + scaled_gaps)

        nearby = current.find_neighbors(index, tight_margin)
        if current not in nearby:
            continue

        # Collect the non-blank neighbours, absorbing any box they already belong to.
        grouped = []
        for other in nearby:
            if other.get_text().strip():
                grouped.append(other)
                if other in box_of:
                    grouped.extend(box_of.pop(other))

        if isinstance(current, LTTextLineHorizontal):
            merged = LTTextBoxHorizontal()
        else:
            merged = LTTextBoxVertical()
        for member in uniq(grouped):
            merged.add(member)
            box_of[member] = merged

    # Emit every distinct box exactly once.
    emitted = set()
    for current in lines:
        if current in box_of:
            merged = box_of[current]
            if merged not in emitted:
                emitted.add(merged)
                if not merged.is_empty():
                    yield merged
Ejemplo n.º 6
0
 def group_textlines(self, laparams: LAParams,
                     lines: List[LTTextContainer]) -> Generator:
     """Group text lines into boxes, classifying extended horizontal lines.

     For every ``LTTextLineHorizontalExtended`` line, and only when
     ``self.rsrcmgr`` is set, the line is classified with ``maybe_classify``;
     the ``after_title`` / ``after_abstract`` / ``after_ref`` flags of the
     resource manager are updated and the returned class is instantiated as
     the box. Boxes that are ``LTTitle`` or ``LTSectionHeader`` instances start
     from the line alone; all others start from the line's neighbours.

     Yields each non-empty box once, in the order in which its lines appear
     in ``lines``.
     """
     index = Plane(self.bbox)
     index.extend(lines)
     owner: Dict[LTText, LTTextBox] = {}
     for current in lines:
         if not isinstance(current, LTTextLineHorizontalExtended):
             target = LTTextBoxVertical()
         else:
             target = LTTextBoxHorizontal()
             if self.rsrcmgr:
                 kind = current.maybe_classify(self.rsrcmgr)
                 if kind == LTTitle:
                     self.rsrcmgr.after_title = True
                 elif not self.rsrcmgr.after_abstract and kind == LTSectionHeader:
                     self.rsrcmgr.after_abstract = True
                 elif (kind == LTSectionHeader
                       and 'references' in current.get_text().lower()):
                     self.rsrcmgr.after_ref = True
                 # the classified type replaces the default horizontal box
                 target = kind()

         if isinstance(target, (LTTitle, LTSectionHeader)):
             # titles and section headers are seeded with the line alone
             nearby = [current]
         else:
             nearby = current.find_neighbors_with_rsrcmgr(
                 index, laparams.line_margin, self.rsrcmgr)
             if current not in nearby:
                 continue

         # Gather the neighbours, absorbing any box they already belong to.
         grouped = []
         for other in nearby:
             grouped.append(other)
             if other in owner:
                 grouped.extend(owner.pop(other))
         for member in uniq(grouped):
             target.add(member)
             owner[member] = target

     # Emit every distinct box exactly once.
     emitted: Set[LTTextBox] = set()
     for current in lines:
         if current in owner:
             target = owner[current]
             if target not in emitted:
                 emitted.add(target)
                 if not target.is_empty():
                     yield target