Example #1
0
    def analyze(self, laparams):
        """Group this container's characters into lines, boxes and a layout.

        The LTChar objects found in ``self._objs`` are grouped into text
        lines, the non-empty lines into text boxes, and the boxes into a
        tree returned by ``group_textboxes``.  Every LTTextBox reachable
        from that tree is numbered in traversal order, and ``self._objs``
        is replaced by the boxes (sorted by that number) followed by the
        non-character objects and the empty lines.

        Returns ``self``, or ``None`` when there are no LTChar objects, in
        which case the method stops before any grouping is attempted.
        """
        # Separate the individual characters on the page from everything else.
        chars, non_chars = fsplit(lambda item: isinstance(item, LTChar),
                                  self._objs)
        if not chars:
            return None

        lines = list(self.get_textlines(laparams, chars))
        # Sanity check: the lines hold at least as many objects as we fed in.
        assert len(chars) <= sum(len(ln._objs) for ln in lines)

        blank_lines, lines = fsplit(lambda ln: ln.is_empty(), lines)
        boxes = list(self.get_textboxes(laparams, lines))
        # Sanity check: every non-empty line ended up in exactly one box.
        assert len(lines) == sum(len(bx._objs) for bx in boxes)

        root = self.group_textboxes(laparams, boxes)

        def number_boxes(node, next_index):
            # Depth-first walk: text boxes take the next free index, groups
            # pass the running counter through each of their children.
            if isinstance(node, LTTextBox):
                node.index = next_index
                return next_index + 1
            if isinstance(node, LTTextGroup):
                for child in node:
                    next_index = number_boxes(child, next_index)
            return next_index

        number_boxes(root, 0)

        # Sort in place so the list object handed to group_textboxes is the
        # one that gets reordered.
        boxes.sort(key=lambda bx: bx.index)
        self._objs = boxes + non_chars + blank_lines
        self.layout = root
        return self
Example #2
0
 def analyze(self, laparams):
     """Analyse child objects and group the characters into text boxes.

     Non-character objects and empty text lines are analysed recursively
     via their own ``analyze``.  The LTChar objects are grouped into text
     lines and then text boxes; the result of ``group_textboxes`` is
     stored in ``self.groups``, and each group is analysed and passed to a
     single ``IndexAssigner``.  Finally ``self._objs`` is replaced by the
     boxes sorted by ``index``, followed by the non-character objects and
     the empty lines.

     Always returns ``None``; when there are no LTChar objects only the
     non-character objects are analysed.
     """
     # Separate the individual characters on the page from everything else.
     chars, non_chars = fsplit(lambda item: isinstance(item, LTChar), self._objs)
     for other in non_chars:
         other.analyze(laparams)
     if not chars:
         return None

     lines = list(self.get_textlines(laparams, chars))
     # Sanity check: the lines hold at least as many objects as we fed in.
     assert len(chars) <= sum(len(ln._objs) for ln in lines)

     blank_lines, lines = fsplit(lambda ln: ln.is_empty(), lines)
     for blank in blank_lines:
         blank.analyze(laparams)

     boxes = list(self.get_textboxes(laparams, lines))
     # Sanity check: every non-empty line ended up in exactly one box.
     assert len(lines) == sum(len(bx._objs) for bx in boxes)

     self.groups = self.group_textboxes(laparams, boxes)
     # One assigner is shared by all groups.
     indexer = IndexAssigner()
     for grp in self.groups:
         grp.analyze(laparams)
         indexer.run(grp)

     boxes.sort(key=lambda bx: bx.index)
     self._objs = boxes + non_chars + blank_lines
Example #3
0
 def analyze(self, laparams):
     """Analyse child objects and group the characters into text boxes.

     Iterating ``self`` yields the objects to process.  Non-character
     objects and empty text lines are analysed recursively via their own
     ``analyze``.  LTChar objects are grouped into lines by
     ``group_objects`` and the non-empty lines into boxes by
     ``group_textlines``.  Only when at least one box exists are the boxes
     grouped (result stored in ``self.groups``), indexed through an
     ``IndexAssigner`` and sorted by ``index``.  ``self._objs`` is then
     replaced by boxes + non-character objects + empty lines.

     Always returns ``None``; when there are no LTChar objects only the
     non-character objects are analysed.
     """
     # Separate the individual characters on the page from everything else.
     chars, non_chars = fsplit(lambda item: isinstance(item, LTChar), self)
     for other in non_chars:
         other.analyze(laparams)
     if not chars:
         return None

     lines = list(self.group_objects(laparams, chars))
     blank_lines, lines = fsplit(lambda ln: ln.is_empty(), lines)
     for blank in blank_lines:
         blank.analyze(laparams)

     boxes = list(self.group_textlines(laparams, lines))
     if boxes:
         # Grouping and indexing are skipped entirely when there are no boxes.
         self.groups = self.group_textboxes(laparams, boxes)
         indexer = IndexAssigner()
         for grp in self.groups:
             grp.analyze(laparams)
             indexer.run(grp)
         boxes.sort(key=lambda bx: bx.index)

     self._objs = boxes + non_chars + blank_lines
Example #4
0
 def analyze(self, laparams):
     """Build the text layout for this container from its characters.

     LTChar objects in ``self._objs`` are turned into text lines, the
     non-empty lines into text boxes, and the boxes into a tree via
     ``group_textboxes``.  Each LTTextBox reachable from that tree gets an
     ``index`` in traversal order.  ``self._objs`` becomes the boxes
     sorted by index, then the non-character objects, then the empty
     lines; the tree itself is stored in ``self.layout``.

     Returns ``self``, or ``None`` if there are no LTChar objects.
     """
     # Pull the individual page characters apart from all other objects.
     glyphs, remainder = fsplit(lambda candidate: isinstance(candidate, LTChar),
                                self._objs)
     if not glyphs:
         return None

     rows = list(self.get_textlines(laparams, glyphs))
     # Sanity check: the lines hold at least as many objects as we fed in.
     assert len(glyphs) <= sum(len(row._objs) for row in rows)

     empty_rows, rows = fsplit(lambda row: row.is_empty(), rows)
     blocks = list(self.get_textboxes(laparams, rows))
     # Sanity check: every non-empty line ended up in exactly one box.
     assert len(rows) == sum(len(block._objs) for block in blocks)

     tree = self.group_textboxes(laparams, blocks)

     def index_from(node, counter):
         # Text boxes consume one index; groups thread the counter through
         # their children in iteration order; anything else is skipped.
         if isinstance(node, LTTextBox):
             node.index = counter
             return counter + 1
         if isinstance(node, LTTextGroup):
             for member in node:
                 counter = index_from(member, counter)
         return counter

     index_from(tree, 0)

     blocks.sort(key=lambda block: block.index)
     self._objs = blocks + remainder + empty_rows
     self.layout = tree
     return self