def group_textboxes(self, laparams, boxes): assert boxes def dist(obj1, obj2): """A distance function between two TextBoxes. Consider the bounding rectangle for obj1 and obj2. Return its area less the areas of obj1 and obj2, shown as 'www' below. This value may be negative. +------+..........+ (x1, y1) | obj1 |wwwwwwwwww: +------+www+------+ :wwwwwwwwww| obj2 | (x0, y0) +..........+------+ """ x0 = min(obj1.x0, obj2.x0) y0 = min(obj1.y0, obj2.y0) x1 = max(obj1.x1, obj2.x1) y1 = max(obj1.y1, obj2.y1) return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height) def isany(obj1, obj2): """Check if there's any other object between obj1 and obj2. """ x0 = min(obj1.x0, obj2.x0) y0 = min(obj1.y0, obj2.y0) x1 = max(obj1.x1, obj2.x1) y1 = max(obj1.y1, obj2.y1) objs = set(plane.find((x0, y0, x1, y1))) return objs.difference((obj1, obj2)) # XXX this still takes O(n^2) :( dists = [] for i in xrange(len(boxes)): obj1 = boxes[i] for j in xrange(i+1, len(boxes)): obj2 = boxes[j] dists.append((0, dist(obj1, obj2), obj1, obj2)) dists.sort() plane = Plane(self.bbox) plane.extend(boxes) while dists: (c, d, obj1, obj2) = dists.pop(0) if c == 0 and isany(obj1, obj2): dists.append((1, d, obj1, obj2)) continue if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): group = LTTextGroupTBRL([obj1, obj2]) else: group = LTTextGroupLRTB([obj1, obj2]) plane.remove(obj1) plane.remove(obj2) # this line is optimized -- don't change without profiling dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs] for other in plane: dists.append((0, dist(group, other), group, other)) dists.sort() plane.add(group) assert len(plane) == 1 return list(plane)
def group_textlines(self, laparams, lines): plane = Plane(self.bbox) plane.extend(lines) boxes = {} for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) if line not in neighbors: continue members = [] for obj1 in neighbors: members.append(obj1) if obj1 in boxes: members.extend(boxes.pop(obj1)) if isinstance(line, LTTextLineHorizontal): box = LTTextBoxHorizontal() else: box = LTTextBoxVertical() for obj in uniq(members): box.add(obj) boxes[obj] = box done = set() for line in lines: if line not in boxes: continue box = boxes[line] if box in done: continue done.add(box) if not box.is_empty(): yield box return
def group_textlines(self, laparams, lines): plane = Plane(self.bbox) plane.extend(lines) boxes = {} for line in lines: neighbors = line.find_neighbors(plane, laparams.line_margin) if line not in neighbors: continue members = [] for obj1 in neighbors: members.append(obj1) if obj1 in boxes: members.extend(boxes.pop(obj1)) if isinstance(line, LTTextLineHorizontal): box = LTTextBoxHorizontal() else: box = LTTextBoxVertical() for obj in uniq(members): box.add(obj) boxes[obj] = box done = set() for line in lines: if line not in boxes: continue box = boxes[line] if box in done: continue done.add(box) if not box.is_empty(): yield box return