def to_tables7(self):
    """Group text lines into rows and slot each row's texts into columns.

    Reads ``self.texts`` (entries that appear in ``self.multi_col_boxes``
    are skipped) and ``self.finalcols2``, and stores the result on
    ``self.lines``.

    Each row is a dict with the keys:

    * ``'contain'`` -- an ``LTComponent`` spanning the row,
    * ``'texts'``   -- the text lines collected into the row,
    * ``'cells'``   -- one slot per entry of ``self.finalcols2``; a slot is
      ``None`` when no text was placed in it.

    Rows are ordered by the ``y0`` of their ``'contain'`` box.
    """
    rows = []
    for box in self.texts:
        if box in self.multi_col_boxes:
            continue
        for text_line in box:
            matched = False
            # Every existing row is tested (there is no early exit), so a
            # text line can be attached to more than one row.
            for row in rows:
                span = row['contain']
                ratio = span.voverlap(text_line) / span.height
                if 0.9 < ratio < 1.1:
                    row['texts'].append(text_line)
                    # Widen the row horizontally; its vertical extent stays
                    # that of the first line placed in it.
                    span.set_bbox((
                        min(span.x0, text_line.x0),
                        span.y0,
                        max(span.x1, text_line.x1),
                        span.y1,
                    ))
                    matched = True
            if not matched:
                rows.append({
                    'contain': LTComponent(text_line.bbox),
                    'texts': [text_line],
                })

    rows.sort(key=lambda row: row['contain'].y0)

    for row in rows:
        cells = [None] * len(self.finalcols2)
        row['cells'] = cells
        for text in row['texts']:
            for idx, column in enumerate(self.finalcols2):
                if not column['contain'].hoverlap(text):
                    continue
                if cells[idx] is None:
                    cells[idx] = text
                    break
                # Slot already filled: seems like the parser library
                # sometimes duplicates text, possible bug.  Keep scanning
                # the remaining columns.
                # NOTE(review): this nesting (break only after a successful
                # placement) is reconstructed -- confirm against history.

    self.lines = rows
def __init__(self, annoObj, uri, pos, pageid):
    """Set up the record from a single annotation object and its location.

    Args:
        annoObj: Becomes the first (and only) entry of ``origObjs``.
        uri: Stored unchanged as ``gotoLoc``.
        pos: Passed to ``LTComponent`` to build the first position's box
            (presumably a bounding-box tuple -- confirm with callers).
        pageid: Stored next to that box as ``[LTComponent, pageid]``.

    All remaining attributes start out empty (``""``, ``0`` or ``None``).
    """
    self.origObjs = [annoObj]
    self.gotoLoc = uri
    self.assocText = [""]
    self.assocTextIn = 0

    # Each position is a two-item list: [bounding component, page id].
    initial_position = [LTComponent(pos), pageid]
    self.positions = [initial_position]

    self.destPage = None
    self.unparseCite = self.finalCiteStr = ""
    self.papObj = None
    self.papLink = self.author = self.year = ""
def to_tables3(self):
    """Cluster ``self.columns`` into groups and store them on ``self.colgroups``.

    A column joins the first existing group in which it horizontally
    overlaps (``is_hoverlap``) every column already in that group;
    otherwise it starts a new group.  Each group is a dict with
    ``'contain'`` (an ``LTComponent`` built from the founding column's bbox;
    it is not enlarged when further columns join) and ``'cols'`` (the member
    columns).  The number of groups is logged at INFO level.
    """
    groups = []
    for column in self.columns:
        bounds = column['contain']
        for group in groups:
            if column is group:
                continue
            if all(bounds.is_hoverlap(member['contain'])
                   for member in group['cols']):
                group['cols'].append(column)
                break
        else:
            # No compatible group was found: this column founds a new one.
            groups.append({
                'contain': LTComponent(bounds.bbox),
                'cols': [column],
            })

    logger.info('{x} colgroups'.format(x=len(groups)))
    self.colgroups = groups
def given_plane_with_one_object(object_size=50, gridsize=50):
    """Build a ``Plane`` over (0, 0, 100, 100) holding one square component.

    Args:
        object_size: Side length of the component, which is anchored at the
            origin: its bbox is ``(0, 0, object_size, object_size)``.
        gridsize: Second argument passed to the ``Plane`` constructor.

    Returns:
        A ``(plane, component)`` tuple; the component has already been
        added to the plane.
    """
    plane = Plane((0, 0, 100, 100), gridsize)
    component = LTComponent((0, 0, object_size, object_size))
    plane.add(component)
    return plane, component
def to_tables2(self):
    """Assign every horizontal text box in ``self.layout`` to a column.

    The result is stored on ``self.columns`` as a list of dicts with the
    keys ``'contain'`` (an ``LTComponent`` bounding the column), ``'boxes'``
    (the text boxes assigned to it) and ``'color'`` (a random PIL colour
    name; presumably for debug rendering -- nothing in this method reads
    it).

    A box is matched against the existing columns in list order and joins
    the first one where either

    * it lies strictly inside the column horizontally and is at least 80%
      of the column's width (the column then only grows vertically), or
    * its horizontal overlap with the column is between 90% and 110% of the
      column's width (the column then grows in both directions).

    A box that matches no column starts a new one, after which the column
    list is re-sorted by width, narrowest first.

    Raises:
        ZeroDivisionError: if an existing column's box has zero width.
    """
    columns = []
    for e in self.layout:
        if not isinstance(e, LTTextBoxHorizontal):
            continue
        logger.info('Finding a column for box {i}'.format(i=e.index))

        col = None
        for c in columns:
            bounds = c['contain']
            if (e.x1 < bounds.x1) and (e.x0 > bounds.x0):
                if (e.width / bounds.width) < 0.8:
                    logger.info(
                        'Item too small, column may be several columns wide'
                    )
                    continue
                logger.info('Item totally contained in column')
                logger.info(
                    '{ex1} < {cx1} and {ex0} > {cx0}'.format(
                        ex1=e.x1, cx1=bounds.x1, ex0=e.x0, cx0=bounds.x0))
                # Box is inside the column: extend vertically only.
                grown = (bounds.x0, min(bounds.y0, e.y0),
                         bounds.x1, max(bounds.y1, e.y1))
            else:
                overlap = bounds.hoverlap(e)
                if not 0.9 < overlap / bounds.width < 1.1:
                    continue
                logger.info('Item is within 10% of current col width')
                logger.info(
                    'Overlap of {hdist}, column width of {width}'.format(
                        hdist=overlap, width=bounds.width))
                # Box roughly matches the column: take the union of both.
                grown = (min(bounds.x0, e.x0), min(bounds.y0, e.y0),
                         max(bounds.x1, e.x1), max(bounds.y1, e.y1))

            col = c
            col['boxes'].append(e)
            bounds.set_bbox(grown)
            break

        if col is None:
            logger.info('Creating new column')
            col = {
                'contain': LTComponent(e.bbox),
                # Store the box itself, not its child lines, so 'boxes'
                # holds the same kind of object that matching boxes append.
                'boxes': [e],
                'color': random.choice(list(PIL.ImageColor.colormap.keys())),
            }
            columns.append(col)
            # Keep narrow columns first so they are tried before wide ones.
            # NOTE(review): re-sorting only when a column is created is
            # reconstructed from flattened source -- confirm.
            columns.sort(key=lambda x: x['contain'].width)

    self.columns = columns
def addPosition(self, pos, pageid):
    """Record one more location for this object.

    Wraps ``pos`` in an ``LTComponent`` and appends the two-item list
    ``[component, pageid]`` to ``self.positions``.
    """
    entry = [LTComponent(pos), pageid]
    self.positions.append(entry)
def _create_cells(network):
    """
    Creates cells from the network and returns them as LTComponents.

    ``network`` must be iterable over points exposing ``x`` and ``y``, and
    ``network.links[point]`` must be the set of points linked to ``point``.

    For every point (visited in ``(x, y)`` order) and every pair of links
    leaving it at a right angle, the fourth corner is looked up among the
    points linked to both neighbours.  Each corner records which of its four
    surrounding quadrants is already occupied, so the same cell is never
    emitted twice.

    Returns:
        A set of ``LTComponent`` built from the starting point and the
        corner opposite to it.

    Raises:
        ValueError: if ``point`` is not linked back from both neighbours.
        ZeroDivisionError: if a corner shares an x or y coordinate with the
            centre of its candidate square.
    """
    squares_taken = defaultdict(set)
    cells = set()

    def city_distance(point, point_prime):
        return abs(point.x - point_prime.x) + abs(point.y - point_prime.y)

    def is_perpendicular(v1_x, v1_y, v2_x, v2_y):
        return v1_x*v2_x + v1_y*v2_y == 0

    for point in sorted(network, key=lambda p: (p.x, p.y)):
        for l1 in sorted(network.links[point],
                         key=lambda p: city_distance(p, point)):
            valid_links = [
                link for link in network.links[point]
                if link != l1 and
                is_perpendicular(link.x - point.x, link.y - point.y,
                                 l1.x - point.x, l1.y - point.y)]

            for l2 in sorted(valid_links,
                             key=lambda p: city_distance(p, point)):
                inter = network.links[l2].intersection(network.links[l1])
                intersection = list(inter)

                # remove initial point
                intersection.remove(point)
                if not intersection:
                    continue

                # sort by areas: smallest area first
                intersection.sort(
                    key=lambda p: (p.x - point.x)*(p.y - point.y))
                opposite = intersection[0]

                # square is formed by [point, l1, l2, opposite], in this
                # order.
                points = [point, l1, l2, opposite]

                # compute middle position of the square
                middle_x = sum(p.x for p in points)/4.
                middle_y = sum(p.y for p in points)/4.

                # check if any point already has one of its squares
                # (at most 4) used.
                is_taken = False
                # A real list: on Python 3 ``range(4)`` is immutable and
                # cannot be used as a buffer for item assignment.
                square = [None] * 4
                for i in range(4):
                    # compute the position of the point in relation to the
                    # middle corresponding to one of the following squares
                    # position: [(1,1), (-1,1), (1,-1), (-1,-1)]
                    vx = middle_x - points[i].x
                    vy = middle_y - points[i].y
                    square[i] = (int(vx/abs(vx)), int(vy/abs(vy)))

                    belongs = square[i] in squares_taken[points[i]]
                    is_taken = is_taken or belongs

                if not is_taken:
                    cells.add(LTComponent((point.x, point.y,
                                           opposite.x, opposite.y)))
                    for i in range(4):
                        squares_taken[points[i]].add(square[i])
                    # A cell was created for this l1: stop trying further
                    # l2 candidates.
                    # NOTE(review): break placement reconstructed from
                    # flattened source -- confirm.
                    break

    return cells