def group_lines(layout, pts_thres=4.0):
    """
    Gather line segments and curves from ``layout`` and derive candidate
    table bounding boxes from how the segments align.

    Every element visited by ``traverse_layout`` whose exact type is
    ``LTCurve`` is kept as a curve. Every ``LTLine`` whose longer side
    exceeds ``pts_thres`` is kept as a segment and handed to ``group_segs``.
    The grouped segments are then aligned into rows and columns, and the
    bound of each aligned group is reported as a potential table.

    TODO: combine line-based detection with clustering of alignments

    Returns the tuple ``(segments, curves, tables)``.
    """
    segments = []
    curves = []

    # Segments shifted in parallel are grouped together, tolerating the
    # small mismatches caused by formatting. Sorting is not used because it
    # is similar to clustering without a well-ordering of segments.
    # FastAVLTree is the C version; use AVLTree for Python compatibility.
    horizontal_by_x = FastAVLTree()
    vertical_by_y = FastAVLTree()

    def collect(elem):
        # Exact type match on purpose: subclasses of LTCurve are not curves
        # for this purpose.
        if type(elem) is LTCurve:
            curves.append(elem)
        # Only sufficiently long lines are kept and grouped.
        if not isinstance(elem, LTLine):
            return
        if max(elem.width, elem.height) > pts_thres:
            segments.append(elem)
            group_segs(elem, horizontal_by_x, vertical_by_y, pts_thres)

    # Visit the whole layout tree, applying ``collect`` to each element.
    traverse_layout(layout, collect)

    # Grouped segments, sorted into rows / columns.
    row_group = sorted_groups(vertical_by_y, group_key=lambda seg: seg.x0)
    col_group = sorted_groups(horizontal_by_x, group_key=lambda seg: seg.y0)

    # Rows / columns whose extents line up are gathered into tables.
    rows_by_x0 = FastAVLTree()
    cols_by_y0 = FastAVLTree()

    def within_threshold(first, second):
        return segment_diff(first, second) < pts_thres

    for row_bbox, _ in row_group:
        span = (row_bbox[x0], row_bbox[x1])
        align_add_to_tree(rows_by_x0, span, row_bbox, within_threshold)

    for col_bbox, _ in col_group:
        span = (col_bbox[y0], col_bbox[y1])
        align_add_to_tree(cols_by_y0, span, col_bbox, within_threshold)

    # Bounding boxes of the potential tables: row-major first, then
    # column-major.
    tables = [bound_bboxes(rows) for rows in rows_by_x0.values()]
    tables.extend(bound_bboxes(cols) for cols in cols_by_y0.values())

    # Filtering of non-tables / consolidation of duplicates is not done
    # here yet; this prototype is not referenced again in this function.
    table_proto = (["x0", "x1", "xn"], ["y0", "y1", "y2", "yn"])

    # Still to do: find non-overlapping columns and output those as tables;
    # store line locations so that we can check whether a line exists
    # between text lines.

    # For debugging:
    #     tables = row_bboxes = [b for b, _ in row_group]
    return segments, curves, tables
class BinTreeIndex(Index):
    '''
    Binary tree index intended for high cardinality fields.
    Backed by the bintrees package:
    https://pypi.python.org/pypi/bintrees/2.0.2

    Each index key maps to a set of primary keys, so a key can hold
    several values while each value appears at most once.
    '''

    def __init__(self, field, directory):
        '''
        Sets up the index, loading it from disk when a saved copy exists.

        Parameters
        ----------
        field : str
            The metadata field name that the index represents
        directory : str
            The directory location where the index file will be saved;
            it is concatenated directly with the field name, so it must
            already end with a path separator

        Returns
        -------
        An initialized BinTreeIndex object
        '''
        self.field = field
        self.directory = directory
        self.file = self.directory + self.field + '.idx'

        # nothing saved yet: start from an empty tree
        if not os.path.exists(self.file):
            self.index = FastAVLTree()
            return

        # NOTE: unpickling can execute arbitrary code, so the index
        # directory must only contain files written by this application.
        with open(self.file, "rb", buffering=0) as fd:
            self.index = pickle.load(fd)

    def add_key(self, key):
        '''
        Registers an index key (i.e. a possible metadata field value)
        with no primary keys attached to it.

        Parameters
        ----------
        key : str
            The metadata field value

        Returns
        -------
        Nothing, modifies in-place.
        '''
        # the set will hold every pk whose field equals this value;
        # assigning replaces whatever was stored under the key before
        self.index[key] = set()

    def add_pk(self, key, pk):
        '''
        Attaches a primary key to an index key (i.e. metadata field
        value). The index key must already be present.

        Parameters
        ----------
        key : str
            The metadata field value
        pk : str
            Primary key identifier

        Returns
        -------
        Nothing, modifies in-place.
        '''
        self.index[key].add(pk)

    def remove_pk(self, key, pk):
        '''
        Detaches a primary key from an index key (i.e. metadata field
        value), dropping the index key once it has no primary keys left.

        Parameters
        ----------
        key : str
            The metadata field value
        pk : str
            Primary key identifier

        Returns
        -------
        Nothing, modifies in-place.
        '''
        pks = self.index[key]
        pks.discard(pk)

        # an index key with no primary keys is removed entirely
        if not pks:
            self.remove_key(key)

    def keys(self):
        '''
        Lists the index keys (i.e. possible metadata values).

        Parameters
        ----------
        None

        Returns
        -------
        List of index keys.
        '''
        index_keys = self.index.keys()
        return list(index_keys)

    def values(self):
        '''
        Lists the index values (i.e. the sets of primary keys associated
        with each metadata value).

        Parameters
        ----------
        None

        Returns
        -------
        List of index values.
        '''
        index_values = self.index.values()
        return list(index_values)

    def items(self):
        '''
        Lists the index items (i.e. each possible metadata value paired
        with the primary keys associated with it).

        Parameters
        ----------
        None

        Returns
        -------
        List of index items.
        '''
        index_items = self.index.items()
        return list(index_items)
class PriorityQueue(object):
    """
    Combined priority queue and set data structure.

    Acts like a priority queue whose entries are unique per key: items are
    stored in a ``FastAVLTree`` under ``key(item)``, so at most one item is
    kept for any given key and the smallest key can be removed directly.

    Important: the keys produced by ``key`` must be mutually comparable,
    because the tree orders its entries by them. This is true of Python's
    built-in objects, but custom key types have to implement the
    comparison methods themselves.
    """

    def __init__(self, items=None, key=None, maxitems=None, maxkey=None):
        """
        Create a new PriorityQueue.

        items:    optional iterable of initial items - it can be unsorted
                  and non-unique; each one is inserted with add().
        key:      callable mapping an item to its ordering key; defaults
                  to the item itself.
        maxitems: when truthy, the largest key is dropped whenever an add
                  leaves more than this many entries.
        maxkey:   when truthy, items whose key is greater than this are
                  ignored by add().
        """
        if key is None:
            self.key = lambda x: x
        else:
            self.key = key
        self.tree = FastAVLTree()
        self.maxitems = maxitems
        self.maxkey = maxkey
        if items is not None:
            for x in items:
                self.add(x)

    def has_item(self, item):
        """
        Check if an entry is stored under *item*'s key.

        This is a key containment test, so a stored item that happens to
        be falsy (0, "", an empty container) is still reported as present.
        """
        return self.key(item) in self.tree

    def pop_smallest(self):
        """
        Remove the entry with the smallest key and return the result of
        the tree's ``pop_min()``.
        """
        return self.tree.pop_min()

    def peek(self, d=None):
        """
        Return the item with the smallest key without removing it, or *d*
        when the tree cannot supply one (i.e. it is empty).
        """
        try:
            return self.tree.min_item()[1]
        except (KeyError, ValueError):
            # bintrees signals an empty tree with ValueError; KeyError is
            # accepted as well for other tree implementations.
            return d

    def __setitem__(self, key, value):
        """
        Store *value* under ``self.key(key)``, replacing any entry there.
        The maxkey / maxitems limits are not applied on this path.
        """
        self.tree[self.key(key)] = value

    def __getitem__(self, item):
        """Return the entry stored under *item*'s key."""
        return self.tree[self.key(item)]

    def update(self, item):
        """
        Update by removing and re-inserting the entries under *item*'s key.

        There is no way to find a tree node by object, hence the key-based
        slice.

        NOTE(review): both slice bounds are the same key; if the tree's
        slices are half-open this selects nothing and the method is a
        no-op - confirm the intended range before relying on it.
        """
        itemsbykey = self.tree[self.key(item):self.key(item)]
        del self.tree[self.key(item):self.key(item)]
        for x in itemsbykey:
            self.add(x)

    def add(self, item):
        """
        Add *item* to the queue.

        The item is stored only if nothing is stored under its key yet.
        Items whose key exceeds a truthy ``maxkey`` are ignored, and when a
        truthy ``maxitems`` is exceeded the entry with the largest key is
        dropped. A limit of None or 0 therefore disables that limit.
        """
        item_key = self.key(item)
        if self.maxkey and item_key > self.maxkey:
            return
        if self.tree.get(item_key, None) is None:
            self.tree[item_key] = item
        # Over capacity: drop the biggest key. Being able to do that is
        # the reason an ordered tree is needed here.
        if self.maxitems and len(self.tree) > self.maxitems:
            self.tree.pop_max()

    def prettyprint(self):
        """
        Return the concatenation of ``prettyprint()`` of every stored item,
        in the order the tree yields its values.
        """
        pp = operator.methodcaller('prettyprint')
        return "".join(map(pp, self.tree.values()))