def group_lines(layout, pts_thres=4.0):
    """
    Collect line segments and curves from a layout and derive candidate
    table bounding boxes from the collected lines.

    Parameters
    ----------
    layout :
        Root of the layout tree; it is handed to ``traverse_layout``.
    pts_thres : float
        Length threshold. A line is kept only when the larger of its
        width and height exceeds this value; the same value is used as the
        tolerance when grouping and aligning segments.

    Returns
    -------
    (segments, curves, tables)
        ``segments`` holds the kept ``LTLine`` objects, ``curves`` the
        elements whose exact type is ``LTCurve``, and ``tables`` the
        bounding boxes of the aligned row groups followed by those of the
        aligned column groups.

    TODO: combine line-based detection with clustering of alignments
    """
    curves = []
    segments = []

    # Parallel-shifted segments are grouped with a small tolerance for
    # formatting mismatch. Trees are used instead of plain sorting because
    # the segments have no well-defined total order.
    # FastAVLTree is the C version; use AVLTree for Python compatibility.
    h_segs_by_x = FastAVLTree()
    v_segs_by_y = FastAVLTree()

    def collect(e):
        """Visitor: record curves and sufficiently long lines."""
        # exact type match on purpose; subclasses are not counted as curves
        if type(e) is LTCurve:
            curves.append(e)

        # only lines go any further
        if not isinstance(e, LTLine):
            return
        long_enough = max(e.width, e.height) > pts_thres
        if not long_enough:
            return
        segments.append(e)
        group_segs(e, h_segs_by_x, v_segs_by_y, pts_thres)

    # Walk the whole layout tree, feeding every element to the visitor
    traverse_layout(layout, collect)

    # Grouped segments, sorted into rows and columns
    row_group = sorted_groups(v_segs_by_y, group_key=lambda seg: seg.x0)
    col_group = sorted_groups(h_segs_by_x, group_key=lambda seg: seg.y0)

    # Rows / columns that line up are merged into table candidates
    rows_by_x0 = FastAVLTree()
    cols_by_y0 = FastAVLTree()

    def seg_close(seg_a, seg_b):
        """True when two keys differ by less than the threshold."""
        return segment_diff(seg_a, seg_b) < pts_thres

    # x0 / x1 / y0 / y1 are names defined outside this function
    # (presumably bbox index constants - confirm).
    for row_bbox, _ in row_group:
        horizontal_extent = (row_bbox[x0], row_bbox[x1])
        align_add_to_tree(rows_by_x0, horizontal_extent, row_bbox, seg_close)

    for col_bbox, _ in col_group:
        vertical_extent = (col_bbox[y0], col_bbox[y1])
        align_add_to_tree(cols_by_y0, vertical_extent, col_bbox, seg_close)

    # One bounding box per aligned group
    row_major_tables = [bound_bboxes(rows) for rows in rows_by_x0.values()]
    col_major_tables = [bound_bboxes(cols) for cols in cols_by_y0.values()]

    # Not yet done here: filtering non-tables, consolidating duplicates,
    # finding non-overlapping columns, and storing line locations to check
    # whether a line exists between text lines.
    tables = row_major_tables + col_major_tables
    # placeholder, currently not used anywhere in this function
    table_proto = (["x0", "x1", "xn"], ["y0", "y1", "y2", "yn"])

    # For debugging:
    # tables = row_bboxes = [b for b,_ in row_group]
    return segments, curves, tables
# Example #2
# 0
class BinTreeIndex(Index):
    '''
    Index for high cardinality fields, backed by a binary tree.
    Built on the bintrees package: https://pypi.python.org/pypi/bintrees/2.0.2
    Each key maps to a set of values, so one key can hold multiple
    (but unique) primary keys.
    '''

    def __init__(self, field, directory):
        '''
        Sets up the index, loading it from disk when a saved copy exists.

        Parameters
        ----------
        field : str
            The metadata field name that the index represents
        directory : str
            The directory location where the index file will be saved.
            It is concatenated directly with the field name, so it
            presumably needs to end with a path separator.

        Returns
        -------
        An initialized BinTreeIndex object
        '''
        self.field = field
        self.directory = directory
        # index file path: <directory><field>.idx (plain concatenation)
        self.file = self.directory + self.field + '.idx'

        if not os.path.exists(self.file):
            # nothing saved yet: start with an empty tree
            self.index = FastAVLTree()
            return

        # NOTE(review): unpickling executes whatever the file contains;
        # this is only safe while the index file is trusted.
        with open(self.file, "rb", buffering=0) as fd:
            self.index = pickle.load(fd)

    def add_key(self, key):
        '''
        Registers an index key (i.e. possible metadata field value) with
        an empty set of primary keys. Whatever was stored under the key
        before is replaced by the new empty set.

        Parameters
        ----------
        key : str
            The metadata field value

        Returns
        -------
        Nothing, modifies in-place.
        '''
        # will collect every pk whose field matches this value
        matching_pks = set()
        self.index[key] = matching_pks

    def add_pk(self, key, pk):
        '''
        Associates a primary key with an index key (i.e. metadata field
        value). The index key is looked up, not created, by this method.

        Parameters
        ----------
        key : str
            The metadata field value
        pk : str
            Primary key identifier

        Returns
        -------
        Nothing, modifies in-place.
        '''
        matching_pks = self.index[key]
        matching_pks.add(pk)

    def remove_pk(self, key, pk):
        '''
        Dissociates a primary key from an index key (i.e. metadata field
        value) and drops the index key once it has no primary keys left.

        Parameters
        ----------
        key : str
            The metadata field value
        pk : str
            Primary key identifier

        Returns
        -------
        Nothing, modifies in-place.
        '''
        matching_pks = self.index[key]
        # discard: a pk that is not in the set is silently ignored
        matching_pks.discard(pk)

        # remove_key is not defined in this class; it is expected to come
        # from the Index base class.
        if not matching_pks:
            self.remove_key(key)

    def keys(self):
        '''
        Returns the index keys (i.e. possible metadata values).

        Parameters
        ----------
        None

        Returns
        -------
        List of index keys.
        '''
        all_keys = self.index.keys()
        return list(all_keys)

    def values(self):
        '''
        Returns the index values (i.e. primary keys associated with metadata).

        Parameters
        ----------
        None

        Returns
        -------
        List of index values.
        '''
        all_values = self.index.values()
        return list(all_values)

    def items(self):
        '''
        Returns the index items (i.e. each possible metadata value paired
        with the primary keys associated with it).

        Parameters
        ----------
        None

        Returns
        -------
        List of index items.
        '''
        all_items = self.index.items()
        return list(all_items)
class PriorityQueue(object):
    """ Combined priority queue and set data structure. Acts like
        a priority queue, except that its items are guaranteed to
        be unique.

        Provides O(log N) membership test (a lookup in the underlying
        tree), O(log N) insertion and O(log N) removal of the smallest
        item.

        Important: the items of this data structure must be both
        comparable and hashable (i.e. must implement __cmp__ and
        __hash__). This is true of Python's built-in objects, but
        you should implement those methods if you want to use
        the data structure for custom objects.
    """
    def __init__(self, items=[], key = None , maxitems=None, maxkey=None):
        """ Create a new PriorityQueueSet.

            items:
                An initial item list - it can be unsorted and
                non-unique. The data structure will be created in
                O(N).
        """
        if key == None:
            self.key=lambda  x: x
        else:
            self.key=key

        self.tree = FastAVLTree()
        #self.tree = AVLTree()

        self.maxitems = maxitems
        self.maxkey = maxkey

        for x in items:
            self.add(x)



    def has_item(self, item):
        """ Check if *item* exists in the queue
        """
        return bool(self.tree.get(self.key(item), False))

    def pop_smallest(self):
        return self.tree.pop_min()


    def peek(self, d = None):
        try:
            return self.tree.min_item()[1]
        except:
            return d

    def __setitem__(self, key, value):
        self.tree[self.key(key)]=value

    def __getitem__(self, item):
        return self.tree[self.key(item)]


    # updating by removing and reinserting
    # I can't find a node by object ??
    # I hate your data structures ... index in O(n) :(
    def update(self, item):
        """ Re-insert the entries stored under *item*'s current key.

            Takes the tree slice running from self.key(item) to
            self.key(item), deletes that same slice from the tree, then
            passes every element yielded by the slice to add().

            NOTE(review): if tree slices are half-open (start included,
            stop excluded), a slice whose start equals its stop selects
            nothing and this method does nothing - confirm against the
            bintrees version in use.
            NOTE(review): the slice is iterated only after the deletion;
            if it is a lazy view rather than a copy it is already empty
            by then, and it may yield keys rather than items - confirm.
        """
        # the key function is re-evaluated for each slice bound
        itemsbykey = self.tree[self.key(item):self.key(item)]
        del self.tree[self.key(item):self.key(item)]
        for x in itemsbykey:
            # disabled filter: if not (x is item):
            self.add(x)



    def add(self, item):
        """ Add *item* to the queue. The item will be added only
            if it doesn't already exist in the queue.
        """
        #print "PriorityQue add  " + str(item)
        if self.maxkey and self.key(item) > self.maxkey:
            return

        if self.tree.get(self.key(item), None) is None:
            self.tree[self.key(item)]=item

        # sholdnt it be pop biggest??? [yes we need a tree]
        if self.maxitems and self.tree.__len__() > self.maxitems:
            self.tree.pop_max()

        #print "PriorityQue add peek " + str(self.peek())

    def prettyprint(self):
        pp = operator.methodcaller('prettyprint')
        return "".join(map(pp,self.tree.values()))

    """