Example #1
0
    def parse_document(self):
        """
        Parse the document and prepare the internal PDFLoc-decoding structures.

        Every page of the document is enumerated; when ``self._only_pages`` is
        set, pages whose number is not in it are skipped. For each processed
        page the analyzer's ``coords_to_chars`` is stored in the navigation
        tree under the page number and the analyzer's result is added to the
        PDFLoc document.

        This function is only meant to be called once.

        Only close the document parser's source stream after calling this method.
        A source file handle held by this object is closed when this method
        finishes, no matter whether parsing succeeded or raised.

        :raises RuntimeError: If this function is called more than once.
        :raises RuntimeError: If the source file handle held by this object has
                              already been closed.
        """

        if self.is_document_parsed():
            raise RuntimeError("parse_document can only be called once.")

        # Fail early, before any parsing state is created, so that the object
        # does not report itself as parsed after reading from a dead stream.
        source = self.__source_file_handle
        if source is not None and source.closed:
            raise RuntimeError("The document's source stream has already been closed.")

        try:
            la = LAParams()
            rm = PDFResourceManager()
            dev = PDFLocPageAnalyzer(rm, laparams=la)
            interp = PDFLocInterpreter(rm, dev)
            dev.set_interpreter(interp)

            self._navigation_tree = NavigationTree()
            self._pdfloc_document = PDFLocDocument()

            for pageno, page in enumerate(PDFPage.create_pages(self._pdf_document)):

                if self._only_pages is not None and pageno not in self._only_pages:
                    continue

                interp.process_page(page)

                self._navigation_tree[pageno] = dev.coords_to_chars
                self._pdfloc_document.add(dev.get_result())

                # Parenthesized single-argument form: prints the same text under
                # the Python 2 print statement and the Python 3 print function.
                print("Page no. %i contains %i keywords" % (pageno, interp.keyword_count))
        finally:
            # if we hold the source file, close it now, because we no longer need it;
            # doing it in ``finally`` keeps the handle from leaking when parsing fails
            if source is not None and not source.closed:
                source.close()
Example #2
0
class PDFLocConverter(object):
    """
    Parse a PDF document once and answer PDFLoc <-> bounding box queries on it.

    Typical use: construct the converter, optionally restrict the pages to parse
    with restrict_only_on_pages_from(), call parse_document() exactly once, and
    then use the query methods. pdflocs_to_bboxes() wraps this whole sequence.
    """

    def __init__(self, document, pdflocs=None, bboxes=None):
        """
        Initialize the converter with the given document.

        If either pdflocs or bboxes are given, only their corresponding document
        pages will be parsed in the call to parse_document(). This is done to save
        CPU time and resources needed for parsing if only a small part of the document
        is needed.

        :param document: Either a prepared PDFDocument, open file, or a string denoting a filename.
                         If a PDFDocument is given, the underlying parser's source stream
                            needs to be open for reading/seeking until parse_document()
                            is called.
                         If an open file is given, the PDFDocument is created here internally,
                            and the file is closed as soon as parse_document() finishes.
                         If a filename is given, the PDFDocument is created here internally,
                            and the stream is closed as soon as parse_document() finishes.
        :type document: PDFDocument | file | basestring

        :param pdflocs: A list of PDFLocs of interest - only pages corresponding to them are
                            to be parsed. None (the default) means no PDFLocs.
        :type pdflocs: list | None

        :param bboxes: A list of bounding boxes of interest - only pages corresponding to
                            them are to be parsed. None (the default) means no bounding boxes.
        :type bboxes: list | None

        :raises ValueError: If document is of none of the supported types.
        """
        super(PDFLocConverter, self).__init__()

        # if document is given as a filename or an open file, we need to remember
        # the file handle to close it when the document is parsed
        self.__source_file_handle = None

        if isinstance(document, PDFDocument):
            self._pdf_document = document
        elif isinstance(document, basestring):
            handle = open(document, 'rb')
            try:
                self._pdf_document = PDFDocument(PDFParser(handle))
            except Exception:
                # we opened the file ourselves, so nobody else can close it
                # if building the document fails
                handle.close()
                raise
            self.__source_file_handle = handle
        elif isinstance(document, file):
            self.__source_file_handle = document
            self._pdf_document = PDFDocument(PDFParser(document))
        else:
            raise ValueError("Unsupported PDF document argument given: %s" % repr(document))

        self._pdfloc_document = None
        self._only_pages = None
        self._navigation_tree = None

        self.restrict_only_on_pages_from(pdflocs, bboxes)

    def restrict_only_on_pages_from(self, pdflocs=None, bboxes=None, only_pages=None):
        """
        Restrict parse_document() to only parse pages corresponding to the given PDFLocs'
        or boundingboxes' pages.

        If some restrictions were given in the constructor, this call will rewrite them.
        If the resulting set of pages is empty, the restriction is removed and all
        pages will be parsed.

        :param pdflocs: A list of PDFLocs of interest - only pages corresponding to them are
                            to be parsed. Items that are neither PDFLoc nor PDFLocPair
                            are ignored.
        :type pdflocs: list | None

        :param bboxes: A list of bounding boxes of interest - only pages corresponding to
                            them are to be parsed. Items that are not PDFLocBoundingBoxes
                            are ignored.
        :type bboxes: list | None

        :param only_pages: The basic set of pages that are always parsed.
        :type only_pages: set | None

        :raises RuntimeError: If the document has already been parsed.
        """
        if self.is_document_parsed():
            raise RuntimeError("Cannot restrict pages in an already parsed document.")

        pages = set() if only_pages is None else set(only_pages)

        for pdfloc in (() if pdflocs is None else pdflocs):
            if isinstance(pdfloc, PDFLoc):
                pages.add(pdfloc.page)
            elif isinstance(pdfloc, PDFLocPair):
                pages.update(pdfloc.pages_covered)

        for bbox in (() if bboxes is None else bboxes):
            if isinstance(bbox, PDFLocBoundingBoxes):
                pages.update(bbox.pages_covered)

        # None (not an empty set) is what parse_document() treats as "no restriction"
        self._only_pages = pages if pages else None

    def is_document_parsed(self):
        """
        Return true if the document has already been parsed.

        :return: true if the document has already been parsed.
        :rtype: bool
        """
        return self._pdfloc_document is not None

    def parse_document(self):
        """
        Parse the document and prepare the internal PDFLoc-decoding structures.

        Every page of the document is enumerated; when a page restriction is
        set, pages whose number is not in it are skipped. For each processed
        page the analyzer's ``coords_to_chars`` is stored in the navigation
        tree under the page number and the analyzer's result is added to the
        PDFLoc document.

        This function is only meant to be called once.

        Only close the document parser's source stream after calling this method.
        A source file handle held by this object is closed when this method
        finishes, no matter whether parsing succeeded or raised.

        :raises RuntimeError: If this function is called more than once.
        :raises RuntimeError: If the source file handle held by this object has
                              already been closed.
        """

        if self.is_document_parsed():
            raise RuntimeError("parse_document can only be called once.")

        # Fail early, before any parsing state is created, so that the object
        # does not report itself as parsed after reading from a dead stream.
        source = self.__source_file_handle
        if source is not None and source.closed:
            raise RuntimeError("The document's source stream has already been closed.")

        try:
            la = LAParams()
            rm = PDFResourceManager()
            dev = PDFLocPageAnalyzer(rm, laparams=la)
            interp = PDFLocInterpreter(rm, dev)
            dev.set_interpreter(interp)

            self._navigation_tree = NavigationTree()
            self._pdfloc_document = PDFLocDocument()

            for pageno, page in enumerate(PDFPage.create_pages(self._pdf_document)):

                if self._only_pages is not None and pageno not in self._only_pages:
                    continue

                interp.process_page(page)

                self._navigation_tree[pageno] = dev.coords_to_chars
                self._pdfloc_document.add(dev.get_result())

                # Parenthesized single-argument form: prints the same text under
                # the Python 2 print statement and the Python 3 print function.
                print("Page no. %i contains %i keywords" % (pageno, interp.keyword_count))
        finally:
            # if we hold the source file, close it now, because we no longer need it;
            # doing it in ``finally`` keeps the handle from leaking when parsing fails
            if source is not None and not source.closed:
                source.close()

    def pdfloc_pair_to_bboxes(self, pdfloc_pair):
        """
        Return the bounding boxes between the start and the end of a PDFLoc pair.

        parse_document() has to be called first; until then the navigation tree
        and the PDFLoc document used here are None.

        :param pdfloc_pair: The pair whose start and end are looked up.
        :type pdfloc_pair: PDFLocPair

        :return: The result of the PDFLoc document's find_bboxes_between_chars()
                    for the two layout characters found.
        """
        assert isinstance(pdfloc_pair, PDFLocPair)

        start_char = self._navigation_tree.find_layout_char(pdfloc_pair.start)
        end_char = self._navigation_tree.find_layout_char(pdfloc_pair.end)

        return self._pdfloc_document.find_bboxes_between_chars(start_char, end_char)

    def pdfloc_to_xy(self, pdfloc):
        """
        Look up the layout character of a PDFLoc and return its bounding box.

        parse_document() has to be called first; until then the navigation tree
        and the PDFLoc document used here are None.

        :param pdfloc: The PDFLoc to look up in the navigation tree.

        :return: The result of the PDFLoc document's find_bbox_for_char()
                    for the layout character found.
        """
        char = self._navigation_tree.find_layout_char(pdfloc)
        return self._pdfloc_document.find_bbox_for_char(char)

    def bboxes_to_pdfloc_pair(self, bboxes):
        """Not implemented yet; currently does nothing and returns None."""
        pass  # TODO

    def xy_to_pdfloc(self, xy):
        """Not implemented yet; currently does nothing and returns None."""
        pass  # TODO

    @staticmethod
    def pdflocs_to_bboxes(document, pdfloc_strings):
        """
        Parse the given document and return a list of bounding boxes corresponding to the
        given list of PDFLocs.

        :param document: Either a prepared PDFDocument, an open file, or a string
                            denoting a filename (passed on to the constructor).
        :type document: PDFDocument | file | basestring

        :param pdfloc_strings: A list of pairs (tuples) of strings with the PDFLocs.
                            Items that are not 2-tuples are ignored with a printed
                            warning.
        :type pdfloc_strings: list

        :return: The corresponding bounding boxes. There is a list of boundingboxes
                    corresponding to each one PDFLoc. Indices in the returned list
                    correspond to the order in which pdfloc_strings are iterated;
                    ignored items produce no entry, so later indices shift down.
        :rtype: list
        """
        pdflocs = []
        for pdfloc_string in pdfloc_strings:
            if isinstance(pdfloc_string, tuple) and len(pdfloc_string) == 2:
                start, end = pdfloc_string
                pdflocs.append(PDFLocPair(start, end))
            else:
                # Wrap in a 1-tuple: a bare tuple on the right of % is unpacked
                # as the format arguments and would raise TypeError or print
                # only its single element instead of the ignored item.
                print("Warning: ignoring pdfloc string '%s'" % (pdfloc_string,))

        converter = PDFLocConverter(document, pdflocs)
        converter.parse_document()

        return [converter.pdfloc_pair_to_bboxes(query) for query in pdflocs]