Esempio n. 1
0
    def _parse(self):
        """Parse ``self.rawContent`` and resolve the complete document tree.

        Attributes set on ``self``:
          * ``context``    - the PDFParseContext used to parse the content
          * ``pdfTree``    - the parser's ``compilation`` object
          * ``pageCount``  - length of the compilation's ``pageList``
          * ``_pageForms`` - list of ``None`` placeholders, one per page
        """
        # PDFParseContext is the parser for the PDF 'language' itself.
        parser = PDFParseContext(self.rawContent)
        parser.parse()
        self.context = parser

        # Once the dictionaries are parsed they still need cross referencing
        # (working out what the pieces mean).  PageCatcher's model would let
        # us resolve only the parts we need, but here the whole lot is done.
        # The 'compilation' is the parsed/resolved/reconstructed document,
        # which is why it is stored as pdfTree.
        tree = parser.compilation
        self.pdfTree = tree

        # The catalog is a pair; only its second element is used below.
        _catalog_index, catalog_info = parser.catalog
        if parser.encrypt:
            # NOTE(review): the result is discarded, so this is presumably
            # called for its side effects only - confirm against its definition.
            getPdfEncrypt()

        # An explicit list of keys to save is passed because a bare
        # sanitizePages() call drops info we need.
        kept_page_keys = (
            "Type", "Contents",
            "MediaBox", "ArtBox", "BleedBox", "CropBox", "TrimBox",
            "Resources", "Rotate",
        )
        tree.sanitizePages(save=kept_page_keys)
        tree.findAllReferences()
        tree.getReference(catalog_info)  # return value intentionally unused
        tree.doTranslations(catalog_info)  # this is the step that takes the time
        tree.populatePageList()

        page_total = len(tree.pageList)
        self.pageCount = page_total

        # Lazy array of parsed page objects; every slot starts out as None.
        self._pageForms = [None] * page_total
Esempio n. 2
0
    def _parse(self):
        """Parse ``self.rawContent`` and resolve the complete document tree.

        Attributes set on ``self``:
          * ``context``    - the PDFParseContext used to parse the content
          * ``pdfTree``    - the parser's ``compilation`` object
          * ``pageCount``  - length of the compilation's ``pageList``
          * ``_pageForms`` - list of ``None`` placeholders, one per page
        """
        # PDFParseContext is the parser for the PDF 'language' itself.
        parser = PDFParseContext(self.rawContent)
        parser.parse()
        self.context = parser

        # Once the dictionaries are parsed they still need cross referencing
        # (working out what the pieces mean).  PageCatcher's model would let
        # us resolve only the parts we need, but here the whole lot is done.
        # The 'compilation' is the parsed/resolved/reconstructed document,
        # which is why it is stored as pdfTree.
        tree = parser.compilation
        self.pdfTree = tree

        # The catalog is a pair; only its second element is used below.
        _catalog_index, catalog_info = parser.catalog
        if parser.encrypt:
            # NOTE(review): the result is discarded, so this is presumably
            # called for its side effects only - confirm against its definition.
            getPdfEncrypt()

        # An explicit list of keys to save is passed because a bare
        # sanitizePages() call drops info we need.
        kept_page_keys = (
            "Type", "Contents",
            "MediaBox", "ArtBox", "BleedBox", "CropBox", "TrimBox",
            "Resources", "Rotate",
        )
        tree.sanitizePages(save=kept_page_keys)
        tree.findAllReferences()
        tree.getReference(catalog_info)  # return value intentionally unused
        tree.doTranslations(catalog_info)  # this is the step that takes the time
        tree.populatePageList()

        page_total = len(tree.pageList)
        self.pageCount = page_total

        # Lazy array of parsed page objects; every slot starts out as None.
        self._pageForms = [None] * page_total
Esempio n. 3
0
def parsedoc(fileName):
    """
    Check that a PDF file is flagged as encrypted by PageCatcher's parser.

    The file is read as bytes, parsed with a ``PDFParseContext`` and the
    parser's ``encrypt`` attribute is asserted to be truthy.

    Args:
        fileName: path of the PDF file to read.

    Returns:
        None.  When ``rlextra.pageCatcher`` cannot be imported the check is
        skipped and the function returns without touching the file.

    Raises:
        AssertionError: if the parser does not report the document as
            encrypted.
        OSError: if the file cannot be opened or read.
    """
    try:
        from rlextra.pageCatcher.pageCatcher import PDFParseContext
    except ImportError:
        # Without PageCatcher there is nothing to check.
        return
    # Use a context manager so the file handle is closed deterministically,
    # even if read() raises.
    with open(fileName, 'rb') as pdfFile:
        pdfContent = pdfFile.read()
    p = PDFParseContext(pdfContent, prefix="PageForms")
    p.parse()
    # NOTE: a plain assert is stripped under ``python -O``; it is kept because
    # this function is an assertion-style check.
    assert p.encrypt
Esempio n. 4
0
    def _extractAnnotations(self):
        """
        Collect the annotations of every page into ``self._annotations``.

        ``self._annotations`` is reset to an empty dict and then filled with
        one entry per page, keyed by the zero-based page index.  Each value is
        the result of ``pythonize`` applied to the page's resolved "Annots"
        entry, or an empty list when the page dictionary has no "Annots" key.
        Nothing is returned.

        Example of an annotation dictionary (taken from the original notes):
        { 'F': 4,
          'FT': '/Tx',
          'Rect': [108, 577, 407, 594],
          'Subtype': '/Widget',
          'T': 'AgentName',
          'TU': 'blah again',
          'Type': '/Annot'}
        """
        self._annotations = {}

        parser = PDFParseContext(self.rawContent)
        parser.parse()
        # The catalog is a pair; only its second element is used below.
        _catalog_index, catalog_info = parser.catalog
        tree = parser.compilation

        # "Annots" must be in the saved keys, otherwise the loop below would
        # find nothing; a bare sanitizePages() call drops info we need.
        kept_page_keys = (
            "Type", "Contents",
            "MediaBox", "ArtBox", "BleedBox", "CropBox", "TrimBox",
            "Resources", "Rotate", "Annots",
        )
        tree.sanitizePages(save=kept_page_keys)
        tree.findAllReferences()
        tree.getReference(catalog_info)  # return value intentionally unused
        tree.doTranslations(catalog_info)  # this is the step that takes the time
        tree.populatePageList()

        for page_index, page_id in enumerate(tree.pageList):
            page_dict = tree.objects[page_id].dict
            if "Annots" not in page_dict:
                self._annotations[page_index] = []
                continue
            resolved_annots = tree.resolve(page_dict["Annots"])
            self._annotations[page_index] = pythonize(resolved_annots, tree)
Esempio n. 5
0
    def _extractAnnotations(self):
        """
        Collect the annotations of every page into ``self._annotations``.

        ``self._annotations`` is reset to an empty dict and then filled with
        one entry per page, keyed by the zero-based page index.  Each value is
        the result of ``pythonize`` applied to the page's resolved "Annots"
        entry, or an empty list when the page dictionary has no "Annots" key.
        Nothing is returned.

        Example of an annotation dictionary (taken from the original notes):
        { 'F': 4,
          'FT': '/Tx',
          'Rect': [108, 577, 407, 594],
          'Subtype': '/Widget',
          'T': 'AgentName',
          'TU': 'blah again',
          'Type': '/Annot'}
        """
        self._annotations = {}

        parser = PDFParseContext(self.rawContent)
        parser.parse()
        # The catalog is a pair; only its second element is used below.
        _catalog_index, catalog_info = parser.catalog
        tree = parser.compilation

        # "Annots" must be in the saved keys, otherwise the loop below would
        # find nothing; a bare sanitizePages() call drops info we need.
        kept_page_keys = (
            "Type", "Contents",
            "MediaBox", "ArtBox", "BleedBox", "CropBox", "TrimBox",
            "Resources", "Rotate", "Annots",
        )
        tree.sanitizePages(save=kept_page_keys)
        tree.findAllReferences()
        tree.getReference(catalog_info)  # return value intentionally unused
        tree.doTranslations(catalog_info)  # this is the step that takes the time
        tree.populatePageList()

        for page_index, page_id in enumerate(tree.pageList):
            page_dict = tree.objects[page_id].dict
            if "Annots" not in page_dict:
                self._annotations[page_index] = []
                continue
            resolved_annots = tree.resolve(page_dict["Annots"])
            self._annotations[page_index] = pythonize(resolved_annots, tree)