コード例 #1
0
def extractembedded(outfp,
                    fp,
                    objids,
                    pagenos,
                    password='',
                    dumpall=False,
                    codec=None):
    """Write every embedded file of the PDF read from ``fp`` to disk.

    Every object listed in the document's cross-reference tables is
    inspected; for each dictionary whose ``Type`` is ``Filespec`` the stream
    referenced by ``EF/F`` is written to a file named after the entry's
    ``UF`` (or ``F``) value, resolved against the current working directory.

    ``outfp``, ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted
    but not used by this function.

    Raises:
        Exception: if the referenced object is not a ``PDFStream`` typed
            ``EmbeddedFile``, or if the target path would land outside the
            current working directory.
        OSError: if the target file already exists (it is opened with
            ``O_EXCL`` so nothing is ever overwritten).
    """
    doc = PDFDocument()
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)

    # Prefix used for the containment check.  Build it with the platform
    # separator, and strip first so a root cwd does not become "//".
    cwd = os.path.normpath(os.getcwd()).rstrip(os.sep) + os.sep
    # O_BINARY only exists on Windows; without it the descriptor would be
    # opened in text mode there.
    open_flags = (os.O_WRONLY | os.O_CREAT | os.O_EXCL
                  | getattr(os, 'O_BINARY', 0))

    for xref in doc.xrefs:
        for objid in xref.get_objids():
            obj = doc.getobj(objid)
            if not isinstance(obj, dict):
                continue
            objtype = obj.get('Type', '')
            if not (isinstance(objtype, PSLiteral)
                    and objtype.name == 'Filespec'):
                continue

            # 'UF' may be absent; fall back to 'F' instead of raising
            # KeyError on the first lookup.
            filename = obj.get('UF') or obj['F']
            fileref = obj['EF']['F']
            fileobj = doc.getobj(fileref.objid)
            if not isinstance(fileobj, PDFStream):
                raise Exception(
                    "unable to process PDF: reference for %s is not a PDFStream"
                    % filename)
            if not isinstance(
                    fileobj['Type'], PSLiteral
            ) or not fileobj['Type'].name == 'EmbeddedFile':
                raise Exception(
                    "unable to process PDF: reference for %s is not an EmbeddedFile"
                    % filename)

            print("extracting", filename)
            absfilename = os.path.normpath(os.path.abspath(filename))
            if not absfilename.startswith(cwd):
                raise Exception(
                    "filename %s is trying to escape to parent directories."
                    % filename)

            dirname = os.path.dirname(absfilename)
            if not os.path.isdir(dirname):
                os.makedirs(dirname)

            # don't overwrite anything
            fd = os.open(absfilename, open_flags)
            # The context manager closes the file even if decoding or
            # writing the stream fails.
            with os.fdopen(fd, 'wb') as f:
                f.write(fileobj.get_data())
コード例 #2
0
def dumppdf(outfp,
            fname,
            objids,
            pagenos,
            password='',
            dumpall=False,
            codec=None):
    """Dump parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: writable stream that receives the dump.
        fname: path of the PDF file to read.
        objids: object ids to dump with ``dumpxml``; may be empty.
        pagenos: zero-based page indexes to dump; may be empty.
        password: password handed to ``doc.initialize``.
        dumpall: when true, dump every object via ``dumpallobjs``.
        codec: passed through to the dump helpers; when set, page content
            streams are dumped instead of the page attributes.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    trailers are dumped.  A final newline is written unless ``codec`` is
    ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # The context manager closes the input even when parsing or dumping
    # raises, which the explicit close() at the end did not.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
コード例 #3
0
ファイル: dumppdf.py プロジェクト: Adniel/ComparePdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: writable stream that receives the dump.
        fname: path of the PDF file to read.
        objids: object ids to dump with ``dumpxml``; may be empty.
        pagenos: zero-based page indexes to dump; may be empty.
        password: password handed to ``doc.initialize``.
        dumpall: when true, dump every object via ``dumpallobjs``.
        codec: passed through to the dump helpers; when set, page content
            streams are dumped instead of the page attributes.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    trailers are dumped.  A final newline is written unless ``codec`` is
    ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # open() replaces the Python 2-only file() builtin; the with-block also
    # guarantees the handle is released when an exception is raised.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        obj = stream_value(obj)
                        dumpxml(outfp, obj, codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
コード例 #4
0
ファイル: dumppdf.py プロジェクト: frid/PythonPool
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
  """Dump parts of the PDF file ``fname`` to ``outfp``.

  Args:
    outfp: writable stream that receives the dump.
    fname: path of the PDF file to read.
    objids: object ids to dump; may be empty.
    pagenos: zero-based page indexes whose attributes are dumped.
    password: password handed to ``doc.initialize``.
    dumpall: when true, dump every object via ``dumpallobjs``.
    codec: ``'raw'`` writes a stream object's raw data, ``'binary'`` its
      decoded data; any other value is passed through to ``dumpxml``.

  When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
  trailers are dumped.  A final newline is written unless ``codec`` is
  ``'raw'`` or ``'binary'``.
  """
  doc = PDFDocument()
  # open() replaces the Python 2-only file() builtin; the with-block also
  # guarantees the handle is released when an exception is raised.
  with open(fname, 'rb') as fp:
    parser = PDFParser(doc, fp)
    doc.initialize(password)
    if objids:
      for objid in objids:
        obj = doc.getobj(objid)
        is_stream = isinstance(obj, PDFStream)
        if is_stream and codec == 'raw':
          outfp.write(obj.get_rawdata())
        elif is_stream and codec == 'binary':
          outfp.write(obj.get_data())
        else:
          dumpxml(outfp, obj, codec=codec)
    if pagenos:
      for (pageno, page) in enumerate(doc.get_pages()):
        if pageno in pagenos:
          dumpxml(outfp, page.attrs)
    if dumpall:
      dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
      dumptrailers(outfp, doc)
  if codec not in ('raw', 'binary'):
    outfp.write('\n')
  return
コード例 #5
0
ファイル: dumppdf.py プロジェクト: frid/PythonPool
def dumppdf(outfp,
            fname,
            objids,
            pagenos,
            password='',
            dumpall=False,
            codec=None):
    """Dump parts of the PDF file ``fname`` to ``outfp``.

    Args:
        outfp: writable stream that receives the dump.
        fname: path of the PDF file to read.
        objids: object ids to dump; may be empty.
        pagenos: zero-based page indexes whose attributes are dumped.
        password: password handed to ``doc.initialize``.
        dumpall: when true, dump every object via ``dumpallobjs``.
        codec: ``'raw'`` writes a stream object's raw data, ``'binary'`` its
            decoded data; any other value is passed through to ``dumpxml``.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    trailers are dumped.  A final newline is written unless ``codec`` is
    ``'raw'`` or ``'binary'``.
    """
    doc = PDFDocument()
    # open() replaces the Python 2-only file() builtin; the with-block also
    # guarantees the handle is released when an exception is raised.
    with open(fname, 'rb') as fp:
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                is_stream = isinstance(obj, PDFStream)
                if is_stream and codec == 'raw':
                    outfp.write(obj.get_rawdata())
                elif is_stream and codec == 'binary':
                    outfp.write(obj.get_data())
                else:
                    dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
コード例 #6
0
def MapFactory(map_path):
    """Build the map object matching the image stored in the PDF ``map_path``.

    The object with id ``_PDF_OBJ_INDEX_`` must be a ``PDFStream`` carrying
    ``Width``, ``Height`` and ``ColorSpace``.  Its dimensions select one of
    the ``MapA*`` classes, which is instantiated with a PPM image built from
    the stream data and with ``map_path``.

    Returns:
        An instance of the matching map class, or ``None`` when the file
        cannot be opened or parsed, the expected object is missing or
        incomplete, or the dimensions match no known class.
    """
    try:
        # open() replaces the Python 2-only file() builtin.
        map_file = open(map_path, "rb")
    except Exception:
        return None

    # From here on the file is closed on every exit path; the original never
    # closed it.
    try:
        document = PDFDocument()

        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            # Best effort: an unparsable file simply yields no map.
            return None

        obj = document.getobj(_PDF_OBJ_INDEX_)
        if not obj or not isinstance(obj, PDFStream):
            return None

        for required in ("Width", "Height", "ColorSpace"):
            if required not in obj:
                return None

        width = obj["Width"]
        height = obj["Height"]

        # A declared height of 1 marks a PDF whose data and real height have
        # to be obtained through _ProcessWeirdPDF instead of the stream.
        if height == 1:
            data, height = _ProcessWeirdPDF(document)
        else:
            data = obj.get_data()

        # Checked in the same order as the original if/elif chain.
        candidates = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for candidate in candidates:
            if width == candidate.WIDTH and height == candidate.HEIGHT:
                map_class = candidate
                break
        else:
            return None

        return map_class(_MakePPMImage(width, height, data), map_path)
    finally:
        map_file.close()
コード例 #7
0
ファイル: Map.py プロジェクト: drott/IBGETools
def MapFactory(map_path):
    """Build the map object matching the image stored in the PDF ``map_path``.

    Object 6 of the document must be a ``PDFStream`` carrying ``Width``,
    ``Height`` and ``ColorSpace``.  Its dimensions select one of the
    ``MapA*`` classes, which is instantiated with a PPM image built from the
    stream data and with ``map_path``.

    Returns:
        An instance of the matching map class, or ``None`` when the file
        cannot be opened or parsed, the expected object is missing or
        incomplete, or the dimensions match no known class.
    """
    try:
        # open() replaces the Python 2-only file() builtin.
        map_file = open(map_path, "rb")
    except Exception:
        return None

    # From here on the file is closed on every exit path; the original never
    # closed it.
    try:
        document = PDFDocument()

        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            # Best effort: an unparsable file simply yields no map.
            return None

        # The image object on all IBGE PDFs is indexed
        # at ID 6. We also probe for a few properties.
        obj = document.getobj(6)
        if not obj or not isinstance(obj, PDFStream):
            return None

        for required in ("Width", "Height", "ColorSpace"):
            if required not in obj:
                return None

        width = obj["Width"]
        height = obj["Height"]

        # Checked in the same order as the original if/elif chain.
        candidates = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for candidate in candidates:
            if width == candidate.WIDTH and height == candidate.HEIGHT:
                map_class = candidate
                break
        else:
            return None

        return map_class(_MakePPMImage(width, height, obj.get_data()),
                         map_path)
    finally:
        map_file.close()
コード例 #8
0
ファイル: dumppdf.py プロジェクト: eug48/pdfminer
def extractembedded(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write every embedded file of the PDF file ``fname`` to disk.

    Every object listed in the document's cross-reference tables is
    inspected; for each dictionary whose ``Type`` is ``Filespec`` the stream
    referenced by ``EF/F`` is written to a file named after the entry's
    ``UF`` (or ``F``) value, resolved against the current working directory.

    ``outfp``, ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted
    but not used by this function.

    Raises:
        Exception: if the referenced object is not a ``PDFStream`` typed
            ``EmbeddedFile``, or if the target path would land outside the
            current working directory.
        OSError: if the target file already exists (it is opened with
            ``O_EXCL`` so nothing is ever overwritten).
    """
    doc = PDFDocument()

    # Prefix used for the containment check.  Build it with the platform
    # separator, and strip first so a root cwd does not become "//".
    cwd = os.path.normpath(os.getcwd()).rstrip(os.sep) + os.sep
    # O_BINARY only exists on Windows; without it the descriptor would be
    # opened in text mode there.
    open_flags = (os.O_WRONLY | os.O_CREAT | os.O_EXCL
                  | getattr(os, 'O_BINARY', 0))

    # open() replaces the Python 2-only file() builtin and the with-block
    # closes the input, which the original never did.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)

        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if not isinstance(obj, dict):
                    continue
                objtype = obj.get('Type', '')
                if not (isinstance(objtype, PSLiteral)
                        and objtype.name == 'Filespec'):
                    continue

                # 'UF' may be absent; fall back to 'F' instead of raising
                # KeyError on the first lookup.
                filename = obj.get('UF') or obj['F']
                fileref = obj['EF']['F']
                fileobj = doc.getobj(fileref.objid)
                if not isinstance(fileobj, PDFStream):
                    raise Exception("unable to process PDF: reference for %s is not a PDFStream" % (filename))
                if not isinstance(fileobj['Type'], PSLiteral) or not fileobj['Type'].name == 'EmbeddedFile':
                    raise Exception("unable to process PDF: reference for %s is not an EmbeddedFile" % (filename))

                # Single pre-formatted argument: prints the same text under
                # both the print statement and the print function.
                print("extracting %s" % (filename,))
                absfilename = os.path.normpath(os.path.abspath(filename))
                if not absfilename.startswith(cwd):
                    raise Exception("filename %s is trying to escape to parent directories.." % (filename))

                dirname = os.path.dirname(absfilename)
                if not os.path.isdir(dirname):
                    os.makedirs(dirname)

                # don't overwrite anything
                fd = os.open(absfilename, open_flags)
                # The context manager closes the output even if decoding or
                # writing the stream fails.
                with os.fdopen(fd, 'wb') as f:
                    f.write(fileobj.get_data())
コード例 #9
0
class PDFMine:
    """Collect links, rich-media annotations, ``[[...]]`` comments and
    outline entries from a PDF through pdfminer objects.

    All ``print`` calls pass a single pre-formatted argument so the output is
    the same under the print statement and the print function.
    """

    def __init__(self, filename):
        """Open and parse ``filename``.

        Raises:
            RuntimeError: if the document reports that it is not extractable
                (the original ``raise ()`` could only produce an accidental
                TypeError).
        """
        self.result = {}
        self.filename = filename
        self.fp = open(filename, "rb")
        try:
            self.parser = PDFParser(self.fp)
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize()
            self.pagecount = self.pgcount()
            print("Page count %i" % self.pagecount)
            if not self.doc.is_extractable:
                print("Oops, error extracting %s" % self.filename)
                raise RuntimeError(
                    "%s does not allow extraction" % self.filename)
            print("Starting extraction of %s" % self.filename)
        except Exception:
            # Do not leak the handle when construction fails.
            self.fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self.fp.close()

    def pgcount(self):
        """Return the number of pages yielded by the document."""
        return sum(1 for _ in self.doc.get_pages())

    def save_video(self, targetdir):
        """Save the file attached to every RichMedia annotation to targetdir.

        Annotations that cannot be processed are skipped.
        """
        for page in self.doc.get_pages():
            if not page.annots:
                continue
            annots = self.doc.getobj(page.annots.objid)
            for ref in annots:
                annotobj = ref.resolve()
                try:
                    if annotobj["Subtype"].name != 'RichMedia':
                        continue
                    data = annotobj["RichMediaContent"].resolve()
                    dataobj = data["Assets"].resolve()
                    fstream = dataobj["Names"][1].resolve()
                    filename = fstream["F"]
                    fdata = fstream['EF']['F'].resolve().get_data()
                    # NOTE(review): filename comes from the PDF and is joined
                    # unchecked; a crafted name could point outside targetdir.
                    # Binary mode: the payload is stream data, not text.
                    with open(os.path.join(targetdir, filename), "wb") as f:
                        f.write(fdata)
                except Exception:
                    # Deliberate best effort: ignore malformed annotations.
                    continue

    def _rect(self, bbox):
        """Convert ``bbox`` into x/y/width/height measured from the top left.

        Uses ``self.pgbox`` for the page height, so it must be set first
        (``parse_pages`` assigns it for each page).
        """
        pgbox = self.pgbox
        pgheight = round(abs(pgbox[1] - pgbox[3]))
        left, right = min(bbox[0], bbox[2]), max(bbox[0], bbox[2])
        bottom, top = min(bbox[1], bbox[3]), max(bbox[1], bbox[3])
        return {
            "x": round(left),
            "y": pgheight - round(top),
            "width": round(right - left),
            "height": round(top - bottom),
        }

    def _find_objid_pgnum(self, obj):
        """Return the 1-based number of the page whose object equals ``obj``,
        or ``False`` when no page matches."""
        for number, page in enumerate(self.doc.get_pages(), 1):
            if self.doc.getobj(page.pageid) == obj:
                return number
        return False

    def parse_pages(self):
        """Return one list of parsed items per page, in page order."""
        result = []
        for number, page in enumerate(self.doc.get_pages(), 1):
            # _rect() reads the current page box from here.
            self.pgbox = page.mediabox
            print("==== Page %d ====" % number)
            result.append(self._parse_page(page))
        return result

    def _parse_page(self, page):
        """Return the videos, links and comments found on ``page``."""
        result = []
        # Each parser runs exactly once; the original parsed videos twice.
        result.extend(self._parse_video(page))
        result.extend(self._parse_links(page))
        result.extend(self._parse_comments(page))
        return result

    def _parse_comments(self, page):
        """Return a dict with ``rect`` and ``comment`` for every text box on
        ``page`` whose text contains ``[[``."""
        result = []
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            if not isinstance(obj, LTTextBox):
                continue
            txt = obj.get_text()
            if "[[" not in txt:
                continue
            # We've found a comment. If it's on top of a rect, return the
            # rect as the bounding box. Else return just the textbox rect.
            rect = self._rect(self._intersects(layout, obj))
            result.append({
                "rect": rect,
                "comment": txt.replace("]]", "").replace("[[", "")
            })
        return result

    def _parse_links(self, page):
        """Return a dict with ``rect``, ``type`` and ``dest`` for every Link
        annotation on ``page`` that has an ``A`` entry.

        Annotations that cannot be processed are skipped, as in
        ``_parse_video``; the original dropped all remaining links instead.
        """
        result = []
        if not page.annots:
            return result
        annots = self.doc.getobj(page.annots.objid)
        for ref in annots:
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'Link' or "A" not in annotobj:
                    continue
                linktype = "link"
                print("Found link")
                action = annotobj["A"].resolve()
                dest = ""
                if 'D' in action:
                    linktype = "bookmark"
                    # NOTE(review): action['D'] itself is never consulted;
                    # the last resolvable entry of the name list wins.
                    # Kept as in the original.
                    namesobj = self.doc.catalog["Names"].resolve()
                    destsobj = namesobj["Dests"].resolve()
                    for name in destsobj["Names"]:
                        if hasattr(name[0], "objid"):
                            dest = self._find_objid_pgnum(name[0].resolve())

                if 'URI' in action:
                    dest = action['URI']
                rect = self._rect(annotobj['Rect'])
                result.append({"rect": rect, "type": linktype, "dest": dest})
            except Exception:
                continue
        return result

    def _parse_video(self, page):
        """Return a dict with ``rect``, ``type`` and ``filename`` for every
        RichMedia annotation on ``page``; malformed ones are skipped."""
        result = []
        if not page.annots:
            return result
        annots = self.doc.getobj(page.annots.objid)
        for ref in annots:
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'RichMedia':
                    continue
                rect = self._rect(annotobj['Rect'])
                print("Found video")
                data = annotobj["RichMediaContent"].resolve()
                dataobj = data["Assets"].resolve()
                fstream = dataobj["Names"][1].resolve()
                result.append({
                    "rect": rect,
                    "type": "media",
                    "filename": fstream["F"]
                })
            except Exception:
                # Deliberate best effort: ignore malformed annotations.
                continue
        return result

    def _intersects(self, layout, obj):
        """Return the bbox of the first other layout object that encloses
        ``obj`` by the comparison below, else ``obj``'s own bbox."""
        origbbox = obj.bbox
        for otherobj in layout:
            if obj == otherobj:
                continue
            otherbbox = otherobj.bbox
            # NOTE(review): the last comparison is '>=' where strict
            # containment would need '<='; kept as in the original.
            if (origbbox[0] >= otherbbox[0]
                    and origbbox[1] >= otherbbox[1]
                    and origbbox[2] <= otherbbox[2]
                    and origbbox[3] >= otherbbox[3]):
                return otherbbox
        return origbbox

    def get_sections(self):
        """Return ``{"name", "page"}`` dicts for the document outline entries
        (the 'bookmarks' set in Adobe Acrobat).

        Any failure while reading the outline ends the scan; the entries
        collected so far are returned.
        """
        toc = []
        try:
            outlines = self.doc.get_outlines()
            for (level, title, dest, a, se) in outlines:
                if dest:
                    objid = dest[0].objid
                else:
                    objid = a.resolve()["D"][0].objid
                for number, page in enumerate(self.doc.get_pages(), 1):
                    if page.pageid == objid:
                        toc.append({
                            "name": title,
                            "page": number
                        })
        except Exception:
            # Deliberate best effort: return what was collected.
            pass
        return toc

    def test(self):
        """Print the parse results, page count and sections."""
        print("Starting test on %s" % self.filename)
        result = self.parse_pages()
        print(result)
        print("Found %d pages" % (self.pagecount))
        print(self.get_sections())
コード例 #10
0
ファイル: pdfmine.py プロジェクト: 10layer/PDF-Mine
class PDFMine:
    """Collect links, rich-media annotations, ``[[...]]`` comments and
    outline entries from a PDF through pdfminer objects.

    All ``print`` calls pass a single pre-formatted argument so the output is
    the same under the print statement and the print function.  Indentation
    uses spaces only; the original mixed tabs and spaces.
    """

    def __init__(self, filename):
        """Open and parse ``filename``.

        Raises:
            RuntimeError: if the document reports that it is not extractable
                (the original ``raise()`` could only produce an accidental
                TypeError).
        """
        self.result = {}
        self.filename = filename
        self.fp = open(filename, "rb")
        try:
            self.parser = PDFParser(self.fp)
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize()
            self.pagecount = self.pgcount()
            print("Page count %i" % self.pagecount)
            if not self.doc.is_extractable:
                print("Oops, error extracting %s" % self.filename)
                raise RuntimeError(
                    "%s does not allow extraction" % self.filename)
            print("Starting extraction of %s" % self.filename)
        except Exception:
            # Do not leak the handle when construction fails.
            self.fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self.fp.close()

    def pgcount(self):
        """Return the number of pages yielded by the document."""
        return sum(1 for _ in self.doc.get_pages())

    def save_video(self, targetdir):
        """Save the file attached to every RichMedia annotation to targetdir.

        Annotations that cannot be processed are skipped.
        """
        for page in self.doc.get_pages():
            if not page.annots:
                continue
            annots = self.doc.getobj(page.annots.objid)
            for ref in annots:
                annotobj = ref.resolve()
                try:
                    if annotobj["Subtype"].name != 'RichMedia':
                        continue
                    data = annotobj["RichMediaContent"].resolve()
                    dataobj = data["Assets"].resolve()
                    fstream = dataobj["Names"][1].resolve()
                    filename = fstream["F"]
                    fdata = fstream['EF']['F'].resolve().get_data()
                    # NOTE(review): filename comes from the PDF and is joined
                    # unchecked; a crafted name could point outside targetdir.
                    # Binary mode: the payload is stream data, not text.
                    with open(os.path.join(targetdir, filename), "wb") as f:
                        f.write(fdata)
                except Exception:
                    # Deliberate best effort: ignore malformed annotations.
                    continue

    def _rect(self, bbox):
        """Convert ``bbox`` into x/y/width/height measured from the top left.

        Uses ``self.pgbox`` for the page height, so it must be set first
        (``parse_pages`` assigns it for each page).
        """
        pgbox = self.pgbox
        pgheight = round(abs(pgbox[1] - pgbox[3]))
        left, right = min(bbox[0], bbox[2]), max(bbox[0], bbox[2])
        bottom, top = min(bbox[1], bbox[3]), max(bbox[1], bbox[3])
        return {
            "x": round(left),
            "y": pgheight - round(top),
            "width": round(right - left),
            "height": round(top - bottom),
        }

    def _find_objid_pgnum(self, obj):
        """Return the 1-based number of the page whose object equals ``obj``,
        or ``False`` when no page matches."""
        for number, page in enumerate(self.doc.get_pages(), 1):
            if self.doc.getobj(page.pageid) == obj:
                return number
        return False

    def parse_pages(self):
        """Return one list of parsed items per page, in page order."""
        result = []
        for number, page in enumerate(self.doc.get_pages(), 1):
            # _rect() reads the current page box from here.
            self.pgbox = page.mediabox
            print("==== Page %d ====" % number)
            result.append(self._parse_page(page))
        return result

    def _parse_page(self, page):
        """Return the videos, links and comments found on ``page``."""
        result = []
        # Each parser runs exactly once; the original parsed videos twice.
        result.extend(self._parse_video(page))
        result.extend(self._parse_links(page))
        result.extend(self._parse_comments(page))
        return result

    def _parse_comments(self, page):
        """Return a dict with ``rect`` and ``comment`` for every text box on
        ``page`` whose text contains ``[[``."""
        result = []
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            if not isinstance(obj, LTTextBox):
                continue
            txt = obj.get_text()
            if "[[" not in txt:
                continue
            # We've found a comment. If it's on top of a rect, return the
            # rect as the bounding box. Else return just the textbox rect.
            rect = self._rect(self._intersects(layout, obj))
            result.append({
                "rect": rect,
                "comment": txt.replace("]]", "").replace("[[", "")
            })
        return result

    def _parse_links(self, page):
        """Return a dict with ``rect``, ``type`` and ``dest`` for every Link
        annotation on ``page`` that has an ``A`` entry.

        Annotations that cannot be processed are skipped, as in
        ``_parse_video``; the original dropped all remaining links instead.
        """
        result = []
        if not page.annots:
            return result
        annots = self.doc.getobj(page.annots.objid)
        for ref in annots:
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'Link' or "A" not in annotobj:
                    continue
                linktype = "link"
                print("Found link")
                action = annotobj["A"].resolve()
                dest = ""
                if 'D' in action:
                    linktype = "bookmark"
                    # NOTE(review): action['D'] itself is never consulted;
                    # the last resolvable entry of the name list wins.
                    # Kept as in the original.
                    namesobj = self.doc.catalog["Names"].resolve()
                    destsobj = namesobj["Dests"].resolve()
                    for name in destsobj["Names"]:
                        if hasattr(name[0], "objid"):
                            dest = self._find_objid_pgnum(name[0].resolve())

                if 'URI' in action:
                    dest = action['URI']
                rect = self._rect(annotobj['Rect'])
                result.append({"rect": rect, "type": linktype, "dest": dest})
            except Exception:
                continue
        return result

    def _parse_video(self, page):
        """Return a dict with ``rect``, ``type`` and ``filename`` for every
        RichMedia annotation on ``page``; malformed ones are skipped."""
        result = []
        if not page.annots:
            return result
        annots = self.doc.getobj(page.annots.objid)
        for ref in annots:
            annotobj = ref.resolve()
            try:
                if annotobj["Subtype"].name != 'RichMedia':
                    continue
                rect = self._rect(annotobj['Rect'])
                print("Found video")
                data = annotobj["RichMediaContent"].resolve()
                dataobj = data["Assets"].resolve()
                fstream = dataobj["Names"][1].resolve()
                result.append({
                    "rect": rect,
                    "type": "media",
                    "filename": fstream["F"]
                })
            except Exception:
                # Deliberate best effort: ignore malformed annotations.
                continue
        return result

    def _intersects(self, layout, obj):
        """Return the bbox of the first other layout object that encloses
        ``obj`` by the comparison below, else ``obj``'s own bbox."""
        origbbox = obj.bbox
        for otherobj in layout:
            if obj == otherobj:
                continue
            otherbbox = otherobj.bbox
            # NOTE(review): the last comparison is '>=' where strict
            # containment would need '<='; kept as in the original.
            if (origbbox[0] >= otherbbox[0]
                    and origbbox[1] >= otherbbox[1]
                    and origbbox[2] <= otherbbox[2]
                    and origbbox[3] >= otherbbox[3]):
                return otherbbox
        return origbbox

    def get_sections(self):
        """Return ``{"name", "page"}`` dicts for the document outline entries
        (the 'bookmarks' set in Adobe Acrobat).

        Any failure while reading the outline ends the scan; the entries
        collected so far are returned.
        """
        toc = []
        try:
            outlines = self.doc.get_outlines()
            for (level, title, dest, a, se) in outlines:
                if dest:
                    objid = dest[0].objid
                else:
                    objid = a.resolve()["D"][0].objid
                for number, page in enumerate(self.doc.get_pages(), 1):
                    if page.pageid == objid:
                        toc.append({"name": title, "page": number})
        except Exception:
            # Deliberate best effort: return what was collected.
            pass
        return toc

    def test(self):
        """Print the parse results, page count and sections."""
        print("Starting test on %s" % self.filename)
        result = self.parse_pages()
        print(result)
        print("Found %d pages" % (self.pagecount))
        print(self.get_sections())
コード例 #11
0
ファイル: pdfc.py プロジェクト: kubikanber/PDFisleme
            pr = obj["P"]
        elif obj.get("Type") and obj["Type"].name == "Annot":
            pages.append(objid)

        try:
            pi = pages.index(pr.objid) + 1
        except:
            pi = -1
    print(objid, pi, obj["Subj"], obj["T"], obj["Contents"])


# Parse simple1.pdf and run extract() once on every object listed in its
# cross-reference tables.  The with-block closes the file when the walk is
# finished or fails; the original never closed it.
with open("simple1.pdf", "rb") as fp:
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(pdf_pwd)

    # Ids already handled; the check below skips any id that shows up again
    # in a later xref table.
    visited = set()
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited:
                continue
            visited.add(objid)
            try:
                obj = doc.getobj(objid)
                if obj is None:
                    continue
                extract(objid, obj)
                print("oldu.")
            except Exception as exc:
                # Report the failure on stderr with the exception filled in;
                # the original printed the stream object and the raw "%r"
                # template to stdout.
                print("not found: %r" % (exc,), file=sys.stderr)
コード例 #12
0
ファイル: __init__.py プロジェクト: Spencer-Sleep/Programs
def cargoDoc():
    fp = open(r"C:\Users\ssleep\Documents\Programming\Cargo Docker\Thursday\LCBO\601331975 PARS MANIFESTS.pdf", 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    visited = set()
    
    pars = []
    
    for xref in doc.xrefs:
        for objid in xref.get_objids():
            if objid in visited: continue
            visited.add(objid)
            obj = doc.getobj(objid)
            if obj is None: continue
            pars = extract(objid,obj)

    pdfFileObj = open(specificPath, 'rb')
    pdfReader = PdfFileReader(pdfFileObj)
    
    fields = pdfReader.getFields()
#     print(len(fields)-15)


    for i in range(len(fields)-15):
        containerNumber = ""
        weight = ""
        consignee = ""
        shipper = ""
        eta = ""
        portOfLoading = ""
        portOfDischarge = ""
        description = ""
        if i == 0:
#             prefix = str(i) + "."
            containerNumber = fields["Container Row1"].value
            weight = float(fields["Weight KGRow1"].value)
            consignee = fields["Consignee"].value
            shipper = fields["Shipper"].value
            eta = fields["ETA DATE"].value
            portOfLoading = fields["undefined"].value
            portOfDischarge = fields["Port of Discharge"].value
            description = fields["Description of goods"].value
        else:
            for j in list(fields.keys()):
                if j==str(i):
                    for k in list(fields[j]["/Kids"]):
                        try:
                            if(k.getObject()['/T']=="WO"):
                                wo=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Container Row1"):
                                containerNumber=k.getObject()['/V']
                            elif(k.getObject()['/T']=="SizeRow1"):
                                size=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Weight KGRow1"):
                                weight=float(k.getObject()['/V'])
                            elif(k.getObject()['/T']=="Consignee"):
                                consignee=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Shipper"):
                                shipper=k.getObject()['/V']
                            elif(k.getObject()['/T']=="ETA DATE"):
                                eta=k.getObject()['/V']
                            elif(k.getObject()['/T']=="undefined"):
                                portOfLoading=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Port of Discharge"):
                                portOfDischarge=k.getObject()['/V']
                            elif(k.getObject()['/T']=="Description of goods"):
                                description=k.getObject()['/V']    
                        except KeyError:
                            True