Esempio n. 1
0
def check_pdf(file):
    failed = False
    print "check_pdf:", file
    try:
        fp = open(file, 'rb')
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        print "extractable:", document.is_extractable, ",modifiable:", document.is_modifiable, ", printable:", document.is_printable
        succed_files.put(file)
        succed_path = os.path.split(file)[0]
        succed_mark_file = os.path.join(succed_path, "success_mark.txt")
        f = open(succed_mark_file, "w")
        f.close()
        print "succed mark file generated:", succed_mark_file
    except:
        traceback.print_exc()
        failed_files.put(file)
        failed = True


#         print "move fail file to dir", fail_dir, ",", file
#         shutil.move(file, fail_dir)  # 必须先关闭文件才能移动,不然报错  os.unlink(src) WindowsError: [Error 32]  http://jining2593.blog.163.com/blog/static/2770148420101024114428257/
    finally:
        parser.close()
        fp.close()
        if failed == True:
            print "move fail file to dir", fail_dir, ",", file
            shutil.move(file, fail_dir)
        print "all:", len(all_pdf_files), ",succed:", succed_files.qsize(
        ), ",failed:", failed_files.qsize()
Esempio n. 2
0
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results.

    :param pdf_doc: path of the PDF file.
    :param fn: callable invoked as ``fn(doc, *args)`` when the document
        reports itself as extractable.
    :param pdf_pwd: password handed to ``doc.initialize``.
    :returns: whatever ``fn`` returned, or ``None`` when the document is not
        extractable or an ``IOError`` occurred.
    """
    result = None
    fp = None
    parser = None
    try:
        # open the pdf file
        fp = open(pdf_doc, 'rb')
        # create a parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument()
        # connect the parser and document objects
        parser.set_document(doc)
        doc.set_parser(parser)
        # supply the password for initialization
        doc.initialize(pdf_pwd)

        if doc.is_extractable:
            # apply the function and return the result
            result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    finally:
        # Release the handles on every path, including when ``fn`` or the
        # parser raises; previously they were only closed on success.
        if parser is not None:
            parser.close()
        if fp is not None:
            fp.close()
    return result
Esempio n. 3
0
def getData(fileName):
    """Print selected fields of a PDF's trailer ``Info`` dictionary.

    :param fileName: path of the PDF file.
    :returns: ``"error"`` when parser and document cannot be linked,
        ``"Empty metadata"`` when no Info dictionary is available, the caught
        exception object when reading the trailers fails, otherwise ``None``.
    """
    doc = PDFDocument()
    fp = open(fileName, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"
        finally:
            parser.close()
    finally:
        # Runs on the "error" return as well, so the file no longer leaks.
        fp.close()

    # Pre-bound so a document without any Info entry reaches the
    # "Empty metadata" branch instead of failing on an unbound name.
    info = None
    try:
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
            metadata = info
            if metadata is None:
                return "Empty metadata"
            for key in ('Author', 'Company', 'Producer', 'Creator'):
                if key in metadata:
                    print(key + " " + metadata[key])
    except Exception as e:
        print("\t [x] Error in PDF extractor")
        return e
Esempio n. 4
0
    def getData(self):
        """Load the trailer ``Info`` dictionary of ``self.fname``.

        The dictionary is stored in ``self.metadata`` and ``self.raw``.

        :returns: ``"ok"`` on success, ``"error"`` when the document cannot
            be initialised, ``"Empty metadata"`` when no Info dictionary was
            found, or the caught exception object when reading the trailers
            fails.
        """
        doc = PDFDocument()
        fp = open(self.fname, 'rb')
        try:
            parser = PDFParser(fp)
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(self.password)
            except Exception:
                return "error"
            finally:
                parser.close()
        finally:
            # Runs on the "error" return too, so the file no longer leaks.
            fp.close()

        # Pre-bound so a document without any Info entry reaches the
        # "Empty metadata" branch instead of failing on an unbound name.
        info = None
        try:
            for xref in doc.xrefs:
                info_ref = xref.trailer.get('Info')
                if info_ref:
                    info = resolve1(info_ref)
                self.metadata = info
                self.raw = info
            if self.raw is None:
                return "Empty metadata"
            return "ok"
        except Exception as e:
            # Report before returning; the message used to sit after the
            # ``return`` and could never be printed.
            print("[x] Error in PDF extractor, Trailer Info")
            return e
Esempio n. 5
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None):
    """Write one ``repr`` line per outline entry of a PDF to ``outfp``.

    Each line is ``repr((level, title, dest, pageno))``; ``pageno`` is the
    0-based index of the target page, or ``None`` when the entry has neither
    a destination nor a resolvable ``/GoTo`` action.

    :param outfp: writable text stream.
    :param fname: path of the PDF file.
    :param password: password handed to ``doc.initialize``.
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are not used here.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            # Close on every path; a lookup failure above used to leak both.
            parser.close()
    finally:
        fp.close()
    return
Esempio n. 6
0
	def getData(self):
		"""Load the trailer ``Info`` dictionary of ``self.fname``.

		The dictionary is stored in ``self.metadata`` and ``self.raw``.

		:returns: ``"ok"`` on success, ``"error"`` when the document cannot
			be initialised, ``"Empty metadata"`` when no Info dictionary was
			found, or the caught exception object when reading the trailers
			fails.
		"""
		doc = PDFDocument()
		fp = open(self.fname, 'rb')
		try:
			parser = PDFParser(fp)
			try:
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(self.password)
			except Exception:
				return "error"
			finally:
				parser.close()
		finally:
			# Runs on the "error" return too, so the file no longer leaks.
			fp.close()

		# Pre-bound so a document without any Info entry reaches the
		# "Empty metadata" branch instead of failing on an unbound name.
		info = None
		try:
			for xref in doc.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
				self.metadata = info
				self.raw = info
			if self.raw is None:
				return "Empty metadata"
			return "ok"
		except Exception as e:
			# Report before returning; the message used to sit after the
			# ``return`` and could never be printed.
			print("\t [x] Error in PDF extractor, Trailer Info")
			return e
Esempio n. 7
0
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results.

    :param pdf_doc: path of the PDF file.
    :param fn: callable invoked as ``fn(doc, *args)`` when the document
        reports itself as extractable.
    :param pdf_pwd: password handed to ``doc.initialize``.
    :returns: whatever ``fn`` returned, or ``None`` when the document is not
        extractable or an ``IOError`` occurred.
    """
    result = None
    fp = None
    parser = None
    try:
        # open the pdf file
        fp = open(pdf_doc, 'rb')
        # create a parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument()
        # connect the parser and document objects
        parser.set_document(doc)
        doc.set_parser(parser)
        # supply the password for initialization
        doc.initialize(pdf_pwd)

        if doc.is_extractable:
            # apply the function and return the result
            result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem
        pass
    finally:
        # Release the handles on every path, including when ``fn`` or the
        # parser raises; previously they were only closed on success.
        if parser is not None:
            parser.close()
        if fp is not None:
            fp.close()
    return result
Esempio n. 8
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one ``repr`` line per outline entry of a PDF to ``outfp``.

    Each line is ``repr((level, title, dest, pageno))``; ``pageno`` is the
    0-based index of the target page, or ``None`` when the entry has neither
    a destination nor a resolvable ``/GoTo`` action.

    :param outfp: writable text stream.
    :param fname: path of the PDF file.
    :param password: password handed to ``doc.initialize``.
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are not used here.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            # Close on every path; a lookup failure above used to leak both.
            parser.close()
    finally:
        fp.close()
    return
Esempio n. 9
0
def dumpoutline(
    outfp,
    fname,
    objids,
    pagenos,
    password="",
    dumpall=False,
    codec=None,
    extractdir=None,
):
    """Write the outline (bookmark) tree of a PDF to ``outfp`` as XML.

    Every entry becomes an ``<outline>`` element carrying its level and
    title, plus optional ``<dest>`` and 1-based ``<pageno>`` children.
    A ``PDFNoOutlines`` error is swallowed.

    :param outfp: writable stream receiving the XML.
    :param fname: path of the PDF file.
    :param password: password passed to ``PDFDocument``.
    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    not used here.
    """
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest):
                """Reduce a named/indirect destination to its explicit form."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest["D"]
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write("<outlines>\n")
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                                dest = resolve_dest(action["D"])
                                pageno = pages[dest[0].objid]
                    # NOTE(review): on Python 3 ``.encode`` yields bytes, so
                    # ``format`` would embed a b'...' repr -- confirm intent.
                    s = e(title).encode("utf-8", "xmlcharrefreplace")
                    outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                    if dest is not None:
                        outfp.write("<dest>")
                        dumpxml(outfp, dest)
                        outfp.write("</dest>\n")
                    if pageno is not None:
                        outfp.write("<pageno>%r</pageno>\n" % pageno)
                    outfp.write("</outline>\n")
                outfp.write("</outlines>\n")
            except PDFNoOutlines:
                pass
        finally:
            # Runs for every exception, not just PDFNoOutlines, so neither
            # the parser nor (via ``with``) the file can leak.
            parser.close()
    return
Esempio n. 10
0
def dumpoutline(outfp: TextIO,
                fname: str,
                objids: Any,
                pagenos: Container[int],
                password: str = '',
                dumpall: bool = False,
                codec: Optional[str] = None,
                extractdir: Optional[str] = None) -> None:
    """Write the outline (bookmark) tree of a PDF to ``outfp`` as XML.

    Every entry becomes an ``<outline>`` element carrying its level and
    escaped title, plus optional ``<dest>`` and 1-based ``<pageno>``
    children.  A ``PDFNoOutlines`` error is swallowed.

    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    not used here.
    """
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest: object) -> Any:
                """Reduce a named/indirect destination to its explicit form."""
                if isinstance(dest, (str, bytes)):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                    'D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = escape(title)
                    outfp.write('<outline level="{!r}" title="{}">\n'.format(level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            # Runs for every exception, not just PDFNoOutlines, so neither
            # the parser nor (via ``with``) the file can leak.
            parser.close()
    return
Esempio n. 11
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None,
                extractdir=None):
    """Write the outline (bookmark) tree of a PDF to ``outfp`` as XML.

    Every entry becomes an ``<outline>`` element carrying its level and
    title, plus optional ``<dest>`` and 1-based ``<pageno>`` children.
    A ``PDFNoOutlines`` error is swallowed.

    :param outfp: writable stream receiving the XML.
    :param fname: path of the PDF file.
    :param password: password passed to ``PDFDocument``.
    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    not used here.
    """
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1))

            def resolve_dest(dest):
                """Reduce a named/indirect destination to its explicit form."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/\'GoTo\'' and action.get(
                                    'D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    # NOTE(review): on Python 3 ``.encode`` yields bytes, so
                    # ``%s`` would embed a b'...' repr -- confirm intent.
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            # Runs for every exception, not just PDFNoOutlines, so neither
            # the parser nor (via ``with``) the file can leak.
            parser.close()
    return
Esempio n. 12
0
def extract_TOC(pdf_path):
    """Return the outline (table of contents) of a PDF as an XML string.

    The document is opened with an empty password.  Each outline entry
    becomes an ``<outline>`` element with optional ``<dest>`` and 1-based
    ``<pageno>`` children.  A ``PDFNoOutlines`` error is swallowed and
    whatever was accumulated so far is returned.

    :param pdf_path: path of the PDF file.
    :returns: the accumulated XML text.
    """
    toc = ""
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            document = PDFDocument(parser, b"")
            pages = {page.pageid: pageno for (pageno, page)
                     in enumerate(PDFPage.create_pages(document), 1)}

            def resolve_dest(dest):
                """Reduce a named/indirect destination to its explicit form."""
                if isinstance(dest, str):
                    dest = resolve1(document.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(document.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = document.get_outlines()
                toc += '<outlines>\n'
                for (level, title, dest, a, se) in tqdm(outlines, leave=False):
                    pageno = None
                    if dest:
                        # ``dest`` is deliberately rebound to its resolved form;
                        # the <dest> dump below relies on that.
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if subtype and repr(subtype) == "/'GoTo'" and action.get("D"):
                                dest = resolve_dest(action.get("D"))
                                pageno = pages[dest[0].objid]
                    # NOTE(review): on Python 3 ``.encode`` yields bytes, so
                    # ``format`` would embed a b'...' repr -- confirm intent.
                    string = escape_str(title).encode("utf-8", "xmlcharrefreplace")
                    toc += '<outline level="{!r}" title="{}">\n'.format(level, string)
                    if dest is not None:
                        toc += "<dest>"
                        toc = dumpxml(toc, dest)
                        toc += "</dest>\n"
                    if pageno is not None:
                        toc += "<pageno>{}</pageno>\n".format(pageno)
                    toc += "</outline>\n"
                toc += "</outlines>\n"
            except PDFNoOutlines:
                pass
        finally:
            # Runs for every exception, not just PDFNoOutlines, so neither
            # the parser nor (via ``with``) the file can leak.
            parser.close()
    return toc
Esempio n. 13
0
def succ_test():
    try:
        os.chdir(r'F:\allitebooks\making-games')
        fp = open('Making Games.pdf', 'rb')
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        print "extractable:", document.is_extractable, ",modifiable:", document.is_modifiable, ", printable:", document.is_printable
        outlines = document.get_outlines()
        print outlines
    except:
        traceback.print_exc()
    finally:
        parser.close()
        fp.close()
def get_module_codes(url):
    """Download a PDF and collect the module codes found in its text.

    The PDF is saved under ``temp/`` using the last 20 characters of ``url``
    as file name.  Every text box or line whose stripped text starts with a
    code of the form digit, two word characters, three digits is printed as
    a quoted, comma-separated item.  Scanning stops at the first such text
    longer than 6 characters, after printing ``],``.

    :param url: address of the PDF to download.
    :returns: the list of module codes that were printed.
    """
    temp_path = "temp/" + url[-20:]
    r = requests.get(url, stream=True)

    with open(temp_path, 'wb') as f:
        f.write(r.content)

    # Raw string: ``\d`` / ``\w`` are regex escapes, not string escapes.
    code_pattern = re.compile(r"\d\w{2}\d{3}")
    module_codes = []

    with open(temp_path, 'rb') as f:
        parser = PDFParser(f)
        try:
            document = PDFDocument(parser)

            # Create PDFResourceManager object that stores shared resources such as fonts or images
            resource_manager = PDFResourceManager()
            la_params = LAParams()

            # Extract the device to page aggregator to get LT object elements
            device = PDFPageAggregator(resource_manager, laparams=la_params)

            # Interpreter needs to be connected to resource manager for shared resources and device
            interpreter = PDFPageInterpreter(resource_manager, device)

            for page in PDFPage.create_pages(document):
                first = True

                interpreter.process_page(page)

                # The device renders the layout from interpreter
                layout = device.get_result()
                for lt_obj in layout:
                    if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        continue
                    text = lt_obj.get_text().strip()
                    if not code_pattern.match(text):
                        continue

                    if len(text) > 6:
                        print("],")
                        return module_codes

                    if not first:
                        print(", ", end='')
                    else:
                        first = False

                    print("\"" + text + "\"", end='')
                    # Record the code as well; the returned list was
                    # previously never filled.
                    module_codes.append(text)
        finally:
            # Covers the early return and any parsing error.
            parser.close()

    return module_codes
Esempio n. 15
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline (bookmark) tree of a PDF to ``outfp`` as XML.

    Every entry becomes an ``<outline>`` element carrying its level and
    title, plus optional ``<dest>`` and 0-based ``<pageno>`` children.
    A ``PDFNoOutlines`` error is swallowed.

    :param outfp: writable stream receiving the XML.
    :param fname: path of the PDF file.
    :param password: password handed to ``doc.initialize``.
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are not used here.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))

            def resolve_dest(dest):
                """Reduce a named destination to its explicit form."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            # Runs for every exception, not just PDFNoOutlines.
            parser.close()
    finally:
        fp.close()
    return
Esempio n. 16
0
def dumpoutline(outfp, fname, objids, pagenos, password="", dumpall=False, codec=None):
    """Write the outline (bookmark) tree of a PDF to ``outfp`` as XML.

    Every entry becomes an ``<outline>`` element carrying its level and
    title, plus optional ``<dest>`` and 0-based ``<pageno>`` children.
    A ``PDFNoOutlines`` error is swallowed.

    :param outfp: writable stream receiving the XML.
    :param fname: path of the PDF file.
    :param password: password handed to ``doc.initialize``.
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are not used here.
    """
    fp = open(fname, "rb")
    try:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                """Reduce a named destination to its explicit form."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest["D"]
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write("<outlines>\n")
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if subtype and repr(subtype) == "/GoTo" and action.get("D"):
                                dest = resolve_dest(action["D"])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode("utf-8", "xmlcharrefreplace")
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write("<dest>")
                        dumpxml(outfp, dest)
                        outfp.write("</dest>\n")
                    if pageno is not None:
                        outfp.write("<pageno>%r</pageno>\n" % pageno)
                    outfp.write("</outline>\n")
                outfp.write("</outlines>\n")
            except PDFNoOutlines:
                pass
        finally:
            # Runs for every exception, not just PDFNoOutlines.
            parser.close()
    finally:
        fp.close()
    return
Esempio n. 17
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one ``repr`` line per outline entry of a PDF to ``outfp``.

    Each line is ``repr((level, title, dest, pageno))``; ``pageno`` is the
    0-based index of the destination page, or ``None`` when the entry has
    no destination.

    :param outfp: writable text stream.
    :param fname: path of the PDF file.
    :param password: password handed to ``doc.initialize``.
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are not used here.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            # Close on every path; a lookup failure above used to leak both.
            parser.close()
    finally:
        fp.close()
    return
Esempio n. 18
0
 def getData(self):
     """Load the trailer ``Info`` dictionary of ``self.fname``.

     The dictionary is stored in ``self.metadata`` and ``self.raw``.

     :returns: ``"ok"`` on success, ``"error"`` on any failure (including a
         catalog without a ``Metadata`` entry, or no trailer carrying an
         ``Info`` entry, which leaves ``info`` unbound).
     """
     try:
         doc = PDFDocument()
         fp = open(self.fname, 'rb')
         try:
             parser = PDFParser(fp)
             try:
                 parser.set_document(doc)
                 doc.set_parser(parser)
                 doc.initialize(self.password)
                 # Result is unused; the lookup only makes documents
                 # without a Metadata catalog entry fail with "error".
                 resolve1(doc.catalog['Metadata'])
             finally:
                 parser.close()
         finally:
             # Closed on the failure paths too; it used to leak there.
             fp.close()
         for xref in doc.xrefs:
             info_ref = xref.trailer.get('Info')
             if info_ref:
                 info = resolve1(info_ref)
             self.metadata = info
             self.raw = info
         return "ok"
     except Exception:
         # ``Exception`` rather than a bare ``except`` so that
         # KeyboardInterrupt/SystemExit are not swallowed.
         return "error"
Esempio n. 19
0
	def getData(self):
		"""Load the trailer ``Info`` dictionary of ``self.fname``.

		The dictionary is stored in ``self.metadata`` and ``self.raw``.

		:returns: ``"ok"`` on success, ``"error"`` on any failure (including a
			catalog without a ``Metadata`` entry, or no trailer carrying an
			``Info`` entry, which leaves ``info`` unbound).
		"""
		try:
			doc = PDFDocument()
			fp = open(self.fname, 'rb')
			try:
				parser = PDFParser(fp)
				try:
					parser.set_document(doc)
					doc.set_parser(parser)
					doc.initialize(self.password)
					# Result is unused; the lookup only makes documents
					# without a Metadata catalog entry fail with "error".
					resolve1(doc.catalog['Metadata'])
				finally:
					parser.close()
			finally:
				# Closed on the failure paths too; it used to leak there.
				fp.close()
			for xref in doc.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
				self.metadata = info
				self.raw = info
			return "ok"
		except Exception:
			# ``Exception`` rather than a bare ``except`` so that
			# KeyboardInterrupt/SystemExit are not swallowed.
			return "error"
Esempio n. 20
0
    def addpdf(self):
        """
        Add a pdf to the list of pdf to be converted.

        Asks the user for a file, counts its pages, adds it to ``self.Queue``
        and refreshes the label listing the queued files.  Does nothing when
        the file dialog is dismissed without a selection.

        ( See Queue class )
        """

        self.controller.logger.info("addpdf is called.")

        filename = tkFileDialog.askopenfilename(
            initialdir="/",
            title="Select file",
            filetypes=(("pdf files", "*.pdf"), ("all files", "*.*")))
        if not filename:
            # A cancelled dialog yields an empty value; opening it would
            # raise instead of simply leaving the queue untouched.
            self.controller.logger.info("addpdf cancelled: no file selected.")
            return

        fp = open(str(filename), 'rb')
        try:
            parser = PDFParser(fp)
            try:
                document = PDFDocument(parser)
                num_pages = sum(1 for _ in PDFPage.create_pages(document))
            finally:
                parser.close()
        finally:
            # Closed even when the file is not a valid PDF.
            fp.close()

        self.Queue.add_pdf(filename, num_pages)

        pdfs = "".join(str(entry) + "\n" for entry in self.Queue.queue)

        # Create and place in two steps: place() returns None, so chaining
        # it onto the constructor would store None in self.label10.
        self.label10 = tk.Label(self,
                                text=pdfs,
                                width=70,
                                height=10,
                                borderwidth=1,
                                relief="groove",
                                font=("Verdana", 8, "bold"),
                                fg="dark slate gray")
        self.label10.place(x=260, y=170)

        self.controller.logger.info("a pdf has been added.")
Esempio n. 21
0
    def parse_data(self, path, filetype, **kwargs):
        """Extract the trailer ``Info`` metadata of a PDF into ``self.metadata``.

        The first trailer that carries an ``Info`` entry is used; indirect
        references among its values are resolved.  Problems are appended to
        ``self.errors``.

        :param path: path of the file to read; stored in ``self.filename``.
        :param filetype: must equal ``FileTypes.PDF``; anything else is
            ignored.
        :returns: ``self`` when metadata was found and ``self._parse_data()``
            ran, otherwise ``None``.
        """
        self.filename = path
        self.metadata = {}

        if not filetype == FileTypes.PDF:
            return None

        # Bound up front so every path below has a defined result,
        # including the 'Cannot parse document' one.
        out = None
        with open(self.filename, 'rb') as fp:
            parser = PDFParser(fp)
            try:
                doc = PDFDocument(parser)
                if not doc:
                    self.errors.append('Cannot parse document')
                    return out
                try:
                    for xref in doc.xrefs:
                        info_ref = xref.trailer.get('Info')
                        if not info_ref:
                            # No Info here; try the next trailer rather than
                            # failing on a None dictionary.
                            continue
                        info = resolve1(info_ref)
                        self.metadata = info
                        for k, v in info.items():
                            if isinstance(v, PDFObjRef):
                                self.metadata[k] = resolve1(v)
                        break
                    if not self.metadata:
                        self.errors.append('No metadata found')
                    else:
                        self._parse_data()
                        out = self
                except Exception as e:
                    self.logger.error(str(e))
                    self.errors.append(str(e))
                    out = None
            finally:
                # Also covers PDFDocument() itself raising.
                parser.close()
        return out
Esempio n. 22
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None):
    """Write one ``repr`` line per outline entry of a PDF to ``outfp``.

    Each line is ``repr((level, title, dest, pageno))``; ``pageno`` is the
    0-based index of the destination page, or ``None`` when the entry has
    no destination.

    :param outfp: writable text stream.
    :param fname: path of the PDF file.
    :param password: password handed to ``doc.initialize``.
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are not used here.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            # Close on every path; a lookup failure above used to leak both.
            parser.close()
    finally:
        fp.close()
    return
Esempio n. 23
0
def process_pdf(file_name, type):
    """Extract course information from a PDF.

    For ``FileType.ONLINE`` the file is downloaded first (unless a file of
    that name already exists), renamed to a random temporary name, and
    removed again once processing is finished.  Every layout element of
    every page is handed to ``process_element`` on a thread pool.

    :param file_name: local path, or URL when ``type`` is
        ``FileType.ONLINE``.
    :param type: a ``FileType`` member selecting how ``file_name`` is read.
    :returns: the ``courses`` dictionary passed to ``process_element``; an
        empty dict when the download fails with an HTTP error.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Open a PDF file
    print('reading from', file_name)

    is_online = (type == FileType.ONLINE)
    if is_online:
        url_name = file_name
        file_name = file_name.split('/')[-1]
        if not os.path.exists(file_name):
            try:
                wget.download(url_name)
                print()
            except urllib.error.HTTPError as err:
                print(err)
                return {}  # return an empty dictionary

        # since we are using parallel programming two files might end up having the same name
        # therefore we change the files into some random name as they will be deleted anyways.
        # Only downloaded files are renamed: testing the bare enum member
        # was a constant condition that also renamed the caller's local files.
        temp_name = f'{str(random.randint(1,2000))}.pdf'
        os.rename(file_name, temp_name)
        file_name = temp_name

    courses = {}  # store courses info in a dictionary
    try:
        with open(file_name, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            try:
                # Create a PDF document object that stores the document structure.
                document = PDFDocument(parser)
                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                # Create a PDF resource manager object that stores shared resources.
                rsrcmgr = PDFResourceManager()
                # Set parameters for analysis.
                laparams = LAParams()
                # Create a PDF page aggregator object.
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Process each page contained in the document.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    # receive the LTPage object for the page.
                    elements = list(device.get_result())

                    with concurrent.futures.ThreadPoolExecutor() as executor:
                        for element in elements:
                            # NOTE(review): this passes one keyword argument
                            # named ``args`` -- confirm it matches
                            # process_element's signature.
                            executor.submit(process_element, args=[element, courses])
            finally:
                parser.close()
    finally:
        # Runs on failure too, so the temporary download never lingers.
        if is_online:
            os.remove(file_name)  # remove file after processing
    return courses
Esempio n. 24
0
class PdfElementIndexer(AbstractIndexTask):
    """Celery task that extracts the text of a PDF stored on S3 and indexes it.

    The PDF belonging to a catalog link is downloaded into a temporary file,
    parsed page by page with pdfminer, grouped into sections keyed by the
    horizontal bounds of each text object, and finally written either to the
    relational database or to a Solr server.

    NOTE: per the comment in ``_remove_non_ascii`` this class targets
    Python 2.7; ``iteritems()``, ``str.decode`` and hashing of text strings
    below rely on that.
    """

    def __init__(self):
        self.log = logging.getLogger('django')
        self.limit = None
        self.catalogs = None
        # URL template: the trailing '{0}' survives this first format() call
        # and is replaced by the catalog link id in _fetch_document().
        self.aws_url = '{0}/{1}/{2}.pdf'.format(S3['endpoint'],
                                                S3['pdf_bucket'], '{0}')
        self._resource_mgr = PDFResourceManager()
        self._layout_params = LAParams()
        # Handles opened by _fetch_document() and released by _close().
        self.parser = None
        self.temp = None

    def _fetch_document(self, cl_id):
        """
        Download the PDF for a catalog link and wrap it in a PDFDocument.

        The response body is copied into ``self.temp`` and the parser is kept
        in ``self.parser`` so that ``_close()`` can release both later.

        :param cl_id: CatalogLink ID used to build the PDF URL
        :return: PDFDocument bound to ``self.parser``
        :raises requests.RequestException: when the server answers with a
            status code of 400 or above
        """
        url = self.aws_url.format(cl_id)
        self.log.debug('Retrieving PDF URL [{0}]'.format(url))

        req = requests.get(url, stream=True)
        try:
            if req.status_code >= 400:
                raise requests.RequestException(
                    'URL [{0}] return status [{1}]'.format(
                        url, req.status_code))

            req.raw.decode_content = True
            self.temp = tempfile.TemporaryFile()
            self.temp.write(req.raw.data)
            self.temp.seek(0)
        finally:
            # The body now lives in the temporary file (or the request
            # failed), so the connection is released on every path.
            req.close()

        self.log.info(
            'Successful Retrieval and temporary file creation.  Initializing PDF Extraction'
        )

        self.log.info('Initializing PDF Parser')
        self.parser = PDFParser(self.temp)

        self.log.info('Initializing PDF Document')
        doc = PDFDocument(self.parser)

        self.log.info('Linking Document and Parser')
        self.parser.set_document(doc)

        return doc

    def parse_lt_objects(self, layout, index, text=None):
        """
        Iterates over a list of LT* objects and captures the text contained within.  Images are skipped.

        Text found inside LTFigure containers is merged into the same result
        as the text found directly on the page.

        :param layout: Iterable of LT* objects retrieved from the PDFPage instance
        :param index: Current page number (used for logging only)
        :param text: Unused; retained so existing callers keep working
        :return: dict mapping (x0, x1) bounds to the list of text strings in that column
        """

        self.log.debug('Processing LT objects for page [{0}]'.format(index))
        page_text = self._collect_text(layout, {})
        self.log.debug('Page [{0}] extracted'.format(index))
        return page_text

    def _collect_text(self, layout, page_text):
        """
        Add the text of every LTTextBox/LTTextLine in :param layout to :param page_text.

        :param layout: Iterable of LT* objects
        :param page_text: dict in the format produced by _update_text_hash
        :return: the updated page_text dict
        """
        for obj in layout:
            if isinstance(obj, (LTTextBox, LTTextLine)):
                self.log.debug('[{0}] object found'.format(type(obj)))
                page_text = self._update_text_hash(page_text, obj)
            elif isinstance(obj, LTFigure):
                # LTFigure objects are containers for other LT* objects, so recurse through children
                # and keep accumulating into the same dict.
                self.log.debug(
                    'LTFigure object found, recursing to process children nodes'
                )
                page_text = self._collect_text(obj, page_text)

        return page_text

    def _update_text_hash(self, text, obj, pct=0.2):
        """
        Use the bbox x0, x1 values within :param pct to produce lists of associated text within the hash

        :param text: dict of page text in the format {(x0, x1) : [list of strings in that column]}
        :param obj: LT* text object exposing ``bbox`` and ``get_text()``
        :param pct: relative tolerance applied to each existing key when matching bounds
        :return: hash of text values mapped to bounding boxes
        """

        # pdfminer bounding boxes are (x0, y0, x1, y1): the right edge is at
        # index 2, index 1 is the bottom edge.
        x_0 = obj.bbox[0]
        x_1 = obj.bbox[2]

        def _near(value, anchor):
            return anchor * (1.0 - pct) <= value <= anchor * (1.0 + pct)

        key_found = False
        self.log.debug('Updating page text hash for bbox [({0}, {1})]'.format(
            x_0, x_1))

        for k, v in text.items():
            if _near(x_0, k[0]) and _near(x_1, k[1]):
                # text inside this LT object was positioned at the same width as a prior series of text, so it
                # belongs together
                key_found = True
                v.append(self._remove_non_ascii(obj.get_text()))
                self.log.debug('BBox [{0}, {1}] text updated'.format(
                    x_0, x_1))

        if not key_found:
            # Based on width of bounding box, this text is a new series, so it gets its own entry in the hash
            text[(x_0, x_1)] = [self._remove_non_ascii(obj.get_text())]
            self.log.debug('Created new hash key for bbox [{0}, {1}]'.format(
                x_0, x_1))

        return text

    def _parse_pages(self, document):
        """
        With an open PDFDocument object, get the pages and parse each one.

        :param document: PDFDocument object
        :return: list with one parse_lt_objects() result (dict) per page
        """
        self.log.info('Initializing Page Aggregator')
        device = PDFPageAggregator(self._resource_mgr,
                                   laparams=self._layout_params)

        self.log.info('Initializing Page Interpreter')
        interpreter = PDFPageInterpreter(self._resource_mgr, device)

        text_content = []

        for idx, page in enumerate(PDFPage.create_pages(document)):
            self.log.debug('Interpreter processing page [{0}]'.format(idx))
            interpreter.process_page(page)

            self.log.debug('Retrieved LTPage object for page')
            layout = device.get_result()

            text_content.append(self.parse_lt_objects(layout, idx))

        self.log.info(
            'Successfully completed text extraction of [{0}] pages'.format(
                len(text_content)))
        return text_content

    def _to_bytestring(self, string, encode='utf-8'):
        """
        Convert a given unicode string to a byte string, using standard encoding.

        :param string: Unicode string
        :param encode: Encoding Format
        :return: :param string unchanged when it already is a ``str``, otherwise
            the value encoded with :param encode; None when :param string is falsy
        """

        if not string:
            return None
        if isinstance(string, str):
            return string
        return string.encode(encode)

    def _remove_non_ascii(self, s):
        # Project uses Python 2.7, which comes with a host of unicode issues.  This attempts to sidestep, as we are
        # concentrating only on English-language documents.
        # Keeps printable ASCII plus tab, line feed and carriage return; any
        # failure while iterating yields an empty string instead of raising.
        try:
            return u"".join(i for i in s if ord(i) < 128 and (
                ord(i) >= 32 or ord(i) == 9 or ord(i) == 10 or ord(i) == 13))
        except Exception:
            return ""

    def _close(self):
        """Release the PDF parser and the temporary file, if they are open."""
        if self.parser:
            self.parser.close()
            self.parser = None
        if self.temp:
            self.temp.close()
            self.temp = None

    def _save_state(self,
                    cl_id,
                    pdf_validate_status,
                    index_status,
                    documents_indexed=0,
                    index_message=''):
        """
        Record the indexing outcome for a catalog link and release open handles.

        :param cl_id: CatalogLink ID whose PdfValidateCatalog row is updated
        :param pdf_validate_status: accepted but not used by this method
        :param index_status: value stored in ``index_status``
        :param documents_indexed: value stored in ``documents_indexed``
        :param index_message: value stored in ``message``
        :return: the saved PdfValidateCatalog instance
        """
        # We need data in the main application MySQL db updated to reflect the text extraction and indexing status
        validate_catalog, created = PdfValidateCatalog.objects.get_or_create(
            catalog__link_id=cl_id)

        validate_catalog.index_status = index_status
        validate_catalog.message = index_message
        validate_catalog.documents_indexed = documents_indexed
        validate_catalog.save()
        self._close()
        self.log.info('Saving state with message: [{0}]'.format(index_message))

        return validate_catalog

    def _save_to_db(self, data, cl_id, catalog_year, institution):
        """
        Store every extracted section as a PdfIndexDocument row.

        :param data: list of per-page dicts as returned by _parse_pages
        :param cl_id: CatalogLink ID stored on each row
        :param catalog_year: catalog year stored on each row
        :param institution: institution name stored on each row
        :return: number of rows created
        """
        # One of two save methods, saves to a relational database configured and optimized for text search
        self.log.info('Starting saving data to database')

        document_list = []
        indexed_date = datetime.now().strftime('%c')
        page_count = 0

        for idx, val in enumerate(data):
            if val != '':
                page_count += 1
                self.log.info('Indexing page [{0}]'.format(idx))

                for k, v in val.iteritems():
                    section_text = '\n'.join(v)
                    # The section's hash doubles as its identifier.
                    section_id = sha512(section_text).hexdigest()
                    entry = PdfIndexDocument(
                        hash_id=section_id,
                        page=page_count,
                        bounds=repr(k),
                        content=section_text.decode('utf-8'),
                        catalog_link=cl_id,
                        catalog_year=catalog_year,
                        institution=institution,
                        indexed_date=indexed_date)

                    document_list.append(entry)

        PdfIndexDocument.objects.bulk_create(document_list)
        return len(document_list)

    def _solr(self, data, cl_id, catalog_year, institution, soft_commit=True):
        """
        Replace the Solr documents of a catalog link with the extracted sections.

        :param data: list of per-page dicts as returned by _parse_pages
        :param cl_id: CatalogLink ID; existing documents with this
            ``catalog_link`` are deleted before the new ones are added
        :param catalog_year: catalog year stored on each document
        :param institution: institution name stored on each document
        :param soft_commit: accepted but currently not forwarded to Solr
        :return: number of documents sent to Solr
        """
        # The second of two save methods, saves to a Solr server
        # NOTE(review): soft_commit is never passed to pysolr below -- confirm
        # whether it should be forwarded to add()/delete().
        self.log.info(
            'Starting SOLR indexing with instance URL [{0}]'.format(SOLR))
        solr_instance = pysolr.Solr(SOLR, timeout=360)
        page_count = 0
        solr_data = []
        indexed_date = datetime.now().strftime('%c')

        for idx, val in enumerate(data):
            if val != '':
                page_count += 1
                self.log.debug('Indexing page [{0}]'.format(idx))
                for k, v in val.iteritems():
                    # Ensure key is always tuple to be iterated over
                    if isinstance(k, tuple):
                        section_text = '\n'.join(v)
                        section_id = sha512(section_text).hexdigest()
                        solr_data.append({
                            'id': section_id,
                            'page': page_count,
                            'bounds': repr(k),
                            'content': section_text.decode('utf-8'),
                            'catalog_link': cl_id,
                            'catalog_year': catalog_year,
                            'institution': institution,
                            'indexed_date': indexed_date
                        })

        self.log.info('Committing [{0}] pages of content'.format(page_count))
        solr_instance.delete(q='catalog_link:{0}'.format(cl_id))
        solr_instance.add(solr_data, waitSearcher=True)

        return len(solr_data)

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        """Log the failing task id, its first positional argument and the error info."""
        self.log.error(
            'Error for task [{0}] in indexing PDF document [{1}]'.format(
                task_id, args[0]))
        self.log.error('Einfo: [{0}]'.format(einfo))

    def run(self,
            cl_id,
            catalog_year,
            institution,
            db_insert=False,
            soft_commit=True):
        """
        Main run method for this Celery task.  For a provided :param cl_id, the associated PDF file will be retrieved from
        S3 for text extraction and inserted into the database or the SOLR server for search and data retrieval.

        :param cl_id: CatalogLink ID for PDF document to be indexed
        :param catalog_year String for catalog year
        :param institution String for institution name
        :param db_insert When True the sections are saved through _save_to_db, otherwise through _solr
        :param soft_commit SoftCommit for Solr, default = True; passed on to _solr
        :return: dict with keys 'state' and 'documents_indexed'
        :raises Exception: any error is logged, recorded through _save_state and re-raised
        """

        start_time = datetime.now()

        try:
            pdf_fetch_start = datetime.now()
            pdf_doc = self._fetch_document(cl_id)
            pdf_fetch_elapsed = datetime.now() - pdf_fetch_start
            self.log.info('PDF Initialization elapsed time: [{0}]'.format(
                pdf_fetch_elapsed))

            pdf_parse_start = datetime.now()
            if not pdf_doc.is_extractable:
                raise PDFTextExtractionNotAllowed(
                    'File [{0}.pdf] is not extractable to a PDF document'.
                    format(cl_id))

            text = self._parse_pages(pdf_doc)
            self.log.info(
                'PDF Parsing elapsed time: [{0}]'.format(datetime.now() -
                                                         pdf_parse_start))

            if db_insert:
                self.log.info('Inserting to database')
                documents_indexed = self._save_to_db(text, cl_id, catalog_year,
                                                     institution)
            else:
                self.log.info('Inserting to Solr')
                documents_indexed = self._solr(text, cl_id, catalog_year,
                                               institution, soft_commit)

            self.log.info(
                'Total elapsed processing time: [{0}]'.format(datetime.now() -
                                                              start_time))

            self._save_state(cl_id, 1, 1, documents_indexed)

            return {
                'state': states.SUCCESS,
                'documents_indexed': documents_indexed
            }

        except Exception as e:
            # ``message`` only exists on Python 2 exceptions and may be empty;
            # fall back to repr() so the failure is never recorded blank.
            message = getattr(e, 'message', '') or repr(e)
            self.log.error('{0} - {1}'.format(e, message))
            self._save_state(cl_id, 1, -1, index_message=message)
            # Bare raise keeps the original traceback.
            raise
        finally:
            # _save_state() normally closes the handles; this covers the case
            # where saving the state itself fails.
            self._close()
Esempio n. 25
0
def main(path):
    """Split every PDF in *path* into one PDF per student and list the results.

    For each ``*.pdf`` file directly inside *path* the function:

    1. scans the text boxes of every page for the student id / name labels
       and builds one ``id#name`` key per hit,
    2. turns the hits of that file into page ranges,
    3. writes one PDF per student into the ``结果`` sub-directory of *path*,
    4. rewrites the ``拆分结果清单.xlsx`` workbook listing every student seen
       so far.

    Failures while splitting or while writing the workbook are printed and do
    not stop the run; failures while parsing a PDF propagate to the caller.

    :param path: directory containing the source PDF files
    :return: always 1
    """
    savepath = os.path.join(path, '结果')
    # Every student key seen so far; drives the rows of the workbook.
    dic = {}
    for file_name in os.listdir(path):
        if not file_name.lower().endswith('.pdf'):
            continue
        path_file = os.path.join(path, file_name)
        print('当前处理=', path_file)

        # ---- extract the student information ----
        print('*' * 30)
        print('解析pdf开始')
        student_keys = _extract_student_keys(path_file, file_name)

        # ---- derive the page range of each student ----
        # Ranges are computed per file: applying ranges taken from one PDF
        # to the pages of another would cut the wrong pages.
        ranges = _student_page_ranges(student_keys)
        for key, page_range in ranges.items():
            dic.setdefault(key, page_range)
        print('解析pdf结束。')

        # ---- split the PDF ----
        print('拆分pdf开始!')
        try:
            _split_pdf(path_file, ranges, savepath)
            print('拆分pdf结束!')
        except Exception as e:
            print('拆分pdf文件=', path_file, '失败!')
            print(e)

        # ---- write the result list ----
        print('*' * 30)
        print('生成拆分结果清单开始!')
        try:
            _write_split_manifest(path, savepath, dic)
        except Exception as e:
            print('生成拆分清单失败!请检查是否存在未关闭的“拆分结果清单.xlsx”文件!')
            print(e)
        print('生成拆分清单结束!')

    return 1


def _extract_student_keys(path_file, file_name):
    """Return one 'id#name' key per horizontal text box carrying both labels."""
    keys = []
    with open(path_file, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            if not doc.is_extractable:
                print(file_name, 'is Error!')
                return keys

            doc_resource = PDFResourceManager()
            doc_resource_device = PDFPageAggregator(doc_resource,
                                                    laparams=LAParams())
            doc_interpreter = PDFPageInterpreter(doc_resource,
                                                 doc_resource_device)
            for page in doc.get_pages():
                print('exec page')
                doc_interpreter.process_page(page)
                layout = doc_resource_device.get_result()
                for x in layout:
                    print(type(x))
                    if isinstance(x, LTTextBoxHorizontal):
                        result = x.get_text().replace('\n', '')
                        print(result)
                        if '学号' in result and '姓名' in result:
                            # id sits between the id and name labels, the
                            # name between the name and gender labels
                            xh = result.split('学号')[-1].split('姓名')[0]
                            xm = result.split('姓名')[-1].split('性别')[0]
                            keys.append(xh + '#' + xm)
                    else:
                        print("x is not LTTextBox")
        finally:
            parser.close()
    return keys


def _student_page_ranges(keys):
    """Map each key to 'first-last', where first is its first position in *keys*.

    NOTE(review): positions in *keys* are used as page indexes, which
    assumes exactly one hit per page and contiguous pages per student.
    """
    first_seen = {}
    counts = {}
    for position, key in enumerate(keys):
        first_seen.setdefault(key, position)
        counts[key] = counts.get(key, 0) + 1
    return {
        key: str(start) + '-' + str(start + counts[key] - 1)
        for key, start in first_seen.items()
    }


def _split_pdf(path_file, ranges, savepath):
    """Write the pages of each range in *ranges* to '<id> <name>.pdf' in *savepath*."""
    if not os.path.isdir(savepath):
        os.makedirs(savepath)
    with open(path_file, 'rb') as source:
        reader = PdfFileReader(source)
        for key, page_range in ranges.items():
            writer = PdfFileWriter()
            start_page = int(page_range.split('-')[0])
            end_page = int(page_range.split('-')[-1])
            for index in range(start_page, end_page + 1):
                writer.addPage(reader.getPage(index))
            target = os.path.join(savepath, key.replace('#', ' ') + '.pdf')
            if os.path.exists(target):
                os.remove(target)
            # Must be written while the source file is still open.
            with open(target, 'wb') as f:
                writer.write(f)


def _write_split_manifest(path, savepath, dic):
    """Rewrite the workbook with one row (id, name, hyperlink) per key of *dic*."""
    # NOTE(review): the stale workbook is removed from *path* but the new
    # one is saved in the current working directory -- confirm which
    # location is intended.
    if os.path.exists(os.path.join(path, '拆分结果清单.xlsx')):
        os.remove(os.path.join(path, '拆分结果清单.xlsx'))
    wb = opl.Workbook()
    ws = wb.create_sheet('Res')
    ws.append(('学号', '姓名', '文件链接', '收件人(自行录入)', '方式(自行录入)'))
    for k in dic:
        ws.append((
            k.split('#')[0],
            k.split('#')[-1],
            '=hyperlink("' + os.path.join(savepath, k.replace('#', ' ') + '.pdf') + '")',
            '',
            '',
        ))
    wb.save(os.path.join(os.getcwd(), '拆分结果清单.xlsx'))
Esempio n. 26
0
def read_in_paychecks(filepaths='', password='', parser=paycheck_parser, cache=True):
    """
    Read in all the paychecks from a directory full of PDFs and return a DataFrame. If a password is supplied, encrypted
    PDFs *can* be read. The first page of each PDF is converted to text and handed, keyed by the date found in the file
    name, to `parser`. Since PDFs are unstructured, the parsing function will almost definitely need to be overridden by
    the user.

    The result is cached in a CSV file named after the directory part of `filepaths` plus '.csv'. The PDFs are re-read
    when caching is disabled, when no cache file exists, or when there are more PDFs than cached rows.

    Note:
    Assumes PDF file names contain a date (an IndexError is raised otherwise).

    Example:
    ```
    paychecks = read_in_paychecks('/path/to/paycheck/directory/*.pdf', password='******', parser=paycheck_parser)
    ```

    Args:
        filepaths: glob pattern matching the paycheck PDFs.
        password: password used to open the PDFs.
        parser: callable taking a {date: text} dict and returning a DataFrame.
        cache: read from / write to the CSV cache when True.

    Returns:
        DataFrame of paycheck data with missing values filled with 0.0 and values rounded to 2 decimals
        (the cached frame is returned as read when the cache is up to date).

    Raises:
        PDFTextExtractionNotAllowed: if a PDF does not allow text extraction.
    """

    # Get PDFs from directory and check for cached file
    paycheckfiles = glob.glob(filepaths)
    paycheck_cache_file = os.path.dirname(filepaths) + '.csv'
    cached = os.path.exists(paycheck_cache_file)

    # Read in cached file if it exists
    if cache and cached:
        paycheck_df = read_date_csv_file(paycheck_cache_file)

    # Read paycheck data if need be (not cached or new paycheck).
    # Short-circuiting guarantees paycheck_df is only inspected when it was loaded above.
    if not cache or not cached or len(paycheckfiles) > len(paycheck_df):
        # Read in paycheck data to dictionary
        paycheck_dict = {}
        for paycheckfile in paycheckfiles:
            # Get the date before touching the file so a bad name cannot leave a handle open
            date = DATE_RE.findall(paycheckfile)[0]
            paycheck_dict[date] = _paycheck_first_page_text(paycheckfile, password)

        # Parse paycheck data with user defined function
        paycheck_df = parser(paycheck_dict)

        # Enforce pennies
        paycheck_df = paycheck_df.fillna(0.0).round(2)

        if cache:
            paycheck_df.to_csv(paycheck_cache_file)

    return paycheck_df


def _paycheck_first_page_text(paycheckfile, password):
    """Return the text of the first page of one PDF, closing every handle even on error."""
    # Buffer that receives the converted text
    output = cStringIO.StringIO()
    try:
        with open(paycheckfile, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            pdfparser = PDFParser(fp)
            try:
                # Create a PDF document object that stores the document structure.
                # Supply the password for initialization.
                document = PDFDocument(pdfparser, password)

                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                # Resource manager shared by the converter and the interpreter.
                manager = PDFResourceManager()
                converter = TextConverter(manager, output, laparams=LAParams())
                try:
                    interpreter = PDFPageInterpreter(manager, converter)

                    # Only the first page is processed; a PDF without pages raises IndexError.
                    pages = list(PDFPage.create_pages(document))
                    interpreter.process_page(pages[0])

                    return output.getvalue()
                finally:
                    converter.close()
            finally:
                pdfparser.close()
    finally:
        output.close()