def check_pdf(file):
    """Try to parse one PDF and record whether it succeeded or failed.

    On success the path is put on the module-level ``succed_files`` queue and
    an empty ``success_mark.txt`` is created next to the PDF.  On any error
    the traceback is printed, the path is put on ``failed_files`` and the
    file is moved into ``fail_dir``.

    :param file: path of the PDF to check
    """
    failed = False
    print("check_pdf: %s" % (file,))
    try:
        # ``with`` guarantees the handle is closed even when opening or
        # parsing fails; the previous ``finally`` referenced ``parser``/``fp``
        # before assignment when ``open`` itself raised.
        with open(file, 'rb') as fp:
            parser = PDFParser(fp)
            try:
                document = PDFDocument(parser)
                print("extractable: %s ,modifiable: %s , printable: %s" % (
                    document.is_extractable,
                    document.is_modifiable,
                    document.is_printable))
                succed_files.put(file)
                succed_mark_file = os.path.join(
                    os.path.split(file)[0], "success_mark.txt")
                # Create (or truncate) the empty marker file.
                with open(succed_mark_file, "w"):
                    pass
                print("succed mark file generated: %s" % (succed_mark_file,))
            finally:
                parser.close()
    except Exception:
        traceback.print_exc()
        failed_files.put(file)
        failed = True

    # The file must be closed before it can be moved, otherwise
    # os.unlink(src) fails with "WindowsError: [Error 32]"
    # (see http://jining2593.blog.163.com/blog/static/2770148420101024114428257/).
    if failed:
        print("move fail file to dir %s , %s" % (fail_dir, file))
        shutil.move(file, fail_dir)
    print("all: %s ,succed: %s ,failed: %s" % (
        len(all_pdf_files), succed_files.qsize(), failed_files.qsize()))
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results.

    :param pdf_doc: path of the PDF file
    :param fn: callable invoked as ``fn(doc, *args)`` when the document is
        extractable
    :param pdf_pwd: password passed to ``doc.initialize``
    :return: the value returned by ``fn``, or ``None`` when the document is
        not extractable or an ``IOError`` occurred
    """
    result = None
    try:
        # ``with`` closes the file even if parsing or ``fn`` raises.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            try:
                # create a PDFDocument object that stores the document structure
                doc = PDFDocument()
                # connect the parser and document objects
                parser.set_document(doc)
                doc.set_parser(parser)
                # supply the password for initialization
                doc.initialize(pdf_pwd)
                if doc.is_extractable:
                    # apply the function and return the result
                    result = fn(doc, *args)
            finally:
                parser.close()
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
def getData(fileName):
    """Print selected entries of the PDF ``Info`` dictionary.

    :param fileName: path of the PDF file
    :return: ``"error"`` when the parser and document cannot be linked,
        ``"Empty metadata"`` when no trailer carries an ``Info`` entry, the
        caught exception object when extraction fails, otherwise ``None``
    :raises IOError: when the file cannot be opened
    """
    # ``with`` closes the file on every return path, including "error".
    with open(fileName, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        try:
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
            except Exception:
                return "error"
            try:
                # Start from None so a trailer without 'Info' no longer
                # triggers a NameError on an unbound local.
                metadata = None
                for xref in doc.xrefs:
                    info_ref = xref.trailer.get('Info')
                    if info_ref:
                        metadata = resolve1(info_ref)
                if metadata is None:
                    return "Empty metadata"
                for key in ('Author', 'Company', 'Producer', 'Creator'):
                    if key in metadata:
                        print(key + " " + metadata[key])
            except Exception as e:
                print("\t [x] Error in PDF extractor")
                return e
        finally:
            parser.close()
def getData(self):
    """Load the PDF ``Info`` dictionary into ``self.metadata`` / ``self.raw``.

    Reads ``self.fname`` using ``self.password``.

    :return: ``"ok"`` on success, ``"error"`` when the document cannot be
        initialised, ``"Empty metadata"`` when no trailer has an ``Info``
        entry, or the caught exception object when reading the trailers fails
    :raises IOError: when the file cannot be opened
    """
    # ``with`` closes the file on every return path, including "error".
    with open(self.fname, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        try:
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(self.password)
            except Exception:
                return "error"
            try:
                # Start from None so a trailer without 'Info' no longer
                # triggers a NameError on an unbound local.
                info = None
                for xref in doc.xrefs:
                    info_ref = xref.trailer.get('Info')
                    if info_ref:
                        info = resolve1(info_ref)
                    self.metadata = info
                    self.raw = info
                if info is None:
                    return "Empty metadata"
                return "ok"
            except Exception as e:
                return e
        finally:
            parser.close()
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    :param outfp: writable file-like object receiving the output
    :param fname: path of the PDF file
    :param password: password passed to ``doc.initialize``
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page object id to its (0-based) position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if (subtype and repr(subtype) == '/GoTo'
                                and action.get('D')):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
def getData(self):
    """Load the PDF ``Info`` dictionary into ``self.metadata`` / ``self.raw``.

    Reads ``self.fname`` using ``self.password``.

    :return: ``"ok"`` on success, ``"error"`` when the document cannot be
        initialised, ``"Empty metadata"`` when no trailer has an ``Info``
        entry, or the caught exception object when reading the trailers fails
    :raises IOError: when the file cannot be opened
    """
    # ``with`` closes the file on every return path, including "error".
    with open(self.fname, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        try:
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(self.password)
            except Exception:
                return "error"
            try:
                # Start from None so a trailer without 'Info' no longer
                # triggers a NameError on an unbound local.
                info = None
                for xref in doc.xrefs:
                    info_ref = xref.trailer.get('Info')
                    if info_ref:
                        info = resolve1(info_ref)
                    self.metadata = info
                    self.raw = info
                if info is None:
                    return "Empty metadata"
                return "ok"
            except Exception as e:
                return e
        finally:
            parser.close()
def with_pdf(pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results.

    :param pdf_doc: path of the PDF file
    :param fn: callable invoked as ``fn(doc, *args)`` when the document is
        extractable
    :param pdf_pwd: password passed to ``doc.initialize``
    :return: the value returned by ``fn``, or ``None`` when the document is
        not extractable or an ``IOError`` occurred
    """
    result = None
    try:
        # ``with`` closes the file even if parsing or ``fn`` raises.
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            try:
                # create a PDFDocument object that stores the document structure
                doc = PDFDocument()
                # connect the parser and document objects
                parser.set_document(doc)
                doc.set_parser(parser)
                # supply the password for initialization
                doc.initialize(pdf_pwd)
                if doc.is_extractable:
                    # apply the function and return the result
                    result = fn(doc, *args)
            finally:
                parser.close()
    except IOError:
        # the file doesn't exist or similar problem
        pass
    return result
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    :param outfp: writable file-like object receiving the output
    :param fname: path of the PDF file
    :param password: password passed to ``doc.initialize``
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page object id to its (0-based) position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if (subtype and repr(subtype) == '/GoTo'
                                and action.get('D')):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
def dumpoutline(
    outfp,
    fname,
    objids,
    pagenos,
    password="",
    dumpall=False,
    codec=None,
    extractdir=None,
):
    """Write the document outline of ``fname`` to ``outfp`` as XML.

    Each entry becomes an ``<outline>`` element with optional ``<dest>`` and
    ``<pageno>`` (1-based) children.  Nothing after the opening tag is
    written when the document raises ``PDFNoOutlines``.

    :param outfp: writable text file-like object
    :param fname: path of the PDF file
    :param password: document password
    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    accepted but not used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest):
                """Resolve a named or indirect destination to its array."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest["D"]
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write("<outlines>\n")
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if (
                                subtype
                                and repr(subtype) == "/'GoTo'"
                                and action.get("D")
                            ):
                                dest = resolve_dest(action["D"])
                                pageno = pages[dest[0].objid]
                    s = e(title)
                    # Only encode when the escaped title is not already a
                    # native str; formatting bytes into a str on Python 3
                    # would render the title as "b'...'".
                    if not isinstance(s, str):
                        s = s.encode("utf-8", "xmlcharrefreplace")
                    outfp.write(
                        '<outline level="{!r}" title="{}">\n'.format(level, s)
                    )
                    if dest is not None:
                        outfp.write("<dest>")
                        dumpxml(outfp, dest)
                        outfp.write("</dest>\n")
                    if pageno is not None:
                        outfp.write("<pageno>%r</pageno>\n" % pageno)
                    outfp.write("</outline>\n")
                outfp.write("</outlines>\n")
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def dumpoutline(outfp: TextIO,
                fname: str,
                objids: Any,
                pagenos: Container[int],
                password: str = '',
                dumpall: bool = False,
                codec: Optional[str] = None,
                extractdir: Optional[str] = None) -> None:
    """Write the document outline of ``fname`` to ``outfp`` as XML.

    Each entry becomes an ``<outline>`` element with optional ``<dest>`` and
    ``<pageno>`` (1-based) children.  Nothing after the opening tag is
    written when the document raises ``PDFNoOutlines``.

    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    accepted but not used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            pages = {
                page.pageid: pageno
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1)
            }

            def resolve_dest(dest: object) -> Any:
                """Resolve a named or indirect destination to its array."""
                if isinstance(dest, (str, bytes)):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/\'GoTo\''
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = escape(title)
                    outfp.write(
                        '<outline level="{!r}" title="{}">\n'.format(level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None, extractdir=None):
    """Write the document outline of ``fname`` to ``outfp`` as XML.

    Each entry becomes an ``<outline>`` element with optional ``<dest>`` and
    ``<pageno>`` (1-based) children.  Nothing after the opening tag is
    written when the document raises ``PDFNoOutlines``.

    :param outfp: writable file-like object
    :param fname: path of the PDF file
    :param password: document password
    ``objids``, ``pagenos``, ``dumpall``, ``codec`` and ``extractdir`` are
    accepted but not used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser, password)
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(PDFPage.create_pages(doc), 1))

            def resolve_dest(dest):
                """Resolve a named or indirect destination to its array."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/\'GoTo\''
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title)
                    # Only encode when the escaped title is not already a
                    # native str; formatting bytes into a str on Python 3
                    # would render the title as "b'...'".
                    if not isinstance(s, str):
                        s = s.encode('utf-8', 'xmlcharrefreplace')
                    outfp.write(
                        '<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def extract_TOC(pdf_path):
    """Return the outline (table of contents) of a PDF as an XML string.

    :param pdf_path: path of the PDF file
    :return: the XML text built so far; an empty string when the document
        raises ``PDFNoOutlines`` before anything is appended
    """
    toc = ""
    # ``with`` closes the file even when a lookup below raises.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            document = PDFDocument(parser, b"")
            pages = {
                page.pageid: pageno
                for (pageno, page)
                in enumerate(PDFPage.create_pages(document), 1)
            }

            def resolve_dest(dest):
                """Resolve a named or indirect destination to its array."""
                if isinstance(dest, str):
                    dest = resolve1(document.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(document.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                if isinstance(dest, PDFObjRef):
                    dest = dest.resolve()
                return dest

            try:
                outlines = document.get_outlines()
                toc += '<outlines>\n'
                for (level, title, dest, a, se) in tqdm(outlines, leave=False):
                    pageno = None
                    if dest:
                        # Very imperative and can cause errors that are hard
                        # to debug since we overwrite
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if (subtype and repr(subtype) == "/'GoTo'"
                                    and action.get("D")):
                                dest = resolve_dest(action.get("D"))
                                pageno = pages[dest[0].objid]
                    string = escape_str(title)
                    # Only encode when the escaped title is not already a
                    # native str; formatting bytes into a str on Python 3
                    # would render the title as "b'...'".
                    if not isinstance(string, str):
                        string = string.encode("utf-8", "xmlcharrefreplace")
                    toc += '<outline level="{!r}" title="{}">\n'.format(
                        level, string)
                    if dest is not None:
                        toc += "<dest>"
                        toc = dumpxml(toc, dest)
                        toc += "</dest>\n"
                    if pageno is not None:
                        toc += "<pageno>{}</pageno>\n".format(pageno)
                    toc += "</outline>\n"
                toc += "</outlines>\n"
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return toc
def succ_test():
    """Smoke test: parse a fixed sample PDF and print flags and outlines.

    Any error (including a missing directory or file) is reported with
    ``traceback.print_exc()`` instead of being raised.
    """
    try:
        os.chdir(r'F:\allitebooks\making-games')
        # ``with`` replaces the old ``finally`` block, which referenced
        # ``parser``/``fp`` before assignment when chdir/open failed.
        with open('Making Games.pdf', 'rb') as fp:
            parser = PDFParser(fp)
            try:
                document = PDFDocument(parser)
                print("extractable: %s ,modifiable: %s , printable: %s" % (
                    document.is_extractable,
                    document.is_modifiable,
                    document.is_printable))
                outlines = document.get_outlines()
                print(outlines)
            finally:
                parser.close()
    except Exception:
        traceback.print_exc()
def get_module_codes(url):
    """Download a PDF and collect the module codes found in its text.

    The PDF is saved under ``temp/`` using the last 20 characters of the URL
    as file name.  Every text box/line whose stripped text matches
    ``\\d\\w{2}\\d{3}`` is printed as a quoted, comma-separated item and
    appended to the result.  Scanning stops at the first match longer than
    six characters.

    :param url: URL of the PDF document
    :return: list of matched module-code strings, in document order
    """
    r = requests.get(url, stream=True)
    temp_path = "temp/" + url[-20:]
    with open(temp_path, 'wb') as f:
        f.write(r.content)

    module_codes = []
    with open(temp_path, 'rb') as f:
        parser = PDFParser(f)
        try:
            document = PDFDocument(parser)
            # Create PDFResourceManager object that stores shared resources
            # such as fonts or images
            resource_manager = PDFResourceManager()
            la_params = LAParams()
            # Extract the device to page aggregator to get LT object elements
            device = PDFPageAggregator(resource_manager, laparams=la_params)
            # Interpreter needs to be connected to resource manager for
            # shared resources and device
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.create_pages(document):
                first = True
                interpreter.process_page(page)
                # The device renders the layout from interpreter
                layout = device.get_result()
                for lt_obj in layout:
                    if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        continue
                    text = lt_obj.get_text().strip()
                    if not re.match(r"\d\w{2}\d{3}", text):
                        continue
                    if len(text) > 6:
                        print("],")
                        return module_codes
                    if not first:
                        print(", ", end='')
                    else:
                        first = False
                    print("\"" + text + "\"", end='')
                    # Previously the list was returned but never filled.
                    module_codes.append(text)
        finally:
            # Single cleanup point for both the early and the normal return.
            parser.close()
    return module_codes
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write the document outline of ``fname`` to ``outfp`` as XML.

    Each entry becomes an ``<outline>`` element with optional ``<dest>`` and
    ``<pageno>`` (0-based) children.  Nothing after the opening tag is
    written when the document raises ``PDFNoOutlines``.

    :param outfp: writable file-like object
    :param fname: path of the PDF file
    :param password: password passed to ``doc.initialize``
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))

            def resolve_dest(dest):
                """Resolve a named destination to its array."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if (subtype and repr(subtype) == '/GoTo'
                                    and action.get('D')):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title)
                    # Encode only non-native (unicode) titles; a native str
                    # is written as-is.
                    if not isinstance(s, str):
                        s = s.encode('utf-8', 'xmlcharrefreplace')
                    outfp.write(
                        '<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def dumpoutline(outfp, fname, objids, pagenos, password="", dumpall=False,
                codec=None):
    """Write the document outline of ``fname`` to ``outfp`` as XML.

    Each entry becomes an ``<outline>`` element with optional ``<dest>`` and
    ``<pageno>`` (0-based) children.  Nothing after the opening tag is
    written when the document raises ``PDFNoOutlines``.

    :param outfp: writable file-like object
    :param fname: path of the PDF file
    :param password: password passed to ``doc.initialize``
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, "rb") as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            doc.initialize(password)
            pages = dict(
                (page.pageid, pageno)
                for (pageno, page) in enumerate(PDFPage.create_pages(doc)))

            def resolve_dest(dest):
                """Resolve a named destination to its array."""
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest["D"]
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write("<outlines>\n")
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get("S")
                            if (subtype and repr(subtype) == "/GoTo"
                                    and action.get("D")):
                                dest = resolve_dest(action["D"])
                                pageno = pages[dest[0].objid]
                    s = e(title)
                    # Encode only non-native (unicode) titles; a native str
                    # is written as-is.
                    if not isinstance(s, str):
                        s = s.encode("utf-8", "xmlcharrefreplace")
                    outfp.write(
                        '<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write("<dest>")
                        dumpxml(outfp, dest)
                        outfp.write("</dest>\n")
                    if pageno is not None:
                        outfp.write("<pageno>%r</pageno>\n" % pageno)
                    outfp.write("</outline>\n")
                outfp.write("</outlines>\n")
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
    return
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    :param outfp: writable file-like object receiving the output
    :param fname: path of the PDF file
    :param password: password passed to ``doc.initialize``
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            # Map each page object id to its (0-based) position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
def getData(self):
    """Load the PDF ``Info`` dictionary into ``self.metadata`` / ``self.raw``.

    Reads ``self.fname`` using ``self.password``.

    :return: ``"ok"`` on success, ``"error"`` on any failure (including a
        catalog without a ``Metadata`` entry)
    """
    try:
        # ``with`` closes the file on the error path as well.
        with open(self.fname, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(self.password)
                # The result is unused, but the lookup is kept: a catalog
                # without 'Metadata' raises here and yields "error".
                resolve1(doc.catalog['Metadata'])
                # Start from None so a trailer without 'Info' does not hit
                # an unbound local.
                info = None
                for xref in doc.xrefs:
                    info_ref = xref.trailer.get('Info')
                    if info_ref:
                        info = resolve1(info_ref)
                    self.metadata = info
                    self.raw = info
            finally:
                parser.close()
        return "ok"
    except Exception:
        return "error"
def getData(self):
    """Load the PDF ``Info`` dictionary into ``self.metadata`` / ``self.raw``.

    Reads ``self.fname`` using ``self.password``.

    :return: ``"ok"`` on success, ``"error"`` on any failure (including a
        catalog without a ``Metadata`` entry)
    """
    try:
        # ``with`` closes the file on the error path as well.
        with open(self.fname, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            try:
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize(self.password)
                # The result is unused, but the lookup is kept: a catalog
                # without 'Metadata' raises here and yields "error".
                resolve1(doc.catalog['Metadata'])
                # Start from None so a trailer without 'Info' does not hit
                # an unbound local.
                info = None
                for xref in doc.xrefs:
                    info_ref = xref.trailer.get('Info')
                    if info_ref:
                        info = resolve1(info_ref)
                    self.metadata = info
                    self.raw = info
            finally:
                parser.close()
        return "ok"
    except Exception:
        return "error"
def addpdf(self):
    """Add a pdf to the list of pdf to be converted. ( See Queue class )

    Asks the user for a file, counts its pages, adds it to ``self.Queue``
    and refreshes the label listing the queued files.  Does nothing when the
    file dialog is cancelled.
    """
    self.controller.logger.info("addpdf is called.")
    filename = tkFileDialog.askopenfilename(
        initialdir="/",
        title="Select file",
        filetypes=(("pdf files", "*.pdf"), ("all files", "*.*")))
    if not filename:
        # Dialog cancelled: an empty selection used to reach open('') and
        # raise inside the Tk callback.
        return

    # ``with`` closes the file even when the PDF cannot be parsed.
    with open(str(filename), 'rb') as fp:
        parser = PDFParser(fp)
        try:
            document = PDFDocument(parser)
            num_pages = sum(1 for _ in PDFPage.create_pages(document))
        finally:
            parser.close()

    self.Queue.add_pdf(filename, num_pages)
    pdfs = "".join(str(entry) + "\n" for entry in self.Queue.queue)
    # NOTE(review): ``.place()`` returns None, so ``self.label10`` is None
    # here; kept as-is because other code may rely on it.
    self.label10 = tk.Label(self,
                            text=str(pdfs),
                            width=70,
                            height=10,
                            borderwidth=1,
                            relief="groove",
                            font=("Verdana", 8, "bold"),
                            fg="dark slate gray").place(x=260, y=170)
    self.controller.logger.info("a pdf has been added.")
def parse_data(self, path, filetype, **kwargs):
    """Read the ``Info`` dictionary of a PDF into ``self.metadata``.

    :param path: path of the file to parse; stored in ``self.filename``
    :param filetype: only ``FileTypes.PDF`` is handled
    :return: ``self`` when metadata was found and ``self._parse_data()`` ran;
        ``None`` for other file types, when no metadata was found, or on
        error (a message is appended to ``self.errors``)
    """
    self.filename = path
    self.metadata = {}
    if not filetype == FileTypes.PDF:
        return None

    # Default result; previously unbound when ``doc`` was falsy.
    out = None
    with open(self.filename, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            doc = PDFDocument(parser)
            if doc:
                try:
                    for xref in doc.xrefs:
                        info_ref = xref.trailer.get('Info')
                        info = None
                        if info_ref:
                            info = resolve1(info_ref)
                            self.metadata = info
                            # Replace indirect references by their values.
                            for k, v in info.items():
                                if isinstance(v, PDFObjRef):
                                    self.metadata[k] = resolve1(v)
                            # Stop at the first trailer that has 'Info'.
                            break
                    if not self.metadata:
                        self.errors.append('No metadata found')
                        out = None
                    else:
                        self._parse_data()
                        out = self
                except Exception as e:
                    self.logger.error(str(e))
                    self.errors.append(str(e))
                    out = None
            else:
                self.errors.append('Cannot parse document')
        finally:
            parser.close()
    return out
def dumpoutline(outfp, fname, objids, pagenos, password='', dumpall=False,
                codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    :param outfp: writable file-like object receiving the output
    :param fname: path of the PDF file
    :param password: password passed to ``doc.initialize``
    ``objids``, ``pagenos``, ``dumpall`` and ``codec`` are accepted but not
    used by this function.
    """
    # ``with`` closes the file even when a lookup below raises.
    with open(fname, 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(doc, fp)
        try:
            doc.initialize(password)
            # Map each page object id to its (0-based) position.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
def process_pdf(file_name, type):
    """Extract course information from a local or online PDF.

    For ``FileType.ONLINE`` the file is downloaded (unless a file with the
    same base name already exists), renamed to a unique temporary name,
    processed and finally removed.  Local files are processed in place.

    :param file_name: path of a local PDF, or URL when ``type`` is
        ``FileType.ONLINE``
    :param type: a ``FileType`` member
    :return: the ``courses`` dict handed to ``process_element`` for every
        layout element; ``{}`` when the download fails with an HTTP error
    :raises PDFTextExtractionNotAllowed: when the document forbids extraction
    """
    import uuid

    # Open a PDF file
    print('reading from', file_name)
    is_online = type == FileType.ONLINE
    if is_online:
        url_name = file_name
        file_name = file_name.split('/')[-1]
        if not os.path.exists(file_name):
            try:
                wget.download(url_name)
                print()
            except urllib.error.HTTPError as err:
                print(err)
                return {}  # return an empty dictionary
        # Since we are using parallel programming two files might end up
        # having the same name, therefore downloaded files get a unique
        # temporary name; they are deleted afterwards anyway.
        # (The old check ``if FileType.ONLINE:`` was always true and also
        # renamed local files, which were then never restored.)
        temp_name = '{}.pdf'.format(uuid.uuid4().hex)
        os.rename(file_name, temp_name)
        file_name = temp_name

    # Store Information and Data
    courses = {}  # store courses info in a dictionary
    try:
        with open(file_name, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            try:
                # Create a PDF document object that stores the document
                # structure.
                document = PDFDocument(parser)
                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed
                # Create a PDF resource manager object that stores shared
                # resources.
                rsrcmgr = PDFResourceManager()
                # Set parameters for analysis.
                laparams = LAParams()
                # Create a PDF page aggregator object.
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Process each page contained in the document.
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    # receive the LTPage object for the page.
                    layout = device.get_result()
                    elements = list(layout)
                    with concurrent.futures.ThreadPoolExecutor() as executor:
                        for element in elements:
                            executor.submit(process_element,
                                            args=[element, courses])
            finally:
                parser.close()
    finally:
        if is_online:
            os.remove(file_name)  # remove file after processing
    # print(len(courses),'courses has been extracted...\n')
    return courses
class PdfElementIndexer(AbstractIndexTask):
    """Celery task that extracts text from a catalog PDF and indexes it.

    The PDF is fetched from S3, its text is grouped per page by horizontal
    bounding box, and the result is stored either in the relational database
    or in Solr.
    """

    def __init__(self):
        self.log = logging.getLogger('django')
        self.limit = None
        self.catalogs = None
        # '{0}' is left as a placeholder for the catalog link id.
        self.aws_url = '{0}/{1}/{2}.pdf'.format(S3['endpoint'],
                                                S3['pdf_bucket'], '{0}')
        self._resource_mgr = PDFResourceManager()
        self._layout_params = LAParams()
        self.parser = None
        self.temp = None

    def _fetch_document(self, cl_id):
        """
        Download the PDF for :param cl_id into a temporary file and parse it.

        :param cl_id: CatalogLink ID used to build the S3 URL
        :return: PDFDocument linked to ``self.parser``
        :raises requests.RequestException: when the HTTP status is >= 400
        """
        url = self.aws_url.format(cl_id)
        self.log.debug('Retrieving PDF URL [{0}]'.format(url))
        req = requests.get(url, stream=True)
        try:
            if req.status_code >= 400:
                raise requests.RequestException(
                    'URL [{0}] return status [{1}]'.format(
                        url, req.status_code))
            req.raw.decode_content = True
            self.temp = tempfile.TemporaryFile()
            self.temp.write(req.raw.data)
            self.temp.seek(0)
            self.log.info(
                'Successful Retrieval and temporary file creation. Initializing PDF Extraction'
            )
            self.log.info('Initializing PDF Parser')
            self.parser = PDFParser(self.temp)
            self.log.info('Initializing PDF Document')
            doc = PDFDocument(self.parser)
            self.log.info('Linking Document and Parser')
            self.parser.set_document(doc)
            return doc
        finally:
            # Release the HTTP connection on the error paths too.
            req.close()

    def parse_lt_objects(self, layout, index, text=None):
        """
        Iterates over a list of LT* objects and captures the text contained
        within, Images are skipped

        :param layout: List of LT* objects retrieved from the PDFPage instance
        :param index: Current page number
        :param text: Unused; kept for backward compatibility
        :return: dict mapping (x0, x1) bounds to the list of text strings
            found within those bounds
        """
        self.log.debug('Processing LT objects for page [{0}]'.format(index))
        text_content = []
        # k = (x0, x1) of bounding box, v = list of text strings within that
        # column
        page_text = {}
        for obj in layout:
            if isinstance(obj, (LTTextBox, LTTextLine)):
                self.log.debug('[{0}] object found'.format(type(obj)))
                page_text = self._update_text_hash(page_text, obj)
            elif isinstance(obj, LTFigure):
                # LTFigure objects are containers for other LT* objects, so
                # recurse through children
                self.log.debug(
                    'LTFigure object found, recursing to process children nodes'
                )
                # NOTE(review): the recursive result is collected in
                # ``text_content`` but never merged into ``page_text``, so
                # text inside figures is not returned -- confirm intent.
                text_content.append(
                    self.parse_lt_objects(obj, index, text_content))
        self.log.debug('Page [{0}] extracted'.format(index))
        return page_text

    def _update_text_hash(self, text, obj, pct=0.2):
        """
        Use the bbox x0, x1 values within :param pct to produce lists of
        associated text within the hash

        :param text: dict of page text in the format
            {(x0, x1) : [list of strings in that column]}
        :param obj: LT text object exposing ``bbox`` and ``get_text()``
        :param pct: relative tolerance applied to both x0 and x1
        :return: hash of text values mapped to bounding boxes
        """
        # pdfminer bboxes are (x0, y0, x1, y1): the right edge is index 2.
        # Index 1 (used previously) is y0, which grouped text by vertical
        # position instead of by column width.
        x_0 = obj.bbox[0]
        x_1 = obj.bbox[2]
        key_found = False
        self.log.debug('Updating page text hash for bbox [({0}, {1})]'.format(
            x_0, x_1))
        for k, v in text.items():
            hash_x0 = k[0]
            if hash_x0 * (1.0 - pct) <= x_0 <= hash_x0 * (1.0 + pct):
                hash_x1 = k[1]
                if hash_x1 * (1.0 - pct) <= x_1 <= hash_x1 * (1.0 + pct):
                    # text inside this LT object was positioned at the same
                    # width as a prior series of text, so it belongs together
                    key_found = True
                    v.append(self._remove_non_ascii(obj.get_text()))
                    self.log.debug('BBox [{0}, {1}] text updated'.format(
                        x_0, x_1))
        if not key_found:
            # Based on width of bounding box, this text is a new series, so
            # it gets its own entry in the hash
            text[(x_0, x_1)] = [self._remove_non_ascii(obj.get_text())]
            self.log.debug('Created new hash key for bbox [{0}, {1}]'.format(
                x_0, x_1))
        return text

    def _parse_pages(self, document):
        """
        With an open PDFDocument object, get the pages and parse each one.

        This is a higher order function to be passed in the run() method as
        the fn parameter

        :param document: PDFDocument object
        :return: list with one text hash (see parse_lt_objects) per page
        """
        self.log.info('Initializing Page Aggregator')
        device = PDFPageAggregator(self._resource_mgr,
                                   laparams=self._layout_params)
        self.log.info('Initializing Page Interpreter')
        interpreter = PDFPageInterpreter(self._resource_mgr, device)
        text_content = []
        for idx, page in enumerate(PDFPage.create_pages(document)):
            self.log.debug('Interpreter processing page [{0}]'.format(idx))
            interpreter.process_page(page)
            self.log.debug('Retrieved LTPage object for page')
            layout = device.get_result()
            text_content.append(self.parse_lt_objects(layout, idx))
        self.log.info(
            'Successfully completed text extraction of [{0}] pages'.format(
                len(text_content)))
        return text_content

    def _to_bytestring(self, string, encode='utf-8'):
        """
        Convert a given unicode string to a byte string, using standard
        encoding.

        :param string: Unicode string
        :param encode: Encoding Format
        :return: bytestring encoded in :param encode format, or None when
            :param string is empty
        """
        if string:
            if isinstance(string, str):
                return string
            else:
                return string.encode(encode)

    def _remove_non_ascii(self, s):
        """Return :param s reduced to printable ASCII plus tab, LF and CR.

        Returns an empty string when :param s cannot be processed.
        """
        # Project uses Python 2.7, which comes with a host of unicode issues.
        # This attempts to sidestep, as we are concentrating only on
        # English-language documents.
        try:
            return u"".join(i for i in s if ord(i) < 128 and (
                ord(i) >= 32 or ord(i) == 9 or ord(i) == 10 or ord(i) == 13))
        except Exception:
            return ""

    def _close(self):
        """Close the parser and the temporary file, if they were created."""
        if self.parser:
            self.parser.close()
        if self.temp:
            self.temp.close()

    def _save_state(self,
                    cl_id,
                    pdf_validate_status,
                    index_status,
                    documents_indexed=0,
                    index_message=''):
        """
        Persist the indexing outcome and release the parser/temporary file.

        :param cl_id: CatalogLink ID
        :param pdf_validate_status: accepted but not used here
        :param index_status: value stored in ``index_status``
        :param documents_indexed: number of documents written
        :param index_message: message stored alongside the status
        :return: the updated PdfValidateCatalog row
        """
        # We need data in the main application MySQL db updated to reflect
        # the text extraction and indexing status
        validate_catalog, _ = PdfValidateCatalog.objects.get_or_create(
            catalog__link_id=cl_id)
        validate_catalog.index_status = index_status
        validate_catalog.message = index_message
        validate_catalog.documents_indexed = documents_indexed
        validate_catalog.save()
        self._close()
        self.log.info('Saving state with message: [{0}]'.format(index_message))
        return validate_catalog

    def _save_to_db(self, data, cl_id, catalog_year, institution):
        """
        Save the extracted text as PdfIndexDocument rows.

        :param data: list of per-page text hashes from _parse_pages
        :return: number of rows created
        """
        # One of two save methods, saves to a relational database configured
        # and optimized for text search
        self.log.info('Starting saving data to database')
        document_list = []
        indexed_date = datetime.now().strftime('%c')
        page_count = 0
        for idx, val in enumerate(data):
            if val != '':
                page_count += 1
                self.log.info('Indexing page [{0}]'.format(idx))
                for k, v in val.iteritems():
                    section_text = '\n'.join(v)
                    section_id = sha512(section_text).hexdigest()
                    entry = PdfIndexDocument(
                        hash_id=section_id,
                        page=page_count,
                        bounds=repr(k),
                        content=section_text.decode('utf-8'),
                        catalog_link=cl_id,
                        catalog_year=catalog_year,
                        institution=institution,
                        indexed_date=indexed_date)
                    document_list.append(entry)
        PdfIndexDocument.objects.bulk_create(document_list)
        return len(document_list)

    def _solr(self, data, cl_id, catalog_year, institution, soft_commit=True):
        """
        Replace the Solr documents of :param cl_id with the extracted text.

        :param data: list of per-page text hashes from _parse_pages
        :param soft_commit: accepted but not used here
        :return: number of documents sent to Solr
        """
        # The second of two save methods, saves to a Solr server
        self.log.info(
            'Starting SOLR indexing with instance URL [{0}]'.format(SOLR))
        solr_instance = pysolr.Solr(SOLR, timeout=360)
        page_count = 0
        solr_data = []
        indexed_date = datetime.now().strftime('%c')
        for idx, val in enumerate(data):
            if val != '':
                page_count += 1
                self.log.debug('Indexing page [{0}]'.format(idx))
                for k, v in val.iteritems():
                    # Ensure key is always tuple to be iterated over
                    if type(k) is tuple:
                        section_text = '\n'.join(v)
                        section_id = sha512(section_text).hexdigest()
                        solr_data.append({
                            'id': section_id,
                            'page': page_count,
                            'bounds': repr(k),
                            'content': section_text.decode('utf-8'),
                            'catalog_link': cl_id,
                            'catalog_year': catalog_year,
                            'institution': institution,
                            'indexed_date': indexed_date
                        })
        self.log.info('Committing [{0}] pages of content'.format(page_count))
        solr_instance.delete(q='catalog_link:{0}'.format(cl_id))
        solr_instance.add(solr_data, waitSearcher=True)
        return len(solr_data)

    def on_failure(self, exc, task_id, args, kwargs, einfo):
        """Log the failing task id, the first task argument and ``einfo``."""
        self.log.error(
            'Error for task [{0}] in indexing PDF document [{1}]'.format(
                task_id, args[0]))
        self.log.error('Einfo: [{0}]'.format(einfo))

    def run(self,
            cl_id,
            catalog_year,
            institution,
            db_insert=False,
            soft_commit=True):
        """
        Main run method for this Celery task. For a provided :param cl_id,
        the associated PDF file will be retrieved from S3 for text extraction
        and insert to the SOLR server for search and data retrieval.

        :param cl_id: CatalogLink ID for PDF document to be indexed
        :param catalog_year String for catalog year
        :param institution String for institution name
        :param db_insert When True, save to the database instead of Solr
        :param soft_commit SoftCommit for Solr, default = True
            True will refresh the view of the index in a more performant
            manner, without on-disk guarantees
        :return: dict with ``state`` and ``documents_indexed``
        """
        start_time = datetime.now()
        try:
            pdf_fetch_start = datetime.now()
            pdf_doc = self._fetch_document(cl_id)
            pdf_fetch_elapsed = datetime.now() - pdf_fetch_start
            self.log.info('PDF Initialization elapsed time: [{0}]'.format(
                pdf_fetch_elapsed))
            pdf_parse_start = datetime.now()
            if not pdf_doc.is_extractable:
                raise PDFTextExtractionNotAllowed(
                    'File [{0}.pdf] is not extractable to a PDF document'.
                    format(cl_id))
            text = self._parse_pages(pdf_doc)
            self.log.info(
                'PDF Parsing elapsed time: [{0}]'.format(datetime.now() -
                                                         pdf_parse_start))
            if db_insert:
                self.log.info('Inserting to database')
                documents_indexed = self._save_to_db(text, cl_id, catalog_year,
                                                     institution)
            else:
                self.log.info('Inserting to Solr')
                documents_indexed = self._solr(text, cl_id, catalog_year,
                                               institution, soft_commit)
            self.log.info(
                'Total elapsed processing time: [{0}]'.format(datetime.now() -
                                                              start_time))
            self._save_state(cl_id, 1, 1, documents_indexed)
            return {
                'state': states.SUCCESS,
                'documents_indexed': documents_indexed
            }
        except Exception as e:
            # ``message`` is deprecated and may be empty or missing; fall
            # back to the string form of the exception.
            message = getattr(e, 'message', '') or str(e)
            self.log.error('{0} - {1}'.format(e, message))
            self._save_state(cl_id, 1, -1, index_message=message)
            # Bare raise keeps the original traceback for on_failure.
            raise
def main(path):
    """Split every PDF in ``path`` into one PDF per student and list them.

    For each ``*.pdf`` file the text boxes are scanned for a student number
    and name, a page range per student is derived, the pages are written to
    ``<path>/结果/<number> <name>.pdf`` and an Excel overview is generated.

    :param path: directory containing the PDF files
    :return: always ``1``
    """
    files = os.listdir(path)
    # print(files)
    dic = {}
    for file in files:
        if not file.lower().endswith('.pdf'):
            continue
        L = []
        path_file = os.path.join(path, file)
        print('当前处理=', path_file)

        # ---------------- extract student info: start ----------------
        print('*' * 30)
        print('解析pdf开始')
        # ``with`` closes the handle that used to be leaked by
        # ``PDFParser(open(...))``.
        with open(path_file, 'rb') as pdf_fp:
            parser = PDFParser(pdf_fp)
            try:
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                if doc.is_extractable:
                    doc_resource = PDFResourceManager()
                    doc_device = LAParams()
                    doc_resource_device = PDFPageAggregator(
                        doc_resource, laparams=doc_device)
                    doc_interpreter = PDFPageInterpreter(
                        doc_resource, doc_resource_device)
                    for page in doc.get_pages():
                        # result=''
                        print('exec page')
                        doc_interpreter.process_page(page)
                        layout = doc_resource_device.get_result()
                        for x in layout:
                            print(type(x))
                            if isinstance(x, LTTextBoxHorizontal):
                                result = x.get_text().replace('\n', '')
                                print(result)
                                if (result.find('学号') >= 0
                                        and result.find('姓名') >= 0):
                                    xh = result.split('学号')[-1].split('姓名')[0]
                                    xm = result.split('姓名')[-1].split('性别')[0]
                                    L.append(xh + '#' + xm)
                            else:
                                print("x is not LTTextBox")
                else:
                    print(file, 'is Error!')
            finally:
                parser.close()
        # ---------------- extract student info: end ----------------

        # ---------------- build per-student page ranges ----------------
        # NOTE(review): this treats positions in L as page numbers and the
        # pages of one student as contiguous -- confirm against real input.
        for index, key in enumerate(L):
            if key not in dic:
                dic[key] = (str(L.index(key)) + '-'
                            + str(index + L.count(key) - 1))
        print('解析pdf结束。')
        print('拆分pdf开始!')

        # ---------------- split the pdf file: start ----------------
        savepath = os.path.join(path, '结果')
        try:
            # The output directory was never created, so every write failed
            # on a fresh input directory.
            os.makedirs(savepath, exist_ok=True)
            with open(path_file, 'rb') as src_fp:
                doc = PdfFileReader(src_fp)
                for k, v in dic.items():
                    pdf = PdfFileWriter()
                    start_page = int(v.split('-')[0])
                    end_page = int(v.split('-')[-1])
                    for index in range(start_page, end_page + 1):
                        pdf.addPage(doc.getPage(index))
                    target = os.path.join(savepath,
                                          k.replace('#', ' ') + '.pdf')
                    if os.path.exists(target):
                        os.remove(target)
                    with open(target, 'wb') as f:
                        pdf.write(f)
            print('拆分pdf结束!')
        except Exception as e:
            print('拆分pdf文件=', path_file, '失败!')
            print(e)
        # print(dic)
        # ---------------- split the pdf file: end ----------------

        # ---------------- generate the split result list: start ----------
        # content=[]
        print('*' * 30)
        print('生成拆分结果清单开始!')
        try:
            # NOTE(review): the stale file is removed from ``path`` but the
            # workbook is saved to the current working directory -- confirm
            # which location is intended.
            if os.path.exists(os.path.join(path, '拆分结果清单.xlsx')):
                os.remove(os.path.join(path, '拆分结果清单.xlsx'))
            wb = opl.Workbook()
            ws = wb.create_sheet('Res')
            ws.append(('学号', '姓名', '文件链接', '收件人(自行录入)', '方式(自行录入)'))
            for k in dic:
                t = (k.split('#')[0],
                     k.split('#')[-1],
                     '=hyperlink("'
                     + os.path.join(savepath, k.replace('#', ' ') + '.pdf')
                     + '")',
                     '',
                     '')
                ws.append(t)
            # print(content)
            # ws.append(content)
            wb.save(os.path.join(os.getcwd(), '拆分结果清单.xlsx'))
        except Exception as e:
            print('生成拆分清单失败!请检查是否存在未关闭的“拆分结果清单.xlsx”文件!')
            print(e)
        print('生成拆分清单结束!')
        # ---------------- generate the split result list: end ------------
    return 1
def _first_page_text(pdf_path, password):
    """Return the text of the first page of the PDF at ``pdf_path``."""
    # Every resource is released in a ``finally``/``with`` so nothing leaks
    # when extraction is refused or parsing fails.
    with open(pdf_path, 'rb') as fp:
        # Create string to put PDF
        output = cStringIO.StringIO()
        try:
            # Create a PDF parser object associated with the file object.
            pdfparser = PDFParser(fp)
            try:
                # Create a PDF document object that stores the document
                # structure. Supply the password for initialization.
                document = PDFDocument(pdfparser, password)
                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed
                # Create a PDF resource manager object that stores shared
                # resources.
                manager = PDFResourceManager()
                # Create a PDF converter object.
                converter = TextConverter(manager, output, laparams=LAParams())
                try:
                    # Create a PDF interpreter object.
                    interpreter = PDFPageInterpreter(manager, converter)
                    # Only the first page is processed.
                    pages = list(PDFPage.create_pages(document))
                    interpreter.process_page(pages[0])
                    return output.getvalue()
                finally:
                    converter.close()
            finally:
                pdfparser.close()
        finally:
            output.close()


def read_in_paychecks(filepaths='', password='', parser=paycheck_parser, cache=True):
    """
    Read in all the paychecks from a directory full of PDFs and return a DataFrame.
    If a password is supplied encrypted PDFs *can* be read.

    The first page of each PDF is converted to text and collected in a dict
    keyed by the date found in the file name; that dict is handed to
    ``parser``. Since PDFs are unstructured the parsing function will almost
    definitely need to be overridden by the user.

    When ``cache`` is true the result is stored in (and later read back
    from) ``<directory of filepaths>.csv``; the PDFs are re-read only when
    there are more files than cached rows.

    Note: Assumes PDF file names contain date.

    Example:
    ```
    paychecks = read_in_paychecks('/path/to/paycheck/directory/*.pdf', password='******', parser=paycheck_parser)
    ```
    """
    # Get PDFs from directory and check for cached file
    paycheckfiles = glob.glob(filepaths)
    paycheck_cache_file = os.path.dirname(filepaths) + '.csv'
    cached = os.path.exists(paycheck_cache_file)

    # Read in cached file if it exists
    if cache and cached:
        paycheck_df = read_date_csv_file(paycheck_cache_file)

    # Read paycheck data if need be (not cached or new paycheck)
    if not cache or not cached or len(paycheckfiles) > len(paycheck_df):
        # Read in paycheck data to dictionary
        paycheck_dict = {}
        for paycheckfile in paycheckfiles:
            # Get the date
            date = DATE_RE.findall(paycheckfile)[0]
            # Add to dictionary
            paycheck_dict[date] = _first_page_text(paycheckfile, password)

        # Parse paycheck data with user defined function
        paycheck_df = parser(paycheck_dict)
        # Enforce pennies
        paycheck_df = paycheck_df.fillna(0.0).round(2)
        if cache:
            paycheck_df.to_csv(paycheck_cache_file)
    return paycheck_df