def parse_pdf_pdfminer(self, f, fpath):
    """Extract text page by page with pdfminer and collect the parsed pages.

    Args:
        f: File object handed to ``PDFPage.get_pages``.
        fpath: Path of the PDF; forwarded to the handler and ``parse_page``.

    Returns:
        A list holding the result of ``self.parse_page`` for every page, or
        ``None`` when an error was reported through ``self.handler``.
    """
    try:
        parsed_pages = []

        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)
        pages = PDFPage.get_pages(f, set(), check_extractable=True)
        for page_num, page in enumerate(pages, 1):
            # A fresh buffer and converter per page keeps each page's text
            # separate from the others.
            page_buffer = StringIO()
            converter = TextConverter(resources, page_buffer, laparams=layout)
            PDFPageInterpreter(resources, converter).process_page(page)
            page_text = page_buffer.getvalue()
            page_buffer.close()
            parsed_pages.append(self.parse_page(fpath, page_text, page_num))
        self.handler.print_footer(fpath)
        return parsed_pages
    except (KeyboardInterrupt, SystemExit):
        # Never swallow a user interrupt or interpreter shutdown.
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def dump_pdf_pdfminer(self, fpath_in):
    """Dump the text of a PDF into a ``.txt`` file next to it.

    The output path is ``fpath_in`` with its extension replaced by ``.txt``.
    A summary line (or a failure line) is printed when done.

    Args:
        fpath_in: Path of the PDF to dump.
    """
    fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
    written = 0

    with open(fpath_in, 'rb') as fin, open(fpath_out, 'wb') as fout:
        try:
            layout = LAParams()
            layout.all_texts = True
            resources = PDFResourceManager()

            for page in PDFPage.get_pages(fin, set(), check_extractable=True):
                # One buffer/converter per page; the text is appended to the
                # output file as soon as the page is done.
                page_buffer = StringIO()
                converter = TextConverter(resources, page_buffer, laparams=layout)
                PDFPageInterpreter(resources, converter).process_page(page)
                page_text = page_buffer.getvalue()
                page_buffer.close()

                fout.write(page_text)
                written += len(page_text)
            print("Written %d bytes to %s" % (written, fpath_out))
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception:
            print("Failed parsing %s" % (fpath_in))
def _convert_pdf_to_text(self, password=None):
    """Convert ``self.cvFile`` to a text file inside ``self.scratchDir``.

    The output file is named after the PDF (everything before its last dot)
    with the current Unix timestamp appended, and its path is stored in
    ``self.cvTextFile``. Page numbers 0-29 are requested from pdfminer.

    Args:
        password: Optional PDF password; when given it replaces
            ``self.cvFilePasswd`` before the conversion.

    Returns:
        0 once the conversion has finished.
    """
    input_pdf = self.cvFile
    if password is not None:
        self.cvFilePasswd = password

    pagenos = range(0, 30)
    maxpages = len(pagenos)
    codec = 'utf-8'

    laparams = LAParams()
    laparams.all_texts = True
    laparams.showpageno = True

    # Text before the last dot of the file name ("" when there is no dot).
    stem = os.path.basename(input_pdf).rpartition(".")[0]
    output_filename = self.scratchDir + os.path.sep + stem + str(int(time.time())) + r".txt"
    self.cvTextFile = output_filename

    rsrcmgr = PDFResourceManager()
    # Context managers guarantee both files are closed even when pdfminer
    # raises part-way through the document.
    with open(output_filename, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            with open(input_pdf, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=self.cvFilePasswd, check_extractable=True)
        finally:
            device.close()
    return 0
def _pdf_to_text(path):
    """Extract ASCII text from the PDF at ``path``.

    Args:
        path: Path of the PDF file.

    Returns:
        ``(text, True)`` on success, or ``("", False)`` when the PDF could
        not be processed.
    """
    retstr = StringIO()
    try:
        rsrcmgr = PDFResourceManager()
        codec = 'ascii'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()

        # fix the non-utf8 string ...
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')
        retVal = (txt, True)
    except Exception:
        # Best effort: a PDF that is not formatted correctly yields an empty
        # result instead of an exception.
        retVal = ("", False)
    finally:
        retstr.close()
    # The result tuple was previously built but never handed back.
    return retVal
def parse_pdf_pdfminer(self, f, fpath):
    """Feed the text of every page of a PDF to ``self.parse_page``.

    Args:
        f: File object handed to ``PDFPage.get_pages``.
        fpath: Path of the PDF; forwarded to the handler and ``parse_page``.

    Errors other than ``KeyboardInterrupt``/``SystemExit`` are passed to
    ``self.handler.print_error`` instead of being raised.
    """
    try:
        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)
        pages = PDFPage.get_pages(f, set(), check_extractable=True)
        for page_num, page in enumerate(pages, 1):
            # Each page gets its own buffer so parse_page sees one page of
            # text at a time.
            page_buffer = StringIO()
            converter = TextConverter(resources, page_buffer, laparams=layout)
            PDFPageInterpreter(resources, converter).process_page(page)
            page_text = page_buffer.getvalue()
            page_buffer.close()

            self.parse_page(fpath, page_text, page_num)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def do_import(self, results, filepath):
    """Extract the text of a PDF and store it on ``results.investigation``.

    Every page is rendered into one shared buffer, followed by a newline.
    The accumulated text is then passed to
    ``results.investigation.update(import_text=...)``.

    Args:
        results: Object whose ``investigation.update`` receives the text.
        filepath: Path of the PDF to import.
    """
    laparams = LAParams()
    laparams.all_texts = True
    rsrcmgr = PDFResourceManager()

    buff = StringIO()
    try:
        # ``with`` closes the PDF even if a page fails to parse.
        with open(filepath, "rb") as fp:
            for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                device = TextConverter(rsrcmgr, buff, codec="utf-8", laparams=laparams)
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                buff.write("\n")
        results.investigation.update(import_text=buff.getvalue())
    finally:
        buff.close()
def count_words(self):
    """
    Count the whitespace-separated words of the PDF at ``self.filename``
    after removing ASCII punctuation.

    Thanks to http://pinkyslemma.com/2013/07/02/word-frequency-from-pdfs/
    and http://www.unixuser.org/~euske/python/pdfminer/programming.html

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    with open(self.filename, "rb") as pdf_file:
        resources = PDFResourceManager()
        text_buffer = StringIO()
        layout = LAParams()
        layout.all_texts = True
        converter = TextConverter(resources, text_buffer, codec='utf-8', laparams=layout)

        # The document object stores the PDF structure read by the parser.
        document = PDFDocument(PDFParser(pdf_file))
        # Abort if the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.create_pages(document):
            page_interpreter.process_page(page)
        extracted = text_buffer.getvalue()

    # Identity translation table; the second argument lists characters to drop.
    without_punctuation = extracted.translate(string.maketrans("", ""), string.punctuation)
    words = without_punctuation.split()
    return len(words)
def parse_pdf_pdfminer(self, f, fpath):
    """Run ``self.parse_page`` on the UTF-8 text of each page of a PDF.

    Args:
        f: File object handed to ``PDFPage.get_pages``.
        fpath: Path of the PDF; forwarded to the handler and ``parse_page``.

    The header is printed before the first page and the footer after the
    last one. No exception is handled here.
    """
    try:
        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()

        self.handler.print_header(fpath)
        pages = PDFPage.get_pages(f, set(), check_extractable=True)
        for page_num, page in enumerate(pages, 1):
            page_buffer = StringIO()
            converter = TextConverter(resources, page_buffer, codec='utf-8', laparams=layout)
            PDFPageInterpreter(resources, converter).process_page(page)
            page_text = page_buffer.getvalue()
            page_buffer.close()

            self.parse_page(fpath, page_text, page_num)
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        # Re-raised unchanged; there is no other handler in this variant.
        raise
def to_text(path):
    """Return the text pdfminer extracts from the PDF at ``path``.

    The default layout parameters are printed before ``all_texts`` is
    enabled (output retained from the original implementation).

    Args:
        path: Path of the PDF file.

    Returns:
        The extracted text as produced by ``TextConverter`` with the
        ``utf-8`` codec.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    print(laparams)
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
    finally:
        device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
def to_text(path):
    """Wrapper around pdfminer: return the text of the PDF at ``path``.

    All dependencies are imported locally, so pdfminer is only required
    when the function is actually called.
    """
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    resources = PDFResourceManager()
    text_buffer = StringIO()
    layout = LAParams()
    layout.all_texts = True
    converter = TextConverter(resources, text_buffer, codec='utf-8', laparams=layout)

    with open(path, 'rb') as pdf_file:
        page_interpreter = PDFPageInterpreter(resources, converter)
        # maxpages=0 and an empty page set: no page filter is applied.
        for page in PDFPage.get_pages(pdf_file, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            page_interpreter.process_page(page)

    converter.close()
    extracted = text_buffer.getvalue()
    text_buffer.close()
    return extracted
def parse_pdf_pdfminer(self, f, fpath):
    """Parse every page of a PDF and hand its UTF-8 bytes to ``parse_page``.

    This variant wires ``PDFParser`` and ``PDFDocument`` together by hand
    (``set_document``/``set_parser``) and iterates ``doc.get_pages()``.

    Args:
        f: File object given to ``PDFParser``.
        fpath: Path of the PDF; forwarded to the handler and ``parse_page``.

    Errors other than ``KeyboardInterrupt``/``SystemExit`` are reported via
    ``self.handler.print_error``.
    """
    try:
        layout = LAParams()
        layout.all_texts = True
        resources = PDFResourceManager()

        if self.dedup:
            self.dedup_store = set()

        self.handler.print_header(fpath)

        pdf_parser = PDFParser(f)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)

        for page_num, page in enumerate(document.get_pages(), 1):
            page_buffer = StringIO()
            converter = TextConverter(resources, page_buffer, laparams=layout)
            PDFPageInterpreter(resources, converter).process_page(page)

            page_text = page_buffer.getvalue()
            self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_num)
            page_buffer.close()
        self.handler.print_footer(fpath)
    except (KeyboardInterrupt, SystemExit):
        raise
    except Exception as e:
        self.handler.print_error(fpath, e)
def extract_text(in_filename, out_filename):
    """Write the text of every page of a PDF to a text file.

    Args:
        in_filename: Path of the PDF to read.
        out_filename: Path of the text file to write; the page texts are
            concatenated and followed by one trailing newline.
    """
    laparams = LAParams()
    laparams.all_texts = True
    rsrcmgr = PDFResourceManager()

    page_texts = []
    # ``with`` closes the PDF, which was previously left open.
    with open(in_filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
            PDFPageInterpreter(rsrcmgr, device).process_page(page)
            page_texts.append(retstr.getvalue())
            retstr.close()

    with open(out_filename, 'w') as f:
        # join once instead of growing a string page by page
        print(''.join(page_texts), file=f)
def pdf2str(path):
    """Return the text of the PDF at ``path``, with vertical text detection.

    Args:
        path: Path of the PDF file.

    Returns:
        The text written by ``TextConverter`` using the ``utf-8`` codec.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()

    # Set parameters
    codec = 'utf-8'
    laparams.all_texts = True
    laparams.detect_vertical = True
    caching = True
    pagenos = set()

    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Open the file and parse; the file is closed even if parsing fails.
    try:
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()

    # Clean up
    text = retstr.getvalue()
    retstr.close()
    return text
def to_text(path):
    """Extract the text of a PDF invoice with `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        extracted text from the pdf, encoded as UTF-8

    """
    try:
        # python 2
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        # python 3: the StringIO module does not exist
        from io import StringIO

    import sys

    # Makes a local pdfminer checkout importable before the imports below.
    sys.path.append("/home/teemo/source/pdfminer/")

    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    resources = PDFResourceManager()
    text_buffer = StringIO()
    layout = LAParams()
    layout.all_texts = True
    converter = TextConverter(resources, text_buffer, codec='utf-8', laparams=layout)

    with open(path, 'rb') as pdf_file:
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.get_pages(
            pdf_file,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        ):
            page_interpreter.process_page(page)

    converter.close()
    extracted = text_buffer.getvalue()
    text_buffer.close()
    return extracted.encode('utf-8')
def convert(fname, pages=None, M=1.0, L=0.3, W=0.2, F=0.5):
    """
    Converts a pdf filename into plain text.

    Each value is specified not as an actual length, but as a proportion of
    the length to the size of each character in question.

    Parameters define layout analysis. In a PDF text is in several chunks
    of various types. Text extraction needs to recover text chunks which
    are regarded as continuous if elements distance is closer than the
    char_margin (identified as M) and thus are grouped into one block.
    Two lines are part of the same text if they are closer than the
    line_margin (L). If the distance between two words is greater than the
    word_margin (W), blank characters (spaces) shall be inserted as
    necessary to keep format.
    Boxes flow (F) specifies how much a horizontal and vertical position
    of a text matters when determining text flow order. The value should be
    within the range from -1.0 (only horizontal position matters) to +1.0
    (only vertical position matters).

    Keyword arguments:
        fname -- PDF file name (string)
        pages -- Set of pages to extract (set)
        M -- char_margin (float)
        L -- line_margin (float)
        W -- word_margin (float)
        F -- boxes_flow (float)
    Return:
        text: pdf contents as plain text
    """
    pagenums = set(pages) if pages else set()

    output = BytesIO()
    codec = "utf-8"
    manager = PDFResourceManager()

    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = False
    laparams.char_margin = M
    laparams.line_margin = L
    laparams.word_margin = W
    laparams.boxes_flow = F

    converter = TextConverter(manager, output, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # The input file is closed even if a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
    finally:
        converter.close()

    text = output.getvalue()
    # The original referenced ``output.close`` without calling it.
    output.close()
    return text
def to_text(path):
    """Extract the text of a PDF invoice with `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        extracted text from the pdf, encoded as UTF-8

    """
    try:
        # python 2
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        # python 3: the StringIO module does not exist
        from io import StringIO

    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    resources = PDFResourceManager()
    text_buffer = StringIO()
    layout = LAParams()
    layout.all_texts = True
    converter = TextConverter(resources, text_buffer, codec='utf-8', laparams=layout)

    with open(path, 'rb') as pdf_file:
        page_interpreter = PDFPageInterpreter(resources, converter)
        for page in PDFPage.get_pages(
            pdf_file,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        ):
            page_interpreter.process_page(page)

    converter.close()
    extracted = text_buffer.getvalue()
    text_buffer.close()
    return extracted.encode('utf-8')
def readText(self, path, outtype='text', opts=()):
    """Render the PDF at ``path`` through ``HTMLConverter`` and print it.

    Args:
        path: Path of the PDF file.
        outtype: Accepted for compatibility; the output always goes through
            ``HTMLConverter`` and is printed to stdout.
        opts: pdf2txt.py-style options, either an iterable of
            ``(flag, value)`` pairs or a dict mapping flag to value.
            Honoured flags: ``-p`` (comma-separated 1-based pages), ``-m``
            (max pages), ``-P`` (password), ``-c`` (codec), ``-n`` (disable
            layout analysis) and the layout flags ``-A``, ``-V``, ``-M``,
            ``-L``, ``-W``, ``-F``. The flags ``-d``, ``-o``, ``-O``,
            ``-t``, ``-Y`` and ``-s`` are accepted and ignored.

    Returns:
        None. The layout parameters and the converted document are printed.
    """
    pagenos = set()
    maxpages = 0
    # Defined up front: it used to exist only when -P was supplied, so every
    # call without -P failed with a NameError.
    password = ''
    codec = 'utf-8'
    laparams = LAParams()
    use_layout = True

    pairs = opts.items() if isinstance(opts, dict) else opts
    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            # Applied after the loop so a later layout flag cannot hit None.
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-c':
            codec = v
        # -d, -o, -O, -t, -Y, -s: no effect on the printed output.

    if not use_layout:
        laparams = None
    print(laparams)

    CMapDB.debug = self.debug
    PDFResourceManager.debug = self.debug
    PDFPageInterpreter.debug = self.debug
    PDFDevice.debug = self.debug

    rsrcmgr = PDFResourceManager()
    outfp = StringIO()
    device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    try:
        # The PDF is closed even if pdfminer raises.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, check_extractable=True)
    finally:
        device.close()
    print(outfp.getvalue())
    outfp.close()
    return
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """Convert a BORME PDF file into a text file.

    Args:
        filename_in: Path of the PDF to read.
        filename_out: Path of the text file to write. When it is an existing
            directory, the output is placed inside it under the base name of
            ``filename_in``.
        rewrite: Overwrite ``filename_out`` if it already exists.

    Returns:
        ``False`` when the output already exists and ``rewrite`` is false
        (nothing is written), ``True`` after a completed conversion.
    """
    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!', filename_out)
        return False

    # conf
    codec = 'utf-8'
    laparams = LAParams()
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0

    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)

    # Context managers close both files even when a page fails to parse.
    with open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            with open(filename_in, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # https://github.com/euske/pdfminer/issues/72
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password, caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    return True
def to_text(self):
    """Return the text of every page of ``self._doc``.

    Layout analysis runs with vertical-text detection, ``all_texts`` and a
    word margin of 0.4. The buffer content is decoded as UTF-8, ignoring
    undecodable bytes.
    """
    layout = LAParams()
    layout.detect_vertical = True
    layout.all_texts = True
    layout.word_margin = 0.4

    resources = PDFResourceManager()
    sink = StringIO()
    converter = TextConverter(resources, sink, laparams=layout)
    page_interpreter = PDFPageInterpreter(resources, converter)

    for page in self._doc.get_pages():
        page_interpreter.process_page(page)

    raw_text = sink.getvalue()
    return raw_text.decode('utf-8', 'ignore')
def get_text(self):
    """Returns all text content from the PDF as plain text.

    The text is first extracted with pdfminer. When that yields nothing but
    whitespace, the PDF is rendered to ``temp/pageNNN.png`` with GhostScript
    (``gs``) and each PNG is OCR'ed with pytesseract (Swedish), after which
    the PNG is deleted.

    The result is also stored in ``self.text``.

    Raises:
        Exception: any error from opening or processing the PDF is logged
            and re-raised.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    try:
        # ``with`` avoids touching an unbound file variable during cleanup
        # when the open itself fails.
        with open(self.path, 'rb') as file_pointer:
            process_pdf(rsrcmgr, device, file_pointer)
    except Exception as e:
        logging.error("Error processing PDF: %s", e)
        raise
    finally:
        device.close()

    text = retstr.getvalue()
    retstr.close()

    if (text is None) or (text.strip() == ""):
        logging.info("No text found in PDF. Attempting OCR. This will take a while.")
        # FIXME this should go in a separate method
        # First, convert to image
        import subprocess
        try:
            arglist = ["gs",
                       "-dNOPAUSE",
                       "-sOutputFile=temp/page%03d.png",
                       "-sDEVICE=png16m",
                       "-r72",
                       self.path]
            # subprocess.STDOUT is only meaningful for stderr; passing it as
            # stdout handed the child an invalid descriptor.
            subprocess.call(args=arglist, stderr=subprocess.STDOUT)
        except OSError:
            logging.error("Failed to run GhostScript (using `gs`)")

        # Do OCR
        import time
        time.sleep(1)  # make sure the server has time to write the files
        try:
            from PIL import Image
        except ImportError:
            import Image  # legacy PIL layout
        import pytesseract
        import os

        ocr_parts = []
        for file_ in os.listdir("temp"):
            if file_.endswith(".png"):
                ocr_parts.append(
                    pytesseract.image_to_string(Image.open("temp/" + file_), lang="swe"))
                os.unlink("temp/" + file_)
        text = "".join(ocr_parts)

    self.text = text
    return text
def convert_pdf_to_string(file_path):
    """Return the text of the PDF at ``file_path`` as a string.

    Layout analysis runs with ``all_texts`` enabled.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The content of the buffer ``TextConverter`` wrote into.
    """
    output_string = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    with open(file_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # Pass the configured parameters; a fresh LAParams() was passed
        # before, which silently dropped the all_texts setting.
        device = TextConverter(rsrcmgr, output_string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
    return output_string.getvalue()
def _pdf2text(self, fp):
    """Extract the ASCII text and the creation-date string of a PDF.

    Args:
        fp: Open file object of the PDF; it is read twice, once by
            ``process_pdf`` and once by ``PDFParser``.

    Returns:
        ``(created, text, True)`` on success, where ``created`` is the
        ``CreationDate`` value with its first 2 and last 7 characters
        removed (``''`` when it cannot be decoded), or ``(None, "", False)``
        when processing fails. Failures are reported through
        ``self._report``.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'ascii'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

        process_pdf(rsrcmgr, device, fp)
        device.close()

        # fix the non-utf8 string ...
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')

        # TODO: clean this up, I feel like I'm doing the conversion twice ...
        # http://stackoverflow.com/a/16503222/2154772
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # CreationDate isn't always the same type as it comes back from the
        # PDFParser, so resolve it first when it is not already a string.
        created = ""
        try:
            creation_date = doc.info[0]['CreationDate']
            if not isinstance(creation_date, basestring):
                creation_date = creation_date.resolve()
            # The resolved value used to be stored under a misspelled name
            # and was therefore lost.
            created = creation_date[2:-7]
        except Exception:
            self._report("CreationDate field could not be decoded within PDF, setting to ''")
        created = created.encode('ascii', 'ignore')
        retVal = (created, txt, True)
        retstr.close()
    except Exception as e:
        self._report("Error: \n\t%s" % str(e))
        retVal = (None, "", False)
    # The result tuple was previously built but never handed back.
    return retVal
def _pdf2text(self, fp):
    """Extract the ASCII text and the creation timestamp of a PDF.

    Args:
        fp: Open file object of the PDF; it is read twice, once by
            ``process_pdf`` and once by ``PDFParser``.

    Returns:
        ``(created, text, True)`` on success, where ``created`` is a
        ``datetime`` parsed from the ``CreationDate`` value (first 2 and
        last 7 characters removed, format ``%Y%m%d%H%M%S``), or
        ``(None, "", False)`` when anything fails.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'ascii'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

        process_pdf(rsrcmgr, device, fp)
        device.close()

        # fix the non-utf8 string ...
        result = retstr.getvalue()
        txt = result.encode('ascii', 'ignore')

        # TODO: clean this up, I feel like I'm doing the conversion twice ...
        # http://stackoverflow.com/a/16503222/2154772
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # CreationDate isn't always the same type as it comes back from the
        # PDFParser, so resolve it first when it is not already a string.
        creation_date = doc.info[0]['CreationDate']
        if not isinstance(creation_date, basestring):
            creation_date = creation_date.resolve()
        datestring = creation_date[2:-7]

        ts = strptime(datestring, "%Y%m%d%H%M%S")
        created = datetime.fromtimestamp(mktime(ts))

        retVal = (created, txt, True)
        retstr.close()
    except Exception as e:
        # NOTE(review): a sibling implementation reports through
        # ``self._report``; confirm that ``_reportstr`` exists on this class.
        self._reportstr("Error: \n\t%s" % str(e))
        retVal = (None, "", False)
    # The result tuple was previously built but never handed back.
    return retVal
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    """Return the text of the given PDF pages.

    Args:
        pdfPages: Iterable of pdfminer pages. When omitted, the pages are
            obtained with ``getPdfPages(fileDescriptor)``.
        fileDescriptor: Only used when ``pdfPages`` is ``None``.

    Returns:
        The text collected from all pages, with vertical-text detection and
        ``all_texts`` enabled.
    """
    if fileDescriptor is not None and pdfPages is None:
        pdfPages = getPdfPages(fileDescriptor)

    manager = PDFResourceManager()
    layout = LAParams()
    layout.all_texts = True
    layout.detect_vertical = True

    try:
        text_stream = StringIO.StringIO()
        converter = TextConverter(manager, text_stream, laparams=layout)
        page_interpreter = PDFPageInterpreter(manager, converter)
        for page in pdfPages:
            page_interpreter.process_page(page)
        # The value is captured here, before the finally block closes the stream.
        return text_stream.getvalue()
    finally:
        converter.close()
        text_stream.close()
def read_pdf(fp, password='', *page_numbers):
    """Run every page of a PDF through ``TextAnalyzer`` and return its result.

    Args:
        fp: File object the ``PDFParser`` reads from.
        password: Password supplied to ``PDFDocument``.
        *page_numbers: Accepted but not used by this function.

    Returns:
        Whatever ``device.get_result()`` yields after all pages were
        processed and the device was closed.

    Raises:
        PDFTextExtractionNotAllowed: the document forbids text extraction.
    """
    # The document object stores the structure read through the parser.
    document = PDFDocument(PDFParser(fp), password)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager(caching=True)
    layout = LAParams()
    layout.all_texts = False
    analyzer = TextAnalyzer(resources, sys.stdout, laparams=layout)
    page_interpreter = PDFPageInterpreter(resources, analyzer)

    for page in PDFPage.create_pages(document):
        page_interpreter.process_page(page)

    analyzer.close()
    return analyzer.get_result()
def __init__(self, pdffile):
    """Create the PDF Document object

    Reads a PDF file and turns it into a text string and extracts some
    document info.

    Sets ``scores`` (empty dict), ``pdffile_text`` (extracted text),
    ``info`` (``document.info``) and ``creation_date`` (``datetime``
    parsed from the first info dictionary's ``CreationDate``).

    Args:
        pdffile: Path of the PDF file.

    Raises:
        PDFTextExtractionNotAllowed: the document forbids text extraction.
    """
    self.scores = {}

    laparams = LAParams()
    laparams.all_texts = True
    sio = StringIO()

    try:
        # ``with`` closes the file on every exit path, including the
        # "extraction not allowed" error below.
        with open(pdffile, "rb") as fp:
            # Create a PDF document object that stores the document structure.
            document = PDFDocument(PDFParser(fp))
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, sio, laparams=laparams)
            try:
                # Process each page contained in the document.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
            finally:
                device.close()

            self.pdffile_text = sio.getvalue()
            self.info = document.info
            # Example value: 20190915234815+02'00'. The digits between the
            # first ":" and the first "+" are parsed.
            self.creation_date = datetime.strptime(
                str(self.info[0]["CreationDate"]).split("+")[0].split(":")[1],
                "%Y%m%d%H%M%S",
            )
    finally:
        sio.close()
def pdf(f):
    """Return the text of the PDF at path ``f``.

    Args:
        f: Path of the PDF file.

    Returns:
        The text written by ``TextConverter`` using the ``utf-8`` codec.
    """
    rsrcmgr = PDFResourceManager()
    retstr = cStringIO.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(
        rsrcmgr, retstr, codec=codec, laparams=laparams
    )

    try:
        # The file is closed even if pdfminer raises.
        with open(f, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
    finally:
        device.close()

    text = retstr.getvalue()
    retstr.close()
    return text
def to_text(path):
    """Return the text pdfminer extracts from the PDF at ``path``.

    The default layout parameters are printed before ``all_texts`` is
    enabled (output retained from the original implementation).

    Args:
        path: Path of the PDF file.

    Returns:
        The extracted text as produced by ``TextConverter`` with the
        ``utf-8`` codec.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    print(laparams)
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
    finally:
        device.close()
    text = retstr.getvalue()
    retstr.close()
    return text
def pdfcn():
    """Convert the PDFs that do not have a text file yet.

    Files in ``D:\\dataset\\acl10_12s`` are matched against the files in
    ``D:\\dataset\\acl10_12_txt`` by name without the last four characters;
    every unmatched PDF is converted to ``<name>.txt`` in the text folder.
    A PDF that fails to convert is reported and skipped.
    """
    laparams = LAParams()
    laparams.all_texts = True

    # Restart from where the program previously crashed: look up which PDFs
    # still have to be converted.
    path2 = r'D:\dataset\acl10_12_txt'
    filelist2 = os.listdir(path2)
    path3 = r'D:\dataset\acl10_12s'
    filelist3 = os.listdir(path3)

    filelist4 = [name[:-4] for name in filelist2]
    print(filelist4)
    # Set lookup instead of scanning the list once per PDF.
    already_converted = set(filelist4)
    finallist = [name[:-4] for name in filelist3
                 if name[:-4] not in already_converted]

    codec = 'utf-8'
    for pdf in finallist:
        outfile = "D:\\dataset\\acl10_12_txt\\" + pdf + ".txt"
        fname = path3 + '\\' + pdf + '.pdf'
        try:
            rsrc = PDFResourceManager()
            # Files and device are released even when the conversion fails;
            # the old bare ``except: continue`` skipped the close calls.
            with open(outfile, 'w') as outfp:
                device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
                try:
                    with open(fname, 'rb') as fp:
                        process_pdf(rsrc, device, fp, None, maxpages=0, password='')
                    print('%s finishing ' % pdf)
                finally:
                    device.close()
        except Exception as e:
            # Best effort: report and move on to the next PDF. Unlike a bare
            # except, this lets Ctrl-C stop the run.
            print('%s failed: %s' % (pdf, e))
            continue
def parse_pdf(path):
    """Return the text lines of the PDF at ``path``.

    Pages are processed by ``MyTextConverter``; its ``text_output`` pieces
    are joined, stripped and split on newlines.

    Args:
        path: Path of the PDF file.

    Returns:
        A list of lines, or an empty list when no text was produced.
    """
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    rmngr = PDFResourceManager(caching=True)
    device = MyTextConverter(rmngr, retstr, laparams=laparams, imagewriter=None)
    interpreter = PDFPageInterpreter(rmngr, device)

    # ``with`` closes the PDF even when a page fails to parse.
    with open(path, 'rb') as fd:
        for page in PDFPage.get_pages(fd, set(), check_extractable=True):
            interpreter.process_page(page)

    fulltext = ''.join(device.text_output).strip()
    if not fulltext:
        return []
    return fulltext.split("\n")
def convert_pdf(target_fn):
    ''' Read the pdf file ``target_fn`` and return its text, decoded from UTF-8 '''
    layout = LAParams()
    layout.all_texts = True
    layout.detect_vertical = True

    resources = PDFResourceManager(caching=True)
    text_buffer = StringIO.StringIO()
    converter = TextConverter(resources, text_buffer, codec='utf-8',
                              laparams=layout, imagewriter=None)
    page_interpreter = PDFPageInterpreter(resources, converter)

    with open(target_fn, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            page_interpreter.process_page(page)
    converter.close()

    # Rewind and read everything that was written to the buffer.
    text_buffer.seek(0)
    encoded = text_buffer.read()
    return encoded.decode('utf-8')
def to_text(path):
    """Wrapper around pdfminer.

    Every page is processed twice: once by a ``TextConverter`` for the plain
    text and once by a ``PDFPageAggregator`` whose layout is passed to
    ``content_from_layout``.

    Args:
        path: Path of the PDF file.

    Returns:
        A tuple ``(text, objects)``: the whole text, and a list of
        ``(content_from_layout(layout), page_index)`` pairs with 0-based
        page indices.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    laparams.all_texts = False
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    deviceLayout = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreterLayout = PDFPageInterpreter(rsrcmgr, deviceLayout)

    objects = []
    try:
        # ``with`` closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for page_n, page in enumerate(pages):
                interpreter.process_page(page)
                interpreterLayout.process_page(page)
                layout = deviceLayout.get_result()
                objects.append((content_from_layout(layout), page_n))
    finally:
        device.close()
        deviceLayout.close()

    string = retstr.getvalue()
    retstr.close()
    return string, objects
def main(argv=None):
    """Command-line entry point: convert PDF file(s) into text, HTML, XML or tags.

    Args:
        argv: Argument list for ``argparse`` (``None`` means ``sys.argv``).

    The output type comes from ``-t`` or, when that is absent, from the
    extension of the output file name (``.htm``/``.html``, ``.xml``,
    ``.tag``); anything else produces plain text. Without positional file
    arguments the PDF is read from standard input.
    """
    # sys.stdin is text-mode on Python 3; PDFs need the binary stream.
    stdin_binary = getattr(sys.stdin, 'buffer', sys.stdin)

    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # The default must be a list: the files are iterated below, and
    # iterating a bare stdin object would yield its lines.
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[stdin_binary], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')

    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')

    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # "is not None" so that an explicit 0.0 is honoured.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)

    outtype = args.t
    if not outtype:
        # Guess the type from the output file name.
        out_name = getattr(args.o, 'name', '')
        if out_name.endswith(('.htm', '.html')):
            outtype = 'html'
        elif out_name.endswith('.xml'):
            outtype = 'xml'
        elif out_name.endswith('.tag'):
            outtype = 'tag'

    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c,
                              laparams=laparams, imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                               layoutmode=args.Y, laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c,
                               laparams=laparams, imagewriter=args.O)

    # Page numbers are 1-based on the command line, 0-based for pdfminer.
    pagenos = [i - 1 for i in args.p]
    try:
        for fp in args.file:
            try:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m,
                            password=args.P, caching=args.cache,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
#parser = PDFParser(open_file)
# Create a PDF document object that stores the document structure.
#doc = PDFDocument(parser)
# Connect the parser and document objects.
#print parser.nextline()
#print parser.nextline()
#print parser.nextline()

##ATTEMPT 2
#Code from pdf2txt.py
# Transcribes the first `pagelim` pages read from fp_in into fp_out as
# UTF-8 text, then closes both file objects.
laparams = LAParams()
laparams.char_margin = 2.0
laparams.line_margin = 0.5
laparams.word_margin = 0.1
laparams.all_texts = False

rsrcmgr = PDFResourceManager()
device = TextConverter(rsrcmgr, fp_out, codec='utf-8', laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

pdf_pages = PDFPage.get_pages(fp_in, set())
pagenum = 0
pagelim = 3
try:
    for page in pdf_pages:
        pagenum += 1
        if pagenum > pagelim:
            # Nothing beyond the limit is transcribed, so stop here instead
            # of making pdfminer walk through the rest of the document.
            break
        print("Transcribing page " + str(pagenum) + " from PDF to text")
        interpreter.process_page(page)
finally:
    # Release the converter and both files even if a page fails.
    device.close()
    fp_in.close()
    fp_out.close()
def main(argv):
    """Convert PDF files to shape/text/HTML/XML/tag output.

    Args:
        argv: Kept for interface compatibility. It is not consulted:
            ``parser.parse_known_args()`` is called without arguments.
            NOTE(review): this means options come from ``sys.argv`` -- confirm
            that no caller relies on passing a custom vector.

    Returns:
        None after a normal run, or whatever ``usage()`` returns when the
        output type is not one of the known converters.

    Side effects:
        Rebinds the module globals ``debuglevel`` and ``options``, sets the
        ``debug`` class attribute on several pdfminer classes, and writes the
        converted output to the ``-o`` file or to ``sys.stdout``.
    """
    global debuglevel, options

    pagenos = set()
    imagewriter = None
    laparams = LAParams()

    using_optparse = False
    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)
    if using_optparse:
        # Compatibility shim kept from the original: lets the add_argument
        # calls below drive an optparse parser instead.
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument(
        '-p', '--pages', dest='pagenos', action='store', type=str, default='',
        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store',
                        type=str, default='utf-8',
                        help='Specifies the output codec.')
    parser.add_argument(
        '-t', '--type', dest='outtype', action='store', type=str,
        default='shape', choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag')
    parser.add_argument(
        '-m', dest='maxpages', action='store', type=int, default=0,
        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')
    parser.add_argument(
        '-P', '--password', dest='password', action='store', type=str,
        default='',
        help='Provides the user password to access PDF contents.')
    parser.add_argument(
        '-o', '--output', dest='outfile', action='store', type=str,
        default=None,
        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')
    parser.add_argument(
        '-C', '--no-caching', dest='caching', action='store_false',
        default=True,
        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')
    parser.add_argument('-n', '--no-layout', dest='layout',
                        action='store_false', default=True,
                        help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno',
                        action='store_true', default=False,
                        help='Show page numbers.')
    parser.add_argument(
        '-A', '--analyze-all', dest='all_texts', action='store_true',
        default=False,
        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical',
                        action='store_true', default=False,
                        help='Allows vertical writing detection.')
    parser.add_argument(
        '-M', dest='char_margin', action='store', type=float, default=2.0,
        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
    parser.add_argument(
        '-L', dest='line_margin', action='store', type=float, default=0.5,
        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
    parser.add_argument(
        '-W', dest='word_margin', action='store', type=float, default=0.1,
        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
    parser.add_argument(
        '-F', dest='boxes_flow', action='store', type=float, default=0.5,
        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
    parser.add_argument(
        '-Y', '--layout-mode', dest='layoutmode', action='store', type=str,
        default='normal', choices=['exact', 'normal', 'loose'],
        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter',
                        action='store', type=str, default=None,
                        help='imagewriter')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store',
                        type=int, default=0, help='rotation')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol',
                        action='store_true', default=False,
                        help='stripcontrol')
    parser.add_argument(
        '-s', dest='scale', action='store', type=float, default=1,
        help='Specifies the output scale. Can be used in HTML format only.')
    parser.add_argument(
        '--draw-lines', dest='draw_lines', action='store_true',
        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
    parser.add_argument(
        '--draw-boxes', dest='draw_boxes', action='store_true',
        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")
    parser.add_argument(
        '--draw-blocks', dest='draw_blocks', action='store_true',
        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")
    parser.add_argument(
        '--shear-limit', dest='shear_limit', action='store', default=0.1,
        type=float,
        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")
    parser.add_argument(
        '--rotation-limit', dest='rotation_limit', action='store', default=2,
        type=float,
        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")
    parser.add_argument(
        '--line-height-diff', dest='line_height_diff', action='store',
        type=float, default=0.1,
        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before',
                        action='store', type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after',
                        action='store', type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument(
        '--box-separator', dest='box_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--block-separator', dest='block_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-separator', dest='indent_separator', action='store',
        type=str, default=r'\n\n',
        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-string', dest='indent_string', action='store', type=str,
        default=r'\t',
        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-limit', dest='indent_limit', action='store', type=float,
        default=3,
        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')
    parser.add_argument(
        '--page-separator', dest='page_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--norm-whitespace', dest='norm_whitespace', action='store_true',
        default=False,
        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    parser.add_argument(
        '--print-stats', dest='print_stats', action='store_true',
        default=False,
        help='Instead of the text, output some simple statistics about the file.')
    parser.add_argument(
        '--max-blocks', dest='max_blocks', action='store', default=0,
        type=int,
        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument(
        '--max-textlines', dest='max_textlines', action='store', default=0,
        type=int,
        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument(
        '--line-height-method', dest='line_height_method', action='store',
        type=str, default='bbox', choices=['bbox', 'mean', 'median'],
        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')
    parser.add_argument(dest='pdffile',
                        help='List of PDF files to go through',
                        default=None, nargs='+')

    args, rest = parser.parse_known_args()
    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))
    DEBUG(3, 'optparse:', using_optparse)

    if args.pagenos:
        # Page numbers are 1-based on the command line, 0-based internally.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno

    if not args.layout:
        laparams = None
    else:
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow

    layoutmode = args.layoutmode
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)
    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separators arrive as raw text such as r'\n\n'; turn them into the
    # characters they denote before publishing the options globally.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)
    args.page_separator = unescape_string(args.page_separator)
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')

    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr, outfp, codec=codec,
                                        laparams=laparams,
                                        showpageno=showpageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        for fname in options.pdffile:
            DEBUG(2, 'processing', fname)
            # The context manager closes the input even if a page fails.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    DEBUG(2, 'finished.')
    return
def readPDF2HTML(pdfFile, opts=None):
    """Render a PDF, supplied as a readable file-like object, to HTML.

    Args:
        pdfFile: Object whose ``read()`` returns the complete PDF data.
        opts: Optional pdf2txt-style options, given either as an iterable of
            ``(flag, value)`` pairs or as a dict mapping flag to value.
            Honoured flags: ``-p`` (comma-separated 1-based page numbers),
            ``-m`` (max pages), ``-P`` (password), ``-n`` (no layout
            analysis), ``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F`` (layout
            parameters), ``-Y`` (layout mode), ``-c`` (codec) and ``-s``
            (scale). ``-d``, ``-o``, ``-O`` and ``-t`` are accepted and
            ignored because the result is always returned as HTML.

    Returns:
        The HTML accumulated in the in-memory output buffer.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    if opts is None:
        opts = ()
    elif isinstance(opts, dict):
        # Iterating a dict directly would yield bare keys, not pairs.
        opts = opts.items()

    # Defaults must exist before the option loop touches them.
    password = ''
    pagenos = set()
    maxpages = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    use_layout = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            # Applied after the loop so later layout flags never hit None.
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
        # '-d', '-o', '-O', '-t' have no effect on an in-memory HTML render.
    if not use_layout:
        laparams = None

    # Work on an in-memory copy so the caller's stream position is irrelevant.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()

    # Refuse documents that forbid text extraction.
    parser = PDFParser(fp)
    document = PDFDocument(parser, password=password)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                           layoutmode=layoutmode, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                  password=password):
        interpreter.process_page(page)

    fp.close()
    # NOTE(review): the buffer is read before device.close(), exactly as in
    # the original; anything the converter writes on close is therefore not
    # part of the returned text -- confirm this is intended.
    content = retstr.getvalue()
    device.close()
    retstr.close()
    return content
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; ``argv[1:]`` holds getopt-style options
            followed by one or more PDF file names.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    use_layout = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # Deferred until after the loop: setting laparams to None here
            # made any later -A/-V/-M/-L/-W/-F fail with AttributeError.
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: default to text unless the output name says otherwise.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        for fname in args:
            # The context manager closes the input even if a page fails.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def parsepdf_pdfminer_formal(path, outtype='txt', outfile=None):
    """Convert the PDF at *path* and write the result to a file.

    Args:
        path: Path of the PDF file to read.
        outtype: Output format: ``'txt'`` (or ``'text'``), ``'xml'``,
            ``'html'`` or ``'tag'``. When empty, the format is inferred from
            the extension of *outfile* and falls back to text.
        outfile: Destination path. When omitted, the historical default
            ``C:\\Users\\Administrator\\Desktop\\parseRes_demo.<outtype>`` is
            used.

    Returns:
        None. The converted document is written to *outfile*.

    Raises:
        ValueError: if *outtype* is not a supported format. This is checked
            before any file is created.
    """
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype not in ('txt', 'text', 'xml', 'html', 'tag'):
        # Previously an unknown type created the output file and then failed
        # with a NameError on the never-assigned converter.
        raise ValueError('unsupported output type: %r' % (outtype,))
    if outfile is None:
        outfile = r'C:\Users\Administrator\Desktop\parseRes_demo.' + outtype

    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()
    maxpages = 0
    # output options
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # Context managers close both files even when conversion fails part-way.
    with open(outfile, 'w', encoding=encoding) as outfp:
        if outtype in ('txt', 'text'):
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp)

        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        device.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; ``argv[1:]`` holds getopt-style options
            followed by one or more PDF file names.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    suppress_layout = False
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # Only recorded here. Dropping laparams immediately made any
            # layout option that followed -n raise AttributeError.
            suppress_layout = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if suppress_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: default to text unless the output name says otherwise.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    writes_to_stdout = not outfile
    outfp = sys.stdout if writes_to_stdout else open(outfile, 'w')
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        for fname in args:
            # The context manager closes the input even if a page fails.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if writes_to_stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; ``argv[1:]`` holds getopt-style options
            followed by one or more PDF file names.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.
    """
    # getopt extracts the options supplied on the command line.
    import getopt

    def usage():
        # Prints a usage example; called when arguments are missing or wrong.
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(argument_list, "short options", [long options]).
        # A colon after a short option name means the option requires a
        # value: p, m, P, o, M, L, W, F, Y, O, t, c and s all take one.
        # (For long options an equals sign plays the same role.)
        # The call returns the parsed options and the remaining arguments.
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''        # option -P
    pagenos = set()      # option -p
    maxpages = 0         # option -m
    # output options
    outfile = None       # option -o: output file
    outtype = None       # option -t: output type
    outdir = None        # option -O: output directory
    layoutmode = 'normal'  # option -Y
    codec = 'utf-8'      # option -c
    scale = 1            # option -s
    caching = True
    use_layout = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            # Deferred until after the loop: setting laparams to None here
            # made any later -A/-V/-M/-L/-W/-F fail with AttributeError.
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Determine the output format.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            # TextConverter apparently cannot take an outdir argument.
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        for fname in args:
            # The context manager closes the input even if conversion fails.
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                            password=password, caching=caching,
                            check_extractable=True)
        device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(argv):
    """Convert PDF files, then load three CV fields into the database.

    The first phase is the usual pdf2txt conversion driven by *argv*. The
    second phase reads ``cv.txt``, drops every line containing one of a fixed
    set of section words, writes the rest to ``cv_new.txt``, collects the text
    after ``': '`` on each remaining line, and inserts the first three
    collected values into the ``entries`` table.
    NOTE(review): the second phase always reads ``cv.txt`` regardless of
    ``-o`` -- presumably callers pass ``-o cv.txt``; confirm.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; ``argv[1:]`` holds getopt-style options
            followed by one or more PDF file names.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.

    Raises:
        IndexError: if fewer than three ``': '``-separated values are found.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = b''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    use_layout = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            # Deferred until after the loop: setting laparams to None here
            # made any later -A/-V/-M/-W/-L/-F fail with AttributeError.
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    if not use_layout:
        laparams = None

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()

        for fname in args:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        # sys.stdout must stay open: the status print at the end of this
        # function writes to it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()

    # Drop section-heading lines from the extracted CV text.
    bad_words = [
        'Personal', 'Information', 'Projects', 'Internship', 'Technologies'
    ]
    with open('cv.txt') as oldfile, open('cv_new.txt', 'w') as newfile:
        for line in oldfile:
            if not any(bad_word in line for bad_word in bad_words):
                newfile.write(line)

    # Re-read the filtered text, discarding empty, single-space and
    # form-feed-only lines.
    with open("cv_new.txt", "r") as cleaned:
        lines = [ln for ln in cleaned.read().split('\n')
                 if ln not in ("", " ", "\x0c")]

    # Keep the text following the first ': ' of every line that has one.
    details = []
    for ln in lines:
        parts = ln.split(': ')
        if len(parts) > 1:
            details.append(parts[1])

    sql = "INSERT INTO entries (name, post, exp) VALUES (%s, %s, %s)"
    val = (details[0], details[1], details[2])
    mycursor.execute(sql, val)
    mydb.commit()
    print(mycursor.rowcount, "record inserted.")
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Full argument vector. ``argv[0]`` is the program name shown in
            the usage message; ``argv[1:]`` holds getopt-style options
            followed by one or more PDF file names.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
               '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input options
    password = ''
    pagenos = set()
    maxpages = 0
    # output options
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    use_layout = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            # Deferred until after the loop: setting laparams to None here
            # made any later -A/-D/-M/-L/-W fail with AttributeError.
            use_layout = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-D':
            laparams.writing_mode = v
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not use_layout:
        laparams = None

    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrc = PDFResourceManager()
    if not outtype:
        # No explicit -t: default to text unless the output name says otherwise.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec,
                                   laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec,
                                  laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        elif outtype == 'tag':
            device = TagExtractor(rsrc, outfp, codec=codec)
        else:
            return usage()

        for fname in args:
            # The context manager closes the input even if conversion fails.
            with open(fname, 'rb') as fp:
                process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages,
                            password=password)
        device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def pdf2txt(argv):
    """Convert PDF files using pdf2txt-style options.

    Unlike a command-line ``main``, ``argv`` holds only options and file
    names (no program name at index 0).  ``getopt.GetoptError`` raised for
    a malformed option list is not caught here.  Returns 100 after printing
    a usage message when the output type is not one of text, html, xml or
    tag; returns ``None`` otherwise.
    """
    import getopt

    def usage():
        # The original body referenced usage() without defining it, so an
        # unknown output type ended in a NameError.
        print('usage: pdf2txt [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...')
        return 100

    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout settings are applied after the loop so "-n" may appear before
    # or after the other layout flags without dereferencing None.
    use_layout = True
    layout_attrs = {}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            layout_attrs['all_texts'] = True
        elif k == '-V':
            layout_attrs['detect_vertical'] = True
        elif k == '-M':
            layout_attrs['char_margin'] = float(v)
        elif k == '-L':
            layout_attrs['line_margin'] = float(v)
        elif k == '-W':
            layout_attrs['word_margin'] = float(v)
        elif k == '-F':
            layout_attrs['boxes_flow'] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the caller.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Convert the PDF files named on the command line.

    ``argv`` is the complete argument vector, program name included.
    Returns 100 after printing the usage text when the arguments are
    invalid (unknown option, no input file, unknown output type) and
    ``None`` once every input file has been processed.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
            '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
            '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    # "-n" disables layout analysis; the remaining layout flags are gathered
    # and applied after the loop so their order relative to "-n" cannot
    # trigger an attribute assignment on None.
    use_layout = True
    layout_attrs = {}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            layout_attrs['all_texts'] = True
        elif k == '-D':
            layout_attrs['writing_mode'] = v
        elif k == '-M':
            layout_attrs['char_margin'] = float(v)
        elif k == '-L':
            layout_attrs['line_margin'] = float(v)
        elif k == '-W':
            layout_attrs['word_margin'] = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    # No -t given: default to text, or infer from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Fail early so an invalid type never truncates the output file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrc = PDFResourceManager()
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages,
                                password=password)
        finally:
            device.close()
    finally:
        # Close only a file this function opened, never sys.stdout.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Convert the PDF files named on the command line (Python 3 variant).

    ``argv`` is the complete argument vector, program name included.
    Returns 100 after printing the usage text when the arguments are
    invalid (unknown option, no input file, unknown output type) and
    ``None`` once every input file has been processed.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    # Layout flags are gathered and applied after the loop so that "-n"
    # may precede them without an attribute assignment on None.
    use_layout = True
    layout_attrs = {}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            layout_attrs['all_texts'] = True
        elif k == '-V':
            layout_attrs['detect_vertical'] = True
        elif k == '-M':
            layout_attrs['char_margin'] = float(v)
        elif k == '-W':
            layout_attrs['word_margin'] = float(v)
        elif k == '-L':
            layout_attrs['line_margin'] = float(v)
        elif k == '-F':
            layout_attrs['boxes_flow'] = float(v)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if outfile:
        # Windows keeps its binary-mode file; every other platform gets the
        # text-mode file that was previously reserved for 'linux', so that
        # outfp is always bound (it used to be unbound on e.g. macOS).
        if sys.platform == 'win32':
            outfp = open(outfile, 'wb')
        else:
            outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the caller.
        if outfile:
            outfp.close()
    return
def convert_pdf_To_Txt(path, opts=None):
    """Extract the text of the PDF at *path* and return it.

    The algorithm follows the pdfinterp module documentation.

    ``opts`` carries pdf2txt-style options either as a mapping
    (``{'-p': '1,2'}``) or as an iterable of ``(option, value)`` pairs as
    produced by ``getopt``.  Only text output is supported: any other
    output type (given with ``-t`` or inferred from the ``-o`` file name)
    raises ``ValueError``.  When ``-o`` is given the extracted text is also
    written to that file.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    caching = True
    use_layout = True
    layout_attrs = {}
    # Iterating a dict directly yields its keys, which the original loop then
    # unpacked character by character; normalise to (option, value) pairs.
    if hasattr(opts, 'items'):
        pairs = opts.items()
    else:
        pairs = opts or ()
    for (k, v) in pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            layout_attrs['all_texts'] = True
        elif k == '-V':
            layout_attrs['detect_vertical'] = True
        elif k == '-M':
            layout_attrs['char_margin'] = float(v)
        elif k == '-L':
            layout_attrs['line_margin'] = float(v)
        elif k == '-W':
            layout_attrs['word_margin'] = float(v)
        elif k == '-F':
            layout_attrs['boxes_flow'] = float(v)
        elif k == '-Y':
            pass  # layout mode only matters for HTML output
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            float(v)  # scale only matters for HTML; still validate the value
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Only a TextConverter is ever built below; previously any other type
    # fell through and failed on the unbound name "device".
    if outtype != 'text':
        raise ValueError('unsupported output type: %r' % (outtype,))
    if 'codec' not in locals():
        codec = 'utf-8'
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read the buffer before closing the device.
        txt2Pdf = retstr.getvalue()
    finally:
        device.close()
    if outfile:
        # The -o file used to be opened (and truncated) but never written or
        # closed; write the extracted text to it instead.
        with open(outfile, 'w') as outfp:
            outfp.write(txt2Pdf)
    return txt2Pdf
def ConvertPdf(pdfpath, outfp, opts=None):
    """Convert the PDF at *pdfpath*, writing the result to *outfp*.

    ``outfp`` is a writable file object owned by the caller; it is not
    closed here.  ``opts`` carries pdf2txt-style options either as a
    mapping or as an iterable of ``(option, value)`` pairs.  The output
    type (``-t``) is one of ``txt`` (default), ``xml``, ``html`` or
    ``tag``; any other value raises ``ValueError``.  Returns ``True`` on
    completion.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    use_layout = True
    layout_attrs = {}
    # Iterating a dict directly yields only its keys; normalise both
    # accepted shapes to (option, value) pairs.
    if hasattr(opts, 'items'):
        pairs = opts.items()
    else:
        pairs = opts or ()
    for (k, v) in pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            pass  # accepted for compatibility; output goes to outfp
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            layout_attrs['all_texts'] = True
        elif k == '-V':
            layout_attrs['detect_vertical'] = True
        elif k == '-M':
            layout_attrs['char_margin'] = float(v)
        elif k == '-L':
            layout_attrs['line_margin'] = float(v)
        elif k == '-W':
            layout_attrs['word_margin'] = float(v)
        elif k == '-F':
            layout_attrs['boxes_flow'] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not outtype:
        outtype = 'txt'
    # An unknown type used to leave "device" unbound and fail further down.
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))
    # Applied after the loop so "-n" can be combined with the other layout
    # options in any order.
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    # Honour -C for the resource manager as well as for page parsing.
    rsrcmgr = PDFResourceManager(caching=caching)
    if outtype == 'txt':
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    else:
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    try:
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()
    return True
def main(argv=None):
    """Convert PDF input into text, HTML, XML or tag output (argparse CLI).

    ``argv`` is the list of command-line arguments without the program
    name; ``None`` lets argparse read ``sys.argv``.  When no input file is
    named, standard input is used as the single input.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=sys.stdin, help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None: 0 is a legitimate value (e.g. "-F 0") and
        # must not be mistaken for "option not given".
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    # With no positional argument argparse hands back the bare default
    # object rather than a list; iterating it would walk stdin line by line.
    infiles = args.file
    if not isinstance(infiles, (list, tuple)):
        # Prefer the binary layer of stdin where one exists (Python 3).
        infiles = [getattr(infiles, 'buffer', infiles)]

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        if args.o:
            if args.o.name.endswith('.htm') or args.o.name.endswith('.html'):
                outtype = 'html'
            elif args.o.name.endswith('.xml'):
                outtype = 'xml'
            elif args.o.name.endswith('.tag'):
                outtype = 'tag'
    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c,
                                  laparams=laparams, imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s,
                                   layoutmode=args.Y, laparams=laparams,
                                   imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            device = TextConverter(rsrcmgr, args.o, codec=args.c,
                                   laparams=laparams, imagewriter=args.O)
        # Page numbers are 1-based on the command line, 0-based internally.
        pagenos = set(i - 1 for i in args.p)
        try:
            for fp in infiles:
                try:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=args.m, password=args.P,
                                caching=args.cache, check_extractable=True)
                finally:
                    fp.close()
        finally:
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()
def main(argv):
    """Convert the PDF files named on the command line.

    ``argv`` is the complete argument vector, program name included.
    Returns 100 after printing the usage text when the arguments are
    invalid (unknown option, no input file, unknown output type) and
    ``None`` once every input file has been processed.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
            "[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
            "[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..."
            % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    # Layout flags are gathered and applied after the loop so that "-n"
    # may precede them without an attribute assignment on None.
    use_layout = True
    layout_attrs = {}
    for (k, v) in opts:
        if k == "-d":
            debug += 1
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            use_layout = False
        elif k == "-A":
            layout_attrs["all_texts"] = True
        elif k == "-V":
            layout_attrs["detect_vertical"] = True
        elif k == "-M":
            layout_attrs["char_margin"] = float(v)
        elif k == "-L":
            layout_attrs["line_margin"] = float(v)
        elif k == "-W":
            layout_attrs["word_margin"] = float(v)
        elif k == "-F":
            layout_attrs["boxes_flow"] = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            outdir = v
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith(".htm") or outfile.endswith(".html"):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    # Validate before the output file is created/truncated.
    if outtype not in ("text", "xml", "html", "tag"):
        return usage()
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    # PDFDocument.debug = debug
    # PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if outfile:
        outfp = open(outfile, "w")
    else:
        outfp = sys.stdout
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode,
                laparams=laparams, outdir=outdir
            )
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    process_pdf(
                        rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                        password=password, caching=caching, check_extractable=True
                    )
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the caller.
        if outfile:
            outfp.close()
    return
def _frequent_values(values, min_count):
    """Return the distinct items of *values* that occur more than *min_count* times."""
    from collections import Counter

    return [value for value, count in Counter(values).items() if count > min_count]


def main(files=None):
    """Extract the text of each PDF in *files* into a sibling ``.txt`` file.

    ``files`` is an iterable of paths; when ``None`` the list comes from
    ``get_datafiles()``.  For every file the rows collected by the
    ``TextCon`` device are split into body text, figure captions, table
    data, headers, footers and supporting information, and the sections
    are written to ``<fname minus last 4 chars>.txt``.  Files whose pages
    cannot be processed, or that yield no rows, are skipped.

    Each row is indexed as: 0 page number, 1/3 the x extent, 2/4 the y
    extent, 5 the text (as used by the code below).
    """
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; confirm the structure of the two classification loops.
    if files is None:
        files = get_datafiles()
    # debug option level
    debug = 0
    # input option
    password = ''
    pagenos = set()
    # pagenos.update( int(x)-1 for x in v.split(',') )
    maxpages = 0
    # output option
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    # Layout analysis parameters
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    laparams.line_overlap = 0.3  # Line overlap
    laparams.char_margin = 2.0  # Letter spacing
    laparams.line_margin = 0.5  # Line spacing
    laparams.word_margin = 0.1  # Word spacing
    laparams.boxes_flow = 0.5  # +-1.0 how much hor vs. vertical position
    # matters for line continuation
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    for fname in files:
        fname = str(fname)
        imagedir = os.path.abspath(os.path.join(os.path.dirname(fname), 'img'))
        imagewriter = ImageWriter(imagedir)  # output folder for images
        name = os.path.splitext(os.path.basename(fname))[0]
        print(name)
        outfile = fname[:-4] + '.txt'
        device = TextCon(rsrcmgr, laparams=laparams, imagewriter=imagewriter,
                         imagename=name)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        fp = open(fname, 'rb')
        try:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        except Exception as exc:
            # Best effort: a file that fails to parse is skipped, but Ctrl-C
            # and SystemExit are no longer swallowed and the skip is reported.
            print('Skipping %s: %s' % (fname, exc))
            device.close()
            continue
        finally:
            fp.close()

        rows = [list(row) for row in device.rows]
        if not rows:
            # Nothing extracted; the max()/min() calls below would raise.
            device.close()
            continue
        pages = max([row[0] for row in rows])
        max_y = max([row[4] for row in rows])
        min_y = min([row[2] for row in rows])
        # y values that occur at least once per page on average; fall back
        # to the plain extreme when no value is that frequent.
        common = _frequent_values([int(row[4]) for row in rows], pages - 1)
        max_y2 = max(common) if common else max_y
        common = _frequent_values([int(row[2]) for row in rows], pages - 1)
        min_y2 = min(common) if common else min_y
        print('max_ys:', max_y - max_y2)
        print('min_ys:', min_y - min_y2)

        # Get max and min the hard way because of stupid headers
        common = _frequent_values([int(row[3]) for row in rows], 10)
        if common:
            max_x = max(common)
        else:
            max_x = max([int(row[3]) for row in rows])
        common = _frequent_values([int(row[1]) for row in rows], 10)
        if common:
            min_x = min(common)
        else:
            # Use the same coordinate (row[1]) as the frequent-value path;
            # the fallback previously read row[3].
            min_x = min([int(row[1]) for row in rows])
        # Errors if more pics on one side then other
        # mid_x = (sum([(float(row[1]) + float(row[3]))/2 for row in
        # device.rows])/len(device.rows))
        mid_x = (max_x + min_x) / 2
        # mid_x = 595/2 # center of A4 at 72px/in Letter would be 612/2
        l_height = sum([row[4] - row[2] for row in rows]) / len(rows)
        # print('max_x:', max_x)
        # print('min_x:', min_x)
        # print('mid_x:', mid_x)
        print('l_height:', l_height)

        column2 = []
        lines = []
        pagenumber = 0
        table_caps = ['\n']
        table_data = []
        table = False
        for i, row in enumerate(rows):
            # NOTE(review): for i == 0, rows[i - 1] is the last row.
            l_space = rows[i - 1][2] - row[4]
            if row[0] == pagenumber + 1:
                # New page: flush the second column of the previous page.
                lines += column2
                column2 = []
                pagenumber += 1
            if row[0] == pagenumber:
                if (max_y - min_y) * 0.95 > l_space > 0.8 * l_height:
                    # capture Table (assuming tables will span all columns)
                    if re.match(r"^table", str(row[5]), re.I):
                        table = True
                        table_caps.append(str(row[5]))
                        table_data.append('\n')
                        table_data.append(str(row[5]))
                        table_data.append('\n')
                        continue
                    else:
                        table = False
                # capture table captions multi lines
                elif (table_caps[-1] == str(rows[i - 1][5]) and
                      -2 * l_height < l_space < 0.5 * l_height):
                    table_caps[-1] += str(row[5])
                    table_data[-2] += str(row[5])
                    continue
                if table:
                    # capture table data
                    if int(rows[i - 1][2]) == int(rows[i][2]):
                        table_data[-1] += '\t' + str(row[5])
                        continue
                    else:
                        table_data.append(str(row[5]))
                        continue
                elif int(row[1]) > mid_x and (
                        (int(rows[i - 1][1]) < mid_x and int(rows[i - 1][3]) < mid_x) or
                        (int(rows[i - 1][1]) > mid_x and int(rows[i - 1][3]) > mid_x) or
                        rows[i - 1][3] > max_x * 0.9 or
                        l_space > 2.5 * l_height):
                    # Row belongs to the right-hand column.
                    if len(column2) > 0:
                        if 1 > (row[2] - column2[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(column2[-1][1]):
                                column2[-1][5] = row[5] + " " + column2[-1][5]
                            else:
                                column2[-1][5] = column2[-1][5] + " " + row[5]
                        else:
                            column2.append(row)
                    else:
                        column2.append(row)
                else:
                    if len(lines) > 0:
                        if 1 > (row[2] - lines[-1][2]) > -1:
                            # join if on same line
                            if int(row[1]) < int(lines[-1][1]):
                                lines[-1][5] = row[5] + " " + lines[-1][5]
                            else:
                                lines[-1][5] = lines[-1][5] + " " + row[5]
                        else:
                            lines.append(row)
                    else:
                        lines.append(row)
        # add final column
        lines += column2

        fig_caps = ['\n']
        headers = ['\n']
        footers = ['\n']
        supp_info = ['\n']
        new_lines = []
        supp_re = re.compile(
            r"Corresponding author|Electronic mail|email"
            "|E-mail|^doi|doi:|^keywords|^pacs|^apc", re.I)
        for i, line in enumerate(lines):
            l_space = lines[i - 1][2] - lines[i][4]
            l_space_below = 0
            l_space_2below = 0
            if i + 1 < len(lines):
                l_space_below = lines[i][2] - lines[i + 1][4]
            if i + 2 < len(lines):
                l_space_2below = lines[i + 1][2] - lines[i + 2][4]
            print(l_space, l_space_below, l_space_2below, lines[i][2],
                  lines[i][4], str(line[5]))
            # capture figure captions multi lines
            if (fig_caps[-1] == str(lines[i - 1][5]) and
                    -2 * l_height < l_space < 0.5 * l_height):
                fig_caps.append(str(line[5]))
                continue
            # capture headers (up to two lines)
            if (lines[i][2] > max_y * 0.95 and
                    (l_space_below > 0.5 * l_height or
                     l_space_2below > 0.5 * l_height)):
                headers.append('\n')
                headers.append(str(line[5]))
                if supp_re.search(str(line[5])):
                    headers.append('\n')
                    headers.append(str(line[5]))
                else:
                    continue
            # capture supporting info
            if supp_re.search(str(line[5])):
                print(str(line[5]))
                supp_info.append('\n')
                supp_info.append(str(line[5]))
                continue
            if (max_y - min_y) * 0.95 > l_space > 0.5 * l_height:
                # capture figure captions
                if re.match(r"^fig", str(line[5]), re.I):
                    fig_caps.append('\n')
                    fig_caps.append(str(line[5]))
                    continue
                # capture footers
                elif lines[i][2] < min_y + max_y * 0.015:
                    footers.append('\n')
                    footers.append(str(line[5]))
                    continue
                else:
                    string = str(lines[i - 1][5])
                    # Start a new paragraph unless the previous line was
                    # captured as a figure caption or a header.
                    if not (any(string in s for s in fig_caps) or
                            any(string in s for s in headers)):
                        new_lines.append('\n')
            new_lines.append(str(line[5]))

        with open(outfile, 'w') as f:
            f.write(' '.join(new_lines))
            f.write('\n\nFigures')
            f.write(' '.join(fig_caps))
            f.write('\n\nTables')
            f.write('\n'.join(table_data))
            f.write('\n\nHeaders')
            f.write(' '.join(headers))
            f.write('\n\nFooters')
            f.write(' '.join(footers))
            f.write('\n\nSupporting Info')
            f.write(' '.join(supp_info))
        device.close()
    print('Done')
    return
def main(fname, k, v):
    """Convert the PDF *fname*, applying one pdf2txt-style option.

    ``k`` is the option flag (for example ``'-t'``) and ``v`` its value;
    a flag that matches none of the known options is ignored.  Output goes
    to the file named by ``-o`` or to standard output.  Returns 100 after
    printing a usage message when the output type is unknown, ``None``
    otherwise.
    """
    def usage():
        # The original body referenced usage() without defining it, so an
        # unknown output type ended in a NameError.
        print('usage: main(fname, k, v) with k one of -d -p -m -P -o -C -n -A -V '
              '-M -L -W -F -Y -O -R -S -t -c -s; output type (-t) is text|html|xml|tag')
        return 100

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    use_layout = True
    layout_attrs = {}
    if k == '-d':
        debug += 1
    elif k == '-p':
        pagenos.update(int(x) - 1 for x in v.split(','))
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        use_layout = False
    elif k == '-A':
        layout_attrs['all_texts'] = True
    elif k == '-V':
        layout_attrs['detect_vertical'] = True
    elif k == '-M':
        layout_attrs['char_margin'] = float(v)
    elif k == '-L':
        layout_attrs['line_margin'] = float(v)
    elif k == '-W':
        layout_attrs['word_margin'] = float(v)
    elif k == '-F':
        layout_attrs['boxes_flow'] = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages, password=password,
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout open for the caller.
        if outfile:
            outfp.close()
    return
def main(argv):
    """Batch-convert every PDF matching the given file names or wildcards.

    ``argv`` is the complete argument vector, program name included.  Each
    positional argument is expanded with ``glob``; every match is converted
    to a sibling file whose extension follows the output type (``tag`` by
    default).  ``-o`` does not name the output: it is only consulted to
    infer the type when ``-t`` is given as an empty string.  Returns 100
    after printing the usage text when the arguments are invalid, ``None``
    otherwise.
    """
    import getopt
    import os

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
               ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
               ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
               ' [-t text|html|xml|tag] [-c codec] [-s scale]'
               ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    # Layout flags are gathered and applied after the loop so that "-n"
    # may precede them without an attribute assignment on None.
    use_layout = True
    layout_attrs = {}
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            use_layout = False
        elif k == '-A':
            layout_attrs['all_texts'] = True
        elif k == '-V':
            layout_attrs['detect_vertical'] = True
        elif k == '-M':
            layout_attrs['char_margin'] = float(v)
        elif k == '-L':
            layout_attrs['line_margin'] = float(v)
        elif k == '-W':
            layout_attrs['word_margin'] = float(v)
        elif k == '-F':
            layout_attrs['boxes_flow'] = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    # Check the type up front: the extension lookup used to raise KeyError
    # before the usage() branch could ever be reached.
    if outtype not in extensions:
        return usage()
    laparams = None
    if use_layout:
        laparams = LAParams()
        for (attr, value) in layout_attrs.items():
            setattr(laparams, attr, value)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # The -o file is no longer opened here: it was truncated, never written
    # and its handle was lost once the per-PDF output file replaced it.
    ext = '.' + extensions[outtype]
    for fname in args:
        matches = glob.glob(fname)
        count = len(matches)
        print('Converting ' + str(count) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext copes with extensions that are not exactly 4 chars.
            target = os.path.splitext(pdf)[0] + ext
            print(target)
            with open(target, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                          laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec,
                                           scale=scale, layoutmode=layoutmode,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos,
                                                      maxpages=maxpages,
                                                      password=password,
                                                      caching=caching,
                                                      check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    device.close()
    print('Done')
    return
def main(argv):
    """Command-line driver converting PDF files to text, HTML, XML or tags.

    All input files are rendered through a single converter into one output
    stream (the ``-o`` file, or ``sys.stdout`` when ``-o`` is absent).

    Args:
        argv: full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` when usage was printed (bad option, no file argument, or an
        unknown output type), otherwise ``None``.

    Notes:
        * ``-n`` disables layout analysis regardless of its position relative
          to the layout flags (``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F``).
        * ``sys.stdout`` is left open; only a file opened for ``-o`` is closed.
        * Input files, the converter and the output file are released even
          when conversion raises.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
            " [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
            " [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
            " [-t text|html|xml|tag] [-c codec] [-s scale]"
            " file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # input option
    password = b""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded and applied after the loop so that a later layout flag
    # never touches a None laparams.
    skip_layout = False
    for (k, v) in opts:
        if k == "-d":
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            skip_layout = True
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            imagewriter = ImageWriter(v)
        elif k == "-R":
            rotation = int(v)
        elif k == "-S":
            stripcontrol = True
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    if skip_layout:
        laparams = None

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith((".htm", ".html")):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"

    if outfile:
        outfp = open(outfile, "wb")
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
        # A text-mode stdout does its own encoding; hand the converter str.
        if outfp.encoding is not None:
            codec = None

    try:
        if outtype == "text":
            device = TextConverter(
                rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter
            )
        elif outtype == "xml":
            device = XMLConverter(
                rsrcmgr,
                outfp,
                codec=codec,
                laparams=laparams,
                imagewriter=imagewriter,
                stripcontrol=stripcontrol,
            )
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr,
                outfp,
                codec=codec,
                scale=scale,
                layoutmode=layoutmode,
                laparams=laparams,
                imagewriter=imagewriter,
            )
        elif outtype == "tag":
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()

        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(
                        fp,
                        pagenos,
                        maxpages=maxpages,
                        password=password,
                        caching=caching,
                        check_extractable=True,
                    ):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
def main(argv):
    """Command-line driver feeding PDF files through ``CourseRegisterParser``.

    All input files are processed by one parser device writing to the ``-o``
    file, or to ``sys.stdout`` when ``-o`` is absent.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` when usage was printed (bad option or no file argument),
        otherwise ``None``.

    Notes:
        * ``-n`` disables layout analysis regardless of its position relative
          to ``-A``/``-D``/``-M``/``-L``/``-W``.
        * ``sys.stdout`` is left open; only a file opened for ``-o`` is closed.
        * Input files, the device and the output file are released even when
          processing raises.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
              '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
              '[-c codec] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:c:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    codec = 'utf-8'
    laparams = LAParams()
    # Applied after option parsing so later layout flags never hit None.
    skip_layout = False
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            skip_layout = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-D':
            laparams.writing_mode = v
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-c':
            codec = v
    if skip_layout:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    if outfile:
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        device = CourseRegisterParser(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
def convert_pdf(path, outtype='txt', opts=None):
    """Convert the PDF at *path* to text, XML, HTML or tag output.

    Args:
        path: path of the input PDF.  The default output file name is
            ``path[:-3] + outtype`` (i.e. the three-character extension is
            replaced by the output type).
        outtype: one of ``'txt'``, ``'xml'``, ``'html'`` or ``'tag'``.  When
            empty it defaults to ``'txt'`` and is then inferred from the
            output file extension if that is ``.htm``/``.html``, ``.xml`` or
            ``.tag``.
        opts: optional pdf2txt-style switches, either a mapping
            (``{'-m': '3'}``) or an iterable of ``(flag, value)`` pairs as
            returned by ``getopt``.  ``None`` means no options.

    Returns:
        ``None``.

    Raises:
        ValueError: if the resulting output type is not supported.

    Notes:
        * ``-n`` disables layout analysis regardless of its position relative
          to the layout flags.
        * ``sys.stdout`` (used when ``-o ''`` clears the output name) is left
          open; files opened here are closed even when conversion raises.
    """
    outfile = path[:-3] + outtype
    outdir = '/'.join(path.split('/')[:-1])
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    skip_layout = False

    if opts is None:
        pairs = ()
    elif hasattr(opts, 'items'):
        # Iterating a mapping directly yields its keys, and unpacking a key
        # such as '-d' gives ('-', 'd'); iterate the items instead.
        pairs = opts.items()
    else:
        pairs = opts

    for (k, v) in pairs:
        if k == '-d':
            debug += 1
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-n':
            skip_layout = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-D':
            laparams.writing_mode = v
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if skip_layout:
        laparams = None

    # Propagate the debug level to the pdfminer classes.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    # Validate before creating any file; the previous fallback called an
    # undefined usage() and died with NameError.
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))

    if outfile:
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos,
                            maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
def main(argv):
    """Command-line driver converting PDF files to text, HTML, XML or tags.

    All input files are rendered through a single converter into one text
    stream: the ``-o`` file (opened with the ``-c`` codec, undecodable
    characters ignored) or ``sys.stdout``.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` when usage was printed (bad option, no file argument, or an
        unknown output type), otherwise ``None``.

    Notes:
        * ``-n`` disables layout analysis regardless of its position relative
          to the layout flags (``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F``).
        * Input files, the converter and the ``-o`` file are released even
          when conversion raises or the output type is rejected.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
               % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # Applied after option parsing so later layout flags never hit None.
    skip_layout = False
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            skip_layout = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if skip_layout:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
def main(argv):
    """Parse pdf2txt-style options and convert the given PDF files.

    Output goes to the ``-o`` file (text mode, encoded with the ``-c`` codec,
    unencodable characters ignored) or to ``sys.stdout``; every input file is
    processed by the same converter in argument order.

    Args:
        argv: full argument vector; ``argv[0]`` is the program name.

    Returns:
        ``100`` when usage was printed (bad option, no file argument, or an
        unknown output type), otherwise ``None``.

    Notes:
        * ``-n`` switches layout analysis off no matter where it appears among
          the layout flags (``-A``, ``-V``, ``-M``, ``-L``, ``-W``, ``-F``).
        * Only a file opened for ``-o`` is closed; ``sys.stdout`` is left
          alone.  Cleanup also runs when conversion fails part-way.
    """
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is deferred: setting laparams to None inside the loop would make a
    # following layout flag fail with AttributeError.
    layout_disabled = False
    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_disabled = True
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if layout_disabled:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                                   laparams=laparams, outdir=outdir, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()
        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=password, caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        if close_outfp:
            outfp.close()
def document(self):
    """Return the parsed document, extracting it from the PDF on first use.

    When ``self._document`` is already set it is returned as is.  Otherwise
    the PDF at ``self._pdfDocument`` is read with pdfminer, its text lines
    are merged into paragraphs, and the result is stored in
    ``self._document`` via ``Document().initWithDocumentInfo``.

    Returns:
        The value of ``self._document``.

    Raises:
        pdfminer.pdfparser.PDFTextExtractionNotAllowed: if the PDF does not
            permit text extraction.
    """
    def mergeSameParagraphLines(lines):
        # Concatenate consecutive lines until one looks like a paragraph end.
        def isEndOfParagraph(line):
            # Sentence-final punctuation, or a line shorter than 60 characters.
            return line[-1:] in ['.', '?', '!'] or len(line) < 60

        result = []
        currentLine = ''
        for line in lines:
            currentLine += line
            if isEndOfParagraph(line):
                result.append(currentLine)
                currentLine = ''
        if currentLine != '':
            result.append(currentLine)
        return result

    if not self._document:
        # The with-block guarantees the PDF handle is closed, also on errors.
        with open(self._pdfDocument, 'rb') as pdfFile:
            pdfParser = PDFParser(pdfFile)
            document = PDFDocument()
            pdfParser.set_document(document)
            document.set_parser(pdfParser)
            document.initialize()
            if not document.is_extractable:
                raise pdfminer.pdfparser.PDFTextExtractionNotAllowed
            resourceManger = PDFResourceManager()

            # NOTE(review): this enables pdfminer debug output on the classes
            # themselves (process-wide); looks like a leftover -- confirm.
            debug = 1
            PDFDocument.debug = debug
            PDFParser.debug = debug
            CMapDB.debug = debug
            PDFResourceManager.debug = debug
            PDFPageInterpreter.debug = debug
            PDFDevice.debug = debug

            pdfContent = StringIO()
            try:
                laparams = LAParams()
                laparams.all_texts = True
                laparams.detect_vertical = True
                device = TextConverter(resourceManger, pdfContent,
                                       codec='utf-8', laparams=laparams)
                interpreter = PDFPageInterpreter(resourceManger, device)
                for page in document.get_pages():
                    interpreter.process_page(page)
                content = mergeSameParagraphLines(
                    pdfContent.getvalue().split('\n'))

                # NOTE(review): the outline is collected but not passed on to
                # the Document below -- confirm whether it should be.
                toc = []
                try:
                    for (level, title, destination, a, se) in document.get_outlines():
                        toc.append((level, title))
                except Exception:
                    # Best effort: a missing or unreadable outline is not fatal.
                    # Unlike a bare except, this lets KeyboardInterrupt and
                    # SystemExit propagate.
                    pass
            finally:
                pdfContent.close()

        self._document = Document().initWithDocumentInfo(
            content, None, None)
    return self._document