class Miner:
    """Convert a PDF file to text, HTML or XML with PDFMiner.

    The constructor opens the input and output files and builds the
    converter; :meth:`extract` processes every page and releases all
    resources.
    """

    def __init__(self, pdf_file, txt_file, file_format='txt',
                 layout_analysis=True):
        """Open both files and create the converter device.

        Args:
            pdf_file: Path of the PDF to read.
            txt_file: Path of the output file to write.
            file_format: One of 'txt', 'html' or 'xml'.
            layout_analysis: When true a default ``LAParams()`` is handed to
                the converter, otherwise ``None``.

        Raises:
            ValueError: If ``file_format`` is not a supported format.
        """
        # Reject an unknown format up front: previously self.device was
        # silently left unset and extract() failed later with AttributeError.
        if file_format not in ('txt', 'html', 'xml'):
            raise ValueError(
                "file_format must be 'txt', 'html' or 'xml', got %r"
                % (file_format,))

        laparams = LAParams() if layout_analysis else None
        self.rsrcmgr = PDFResourceManager(caching=True)

        # open() replaces the Python 2-only file() builtin.
        self.pdf_file = open(pdf_file, 'rb')
        try:
            self.outfp = open(txt_file, 'w')
        except Exception:
            self.pdf_file.close()
            raise

        if file_format == 'txt':
            converter_cls = TextConverter
        elif file_format == 'html':
            converter_cls = HTMLConverter
        else:
            converter_cls = XMLConverter
        self.device = converter_cls(self.rsrcmgr, self.outfp, codec='utf-8',
                                    laparams=laparams, imagewriter=None)

    def extract(self):
        """Process every page of the PDF and close all resources."""
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        pagenos = set()
        try:
            for page in PDFPage.get_pages(self.pdf_file, pagenos, maxpages=0,
                                          password=None, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Release everything even when a page fails to process.
            self.pdf_file.close()
            try:
                self.device.close()
            finally:
                self.outfp.close()
def __init__(self, pdf_file, txt_file, file_format='txt',
             layout_analysis=True):
    """Open the input/output files and create the converter device.

    Args:
        pdf_file: Path of the PDF to read.
        txt_file: Path of the output file to write.
        file_format: One of 'txt', 'html' or 'xml'.
        layout_analysis: When true a default ``LAParams()`` is handed to the
            converter, otherwise ``None``.

    Raises:
        ValueError: If ``file_format`` is not a supported format.
    """
    # Fail early: an unknown format used to leave self.device unset.
    if file_format not in ('txt', 'html', 'xml'):
        raise ValueError(
            "file_format must be 'txt', 'html' or 'xml', got %r"
            % (file_format,))

    laparams = LAParams() if layout_analysis else None
    self.rsrcmgr = PDFResourceManager(caching=True)

    # open() replaces the Python 2-only file() builtin.
    self.pdf_file = open(pdf_file, 'rb')
    try:
        self.outfp = open(txt_file, 'w')
    except Exception:
        self.pdf_file.close()
        raise

    if file_format == 'txt':
        converter_cls = TextConverter
    elif file_format == 'html':
        converter_cls = HTMLConverter
    else:
        converter_cls = XMLConverter
    self.device = converter_cls(self.rsrcmgr, self.outfp, codec='utf-8',
                                laparams=laparams, imagewriter=None)
def convert(infile, outfile, rotation=0):
    """Convert the PDF at ``infile`` to PDFMiner XML written to ``outfile``.

    Args:
        infile: Path of the PDF to read.
        outfile: Path of the XML file to create (opened in binary mode).
        rotation: Degrees added to every page's ``rotate`` value, modulo 360.
    """
    debug = 0
    caching = True

    # Propagate the debug level to the PDFMiner classes.
    for component in (PDFDocument, PDFParser, CMapDB, PDFResourceManager,
                      PDFPageInterpreter, PDFDevice):
        component.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # Context managers guarantee both files are closed if a page fails.
    with open(outfile, 'wb') as outfp:
        device = XMLConverter(rsrcmgr, outfp, codec='utf-8',
                              laparams=LAParams())
        try:
            with open(infile, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password='', caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Must run while outfp is still open.
            device.close()
def pdf2xml(path, codec='utf-8', password="", maxpages=0, caching=True):
    """Extract the pages of a PDF with PDFMiner and return them as XML text.

    Args:
        path: Path of the PDF file.
        codec: Encoding used by the converter and to decode its output.
        password: Password passed to ``PDFPage.get_pages``.
        maxpages: Page limit passed to ``PDFPage.get_pages``.
        caching: Caching flag passed to ``PDFPage.get_pages``.

    Returns:
        The XML as a decoded string, terminated by ``</pages>``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    # codec was previously hard-coded to 'utf-8', ignoring the parameter.
    device = XMLConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        # The buffer is read before the device is closed, exactly as before,
        # so the closing tag is added below.
        xml = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()

    # The old test used startswith(), which can never match a closing tag.
    if not xml.rstrip().endswith('</pages>'):
        xml += '\n</pages>'
    return xml
def pdf2xml(filename):
    """Return the PDFMiner XML rendering of the PDF at ``filename``.

    Args:
        filename: Path of the PDF file.

    Returns:
        The content of the output buffer after the converter was closed.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = StringIO.StringIO()
    device = XMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                          imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        try:
            # open() replaces the Python 2-only file() builtin; the with
            # block closes the PDF even when processing fails.
            with open(filename, 'rb') as fp:
                for page in PDFPage.get_pages(fp, None, maxpages=0,
                                              password='', caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return outfp.getvalue()
    finally:
        outfp.close()
def parse_pdf_to_txt(pdf_handle, write_file):
    """Render an already opened PDF as PDFMiner XML into ``write_file``.

    Args:
        pdf_handle: Open PDF file object handed to ``process_pdf``.
        write_file: Path of the text file to write (UTF-8, encoding errors
            ignored).
    """
    encoding = 'utf-8'
    use_cache = True
    page_limit = 0
    page_numbers = set()

    layout = LAParams()
    layout.detect_vertical = True

    # Create the PDF resource manager, which manages shared resources.
    resources = PDFResourceManager(caching=use_cache)

    print("ready to open out file ........")
    with open(write_file, "wt", encoding=encoding, errors='ignore') as sink:
        xml_device = XMLConverter(resources, sink, laparams=layout)
        print("ready to converte pdf to xml ........")
        process_pdf(
            resources,
            xml_device,
            pdf_handle,
            page_numbers,
            maxpages=page_limit,
            password='',
            caching=use_cache,
            check_extractable=True,
        )
        xml_device.close()
def parse_pdfs(pdf_filenames):
    """Convert every PDF in ``pdf_filenames`` to an XML file beside it.

    The output path is the input path with its extension replaced by
    ``.xml``. XML is used because it retains the most information about text
    position (compared to text, html, etc).

    Args:
        pdf_filenames: Iterable of PDF file paths.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)

    for pdf_file in pdf_filenames:
        # A parenthesised single-argument print prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Converting %s to xml." % pdf_file)
        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the device while the output file is still open.
                device.close()
    print("Conversion complete.")
def _get_xml_data(self, sourcefile):
    """Return the XML representation of the PDF at ``sourcefile``.

    Args:
        sourcefile: Path of the PDF file.

    Returns:
        The content of the in-memory ``io.BytesIO`` buffer the converter
        wrote to.
    """
    rm = PDFResourceManager(caching=True,
                            font_correctors=self.font_correctors)
    outfp = io.BytesIO()
    device = XMLConverter(rm, outfp, codec="UTF-8", laparams=LAParams(),
                          imagewriter=None)
    interpreter = PDFPageInterpreter(rm, device)
    try:
        try:
            # The with block closes the PDF even when a page fails.
            with open(sourcefile, "rb") as infile:
                for page in PDFPage.get_pages(infile, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return outfp.getvalue()
    finally:
        outfp.close()
def extract_pdf_page(filename, page_number_or_numbers):
    """Extract selected pages of a PDF as XML using PDFMiner.

    Args:
        filename: Name of the PDF file.
        page_number_or_numbers: A single page number or an iterable of page
            numbers to extract.

    Returns:
        The XML produced by the converter (created with codec 'utf-8').
    """
    # Adapted from pdf2txt.py, which is part of PDFMiner. The command line
    # equivalent is:
    #   pdf2txt.py -p 1 -o expected.xml sample.pdf
    wanted = (page_number_or_numbers
              if is_iterable(page_number_or_numbers)
              else [page_number_or_numbers])

    buf = StringIO.StringIO()
    layout = LAParams()
    resources = PDFResourceManager()
    xml_device = XMLConverter(resources, buf, codec='utf-8', laparams=layout)

    with open(filename, 'rb') as pdf:
        runner = PDFPageInterpreter(resources, xml_device)
        for pdf_page in PDFPage.get_pages(pdf, wanted):
            runner.process_page(pdf_page)
    xml_device.close()

    result = buf.getvalue()
    buf.close()
    return result
def to_xml(infile):
    """Return the PDFMiner XML rendering of an open PDF file object.

    Args:
        infile: Open PDF file object handed to ``PDFPage.get_pages``.

    Returns:
        The content of the output buffer after the converter was closed.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = XMLConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        try:
            for page in PDFPage.get_pages(infile):
                interpreter.process_page(page)
        finally:
            converter.close()
        return output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, so the
        # buffer was never actually closed.
        output.close()
def lerPDF(arquivo):
    """Read a PDF and return its content as PDFMiner XML.

    Args:
        arquivo: The PDF input handed to ``process_pdf``.

    Returns:
        The content of the in-memory buffer the XML converter wrote to.
    """
    manager = PDFResourceManager()
    sink = StringIO()
    device = XMLConverter(manager, sink, laparams=LAParams())

    process_pdf(manager, device, arquivo)
    device.close()

    xml = sink.getvalue()
    sink.close()
    return xml
def getTitle(self, stream):
    """Return the title of the PDF held in ``stream``.

    The title is read from the document information first. When it is
    missing, empty or the placeholder 'untitled', the first page is rendered
    to XML with PDFMiner and handed to ``self._getTitleFromXmlStr``.

    Args:
        stream: Seekable file-like object containing the PDF.

    Returns:
        The title from the document information, or whatever
        ``self._getTitleFromXmlStr`` returns for the first page.
    """
    stream.seek(0)
    info = PdfFileReader(stream).getDocumentInfo()
    # A missing info dictionary or a missing title (None) is also a failure
    # to get the title, so it takes the fallback path as well.
    title = info.title if info is not None else None
    if title and title != 'untitled':
        return title

    # Fall back to extracting the title from the first page's layout.
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter
    from pdfminer.layout import LAParams
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    codec = 'utf-8'
    caching = True
    first_page_only = set([0])

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = StringIO()
    # Convert the page to XML, using StringIO to store the result.
    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                          imagewriter=None, stripcontrol=False)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    stream.seek(0)
    try:
        for page in PDFPage.get_pages(stream, first_page_only, maxpages=0,
                                      password='', caching=caching,
                                      check_extractable=True):
            page.rotate = page.rotate % 360
            interpreter.process_page(page)
    finally:
        device.close()

    outfp.seek(0)
    # NOTE(review): the buffer content is re-encoded before parsing; confirm
    # this is safe for non-ASCII titles on the Python version in use.
    return self._getTitleFromXmlStr(outfp.read().encode(codec))
def extract_pdf_page(filename):
    """Dump the embedded images and the XML layout of a PDF.

    Images are written as PNG files to ``XML_PATH/<stem>/images`` and the
    XML to ``XML_PATH/<stem>/<stem>-<TIME_NOW>.xml``, where ``<stem>`` is the
    input file name without extension.

    Args:
        filename: Path of the PDF file.

    Returns:
        The XML text that was also written to disk.
    """
    # Paths for creating folder and file
    input_file_name = Path(filename).stem
    output_file_folder = Path(XML_PATH, input_file_name)
    output_file_folder.mkdir(parents=True, exist_ok=True)
    output_file_path = Path(output_file_folder,
                            input_file_name + "-" + TIME_NOW + ".xml")
    output_images_path = Path(XML_PATH, input_file_name, "images")
    output_images_path.mkdir(parents=True, exist_ok=True)

    output_file = io.StringIO()
    rsrcmgr = PDFResourceManager()
    device = XMLConverter(rsrcmgr, output_file, laparams=LAParams())

    doc = fitz.open(filename)
    for i in range(len(doc)):
        for img in doc.getPageImageList(i):
            xref = img[0]
            # Both branches use the same name. The CMYK branch used to have
            # two placeholders for three values and raised TypeError.
            png_path = (str(output_images_path) + "//"
                        + "%s-%s-%s.png" % (input_file_name, i, xref))
            pix = fitz.Pixmap(doc, xref)
            if pix.n < 5:  # this is GRAY or RGB
                pix.writePNG(png_path)
            else:  # CMYK: convert to RGB first
                pix1 = fitz.Pixmap(fitz.csRGB, pix)
                pix1.writePNG(png_path)
                pix1 = None
            pix = None

    try:
        with open(filename, 'rb') as fh:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fh, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        device.close()
        xml = output_file.getvalue()
    finally:
        output_file.close()

    with open(output_file_path, 'w', encoding="utf-8") as fd:
        fd.write(xml)
    return xml
def __init__(self, src, limit = None):
    """Load a PDF, convert it to XML and store the cleaned result.

    Args:
        src: Path of the PDF; it must end in ".pdf" and be at least five
            characters long.
        limit: Value passed to ``process_pdf`` as ``maxpages``.

    Raises:
        Exception: If ``src`` is too short or does not end in ".pdf".
    """
    if len(src) < 5 or src[-4:] != ".pdf":
        raise Exception("PDF file has to end in .pdf and has to have a name!")

    pdf_handle = open(src, "rb")
    buf = StringIO()
    resources = PDFResourceManager()
    xml_device = XMLConverter(resources, buf, codec='UTF-8', laparams=None)
    try:
        process_pdf(
            resources,
            xml_device,
            pdf_handle,
            pagenos=None,
            maxpages=limit,
            password='',
            check_extractable=True,
        )
    finally:
        xml_device.close()
        pdf_handle.close()

    raw = buf.getvalue()
    buf.close()
    self.text = self.cleanText(raw)
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Convert a PDF to text, HTML or XML and print a prettified copy.

    Args:
        path: Path of the PDF file.
        format: 'text', 'html' or 'xml'.
        codec: Encoding passed to the converter.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The decoded converter output, read before the converter is closed.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate first so a bad format fails before anything is opened.
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        raise ValueError('provide format, either text, html or xml!')

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = converter_cls(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=password, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read before closing the device, as before, so the returned text is
        # unchanged.
        text = retstr.getvalue().decode()
    finally:
        # These were never closed in the original (the calls were commented
        # out), leaking the file handle and the buffer.
        device.close()
        retstr.close()

    print(bs(text).prettify())
    return text
def convert_pdf(path, format='text', codec='utf-8', password=''):
    """Download a PDF and convert it to text, HTML or XML.

    Args:
        path: URL fetched with ``requests.get``.
        format: 'text', 'html' or 'xml'.
        codec: Encoding passed to the converter.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The decoded converter output, read before the converter is closed.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate before downloading so a bad format costs no network request.
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        raise ValueError('provide format, either text, html or xml!')

    # NOTE(review): the HTTP status is not checked; an error page would be
    # handed to the PDF parser. Confirm whether callers rely on that.
    response = requests.get(path)
    # One in-memory buffer is enough; the original copied the bytes twice.
    fp = io.BytesIO(response.content)

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = converter_cls(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                      password=password, caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
        # Read before closing the device, as before, so the returned text is
        # unchanged.
        text = retstr.getvalue().decode()
    finally:
        fp.close()
        device.close()
        retstr.close()
    return text
def pdf_to_string(path, format='xml', password=''):
    """Convert a PDF to a text, HTML or XML string.

    Args:
        path: Path of the PDF file.
        format: 'text', 'html' or 'xml'.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The converter output decoded as UTF-8, read after the converter was
        closed.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate first so a bad format fails before anything is opened.
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        raise ValueError('provide format, either text, html or xml!')

    rsrcmgr = PDFResourceManager()
    out_stream = BytesIO()
    device = converter_cls(rsrcmgr, out_stream, laparams=LAParams())
    try:
        try:
            # The with block closes the PDF even when a page fails.
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password=password,
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return out_stream.getvalue().decode("utf-8")
    finally:
        out_stream.close()
def convert_pdf(path, outp, format='txt', codec='utf-8', password=''):
    """Convert a PDF and write the result to ``outp + '.' + format``.

    Args:
        path: Path of the PDF file.
        outp: Output path without extension.
        format: 'txt', 'html' or 'xml'; also used as the file extension.
        codec: Encoding passed to the converter.
        password: Password passed to ``PDFPage.get_pages``.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate before creating the output file: the original opened it
    # first, leaving an empty file and an open handle behind on a bad format.
    if format == 'txt':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        # The accepted key is 'txt'; the old message advertised 'text'.
        raise ValueError('provide format, either txt, html or xml!')

    rsrcmgr = PDFResourceManager()
    with open(outp + '.' + format, 'wb') as outf:
        device = converter_cls(rsrcmgr, outf, codec=codec,
                               laparams=LAParams())
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password=password,
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the device while the output file is still open.
            device.close()
def getTitle(self, stream):
    """Return the title of the PDF held in ``stream``.

    The title is read from the document information first. When it is
    missing, empty or the placeholder 'untitled', the first page is rendered
    to XML with PDFMiner and handed to ``self._getTitleFromXmlStr``.

    Args:
        stream: Seekable file-like object containing the PDF.

    Returns:
        The title from the document information, or whatever
        ``self._getTitleFromXmlStr`` returns for the first page.
    """
    stream.seek(0)
    info = PdfFileReader(stream).getDocumentInfo()
    # A missing info dictionary or a missing title (None) is also a failure
    # to get the title, so it takes the fallback path as well.
    title = info.title if info is not None else None
    if title and title != 'untitled':
        return title

    # Fall back to extracting the title from the first page's layout.
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter
    from pdfminer.layout import LAParams
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    codec = 'utf-8'
    caching = True
    first_page_only = set([0])

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = StringIO()
    # Convert the page to XML, using StringIO to store the result.
    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=LAParams(),
                          imagewriter=None, stripcontrol=False)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    stream.seek(0)
    try:
        for page in PDFPage.get_pages(stream, first_page_only, maxpages=0,
                                      password='', caching=caching,
                                      check_extractable=True):
            page.rotate = page.rotate % 360
            interpreter.process_page(page)
    finally:
        device.close()

    outfp.seek(0)
    # NOTE(review): the buffer content is re-encoded before parsing; confirm
    # this is safe for non-ASCII titles on the Python version in use.
    return self._getTitleFromXmlStr(outfp.read().encode(codec))
def pdf_to_xml(pdfpath):
    """Return the PDFMiner XML rendering of the PDF at ``pdfpath``.

    Args:
        pdfpath: Path of the PDF file.

    Returns:
        The complete XML document as held in the output buffer.
    """
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    device = XMLConverter(rsrcmgr, sio, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        try:
            with open(pdfpath, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            # Close the converter BEFORE reading the buffer: PDFMiner's
            # XMLConverter writes its closing tag on close(), so reading
            # first (as the original did) returned unterminated XML.
            device.close()
        return sio.getvalue()
    finally:
        sio.close()
def convert(fname, pages=None):
    """Render a PDF, or a selection of its pages, as PDFMiner XML.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers passed to
            ``PDFPage.get_pages``; all pages are used when it is empty or
            ``None``.

    Returns:
        The content of the in-memory ``BytesIO`` buffer the converter wrote.
    """
    wanted = set(pages) if pages else set()

    sink = BytesIO()
    resources = PDFResourceManager()
    device = XMLConverter(resources, sink, laparams=LAParams())
    runner = PDFPageInterpreter(resources, device)

    pdf = open(fname, "rb")
    for pdf_page in PDFPage.get_pages(pdf, wanted):
        runner.process_page(pdf_page)
    pdf.close()
    device.close()

    result = sink.getvalue()
    sink.close()
    return result
def trasformaPDFinXML(doc, directoryOutfile):
    """Convert a PDF to XML with PDFMiner ("trasforma PDF in XML").

    Only the part of PDFMiner needed to turn a PDF into XML is used.

    Args:
        doc: Path of the PDF. Spaces are removed from it before the file is
            opened.
        directoryOutfile: Directory that receives the output file.

    Returns:
        The output file name: "outfile " followed by the path with "/"
        replaced by "_" and cut at the first ".pdf".

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    password = ""
    doc = doc.replace(" ", "")
    # open() replaces the Python 2-only file() builtin; the with block also
    # closes the PDF when extraction is refused or a page fails.
    with open(doc, 'rb') as fp:
        out_name = doc.replace("/", "_")
        pos = out_name.find(".pdf")
        # find() returns -1 when ".pdf" is absent; slicing with -1 would
        # silently drop the last character of the name.
        if pos != -1:
            out_name = out_name[:pos]
        out_name = "outfile " + out_name

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure,
        # supplying the password for initialization.
        document = PDFDocument(parser, password)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        with open(directoryOutfile + "/" + out_name, "w") as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec="utf-8",
                                  laparams=None, imagewriter=None)
            try:
                # Create a PDF interpreter object and process each page.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
            finally:
                device.close()
    return out_name
def parse_pdfs(pdf_filenames):
    """Convert every PDF in ``pdf_filenames`` to an XML file beside it.

    The output path is the input path with its extension replaced by
    ``.xml``. XML is used because it retains the most information about text
    position (compared to text, html, etc).

    Args:
        pdf_filenames: Iterable of PDF file paths.
    """
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)

    for pdf_file in pdf_filenames:
        # A parenthesised single-argument print prints the same text on
        # Python 2 and is valid syntax on Python 3.
        print("Converting %s to xml." % pdf_file)
        outfile = os.path.splitext(pdf_file)[0] + '.xml'
        with open(pdf_file, 'rb') as fp, open(outfile, 'w') as outfp:
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams, imagewriter=imagewriter)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
            finally:
                # Close the device while the output file is still open.
                device.close()
    print("Conversion complete.")
def convert_pdf_to_xml(path):
    """Return the PDFMiner XML rendering of the PDF at ``path``.

    Args:
        path: Path of the PDF file.

    Returns:
        The complete XML document as held in the output buffer.
    """
    from pdfminer.converter import XMLConverter

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = XMLConverter(rsrcmgr, retstr, codec='utf-8',
                          laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        try:
            # open() replaces the Python 2-only file() builtin.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter BEFORE reading the buffer: PDFMiner's
            # XMLConverter writes its closing tag on close(), so reading
            # first (as the original did) returned unterminated XML.
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
def get_xml_data(self):
    """Write the XML representation of ``self.pdffile`` to ``self.xmlfile``.

    Afterwards ``self.font_metrics`` maps each cached font name to its
    ``bbox`` and ``descent``. Fonts lacking one of the attributes are
    reported by printing ``dir(font)`` and are left out.
    """
    rm = PDFResourceManager(caching=True,
                            font_correctors=self.font_correctors)
    rotation = 0
    # Context managers close both files even when a page fails to process.
    with open(self.xmlfile, "wb") as outfp:
        device = XMLConverter(rm, outfp, codec="UTF-8", laparams=LAParams(),
                              imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(rm, device)
            with open(self.pdffile, "rb") as infile:
                for page in PDFPage.get_pages(infile, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)

            self.font_metrics = {}
            for font in list(rm._cached_fonts.values()):
                try:
                    self.font_metrics[font.fontname] = {
                        "bbox": font.bbox,
                        "descent": font.descent,
                    }
                except AttributeError:
                    print((dir(font)))
        finally:
            # Close the device while the output file is still open.
            device.close()
def convert_xml(inf, outf, page_numbers=None, output_type='xml',
                codec='utf-8', laparams=None, maxpages=0, scale=1.0,
                rotation=0, output_dir=None, strip_control=False,
                debug=False, disable_caching=False):
    """Convert the PDF read from ``inf`` to XML written to ``outf``.

    Args:
        inf: Open PDF file object.
        outf: Output stream the XML converter writes to.
        page_numbers: Page selection passed to ``PDFPage.get_pages``.
        codec: Encoding passed to the converter.
        laparams: Layout parameters; a default ``LAParams()`` is used when
            ``None``.
        maxpages: Page limit passed to ``PDFPage.get_pages``.
        rotation: Degrees added to every page's ``rotate`` value, mod 360.
        output_dir: When set, an ``ImageWriter`` for this directory is
            handed to the converter.
        disable_caching: Turns off caching in the resource manager and page
            iterator.
        output_type, scale, strip_control, debug: Accepted but not used by
            this function.

    Returns:
        The last page processed, or ``None`` when there were no pages.
    """
    # Honour a caller-supplied laparams; it used to be overwritten
    # unconditionally with LAParams().
    if laparams is None:
        laparams = LAParams()
    imagewriter = ImageWriter(output_dir) if output_dir else None
    caching = not disable_caching

    rsrcmgr = PDFResourceManager(caching=caching)
    # codec used to be hard-coded to 'utf-8', ignoring the parameter.
    device = XMLConverter(rsrcmgr, outf, codec=codec, laparams=laparams,
                          imagewriter=imagewriter)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Pre-bind so the return below cannot hit an unbound name when the
    # document yields no pages.
    page = None
    try:
        for page in PDFPage.get_pages(inf, page_numbers, maxpages=maxpages,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        device.close()
    return page
def reinit(self):
    """Build a fresh output buffer, converter device and interpreter.

    The device type is chosen from ``self.format`` ('text', 'html', 'xml'
    or 'filter').

    Returns:
        A dict with the keys 'retstr', 'device' and 'interpreter'.

    Raises:
        ValueError: If ``self.format`` is none of the handled values.
    """
    resources = PDFResourceManager()
    buf = BytesIO()
    layout = LAParams()

    if self.format == 'text':
        dev = TextConverter(resources, buf, codec=self.codec,
                            laparams=layout)
    elif self.format == 'html':
        dev = HTMLConverter(resources, buf, codec=self.codec,
                            laparams=layout)
    elif self.format == 'xml':
        dev = XMLConverter(resources, buf, codec=self.codec,
                           laparams=layout)
    elif self.format == 'filter':
        dev = PDFPageAggregator(resources, laparams=layout)
    else:
        raise ValueError('provide format, either text, html or xml!')

    runner = PDFPageInterpreter(resources, dev)
    return {
        'retstr': buf,
        'device': dev,
        'interpreter': runner,
    }
def run(self):
    """Convert the PDFs in ``self._args`` using the configured output type.

    When the module runs as a script the result goes to ``self._outfile``
    (or standard output); otherwise it is collected in memory.

    Returns:
        The collected output when not run as a script, ``None`` when run as
        a script, or the result of ``usage()`` for an unknown output type.
    """
    as_script = __name__ == '__main__'
    rsrcmgr = PDFResourceManager(caching=self._caching)

    if not self._outtype:
        self._outtype = 'text'
        # As a script, guess the output type from the file extension.
        if as_script and self._outfile:
            if self._outfile.endswith(('.htm', '.html')):
                self._outtype = 'html'
            elif self._outfile.endswith('.xml'):
                self._outtype = 'xml'
            elif self._outfile.endswith('.tag'):
                self._outtype = 'tag'

    close_outfp = False
    if as_script:
        if self._outfile:
            # open() replaces the Python 2-only file() builtin.
            outfp = open(self._outfile, 'w')
            close_outfp = True
        else:
            outfp = sys.stdout
    else:
        from cStringIO import StringIO
        outfp = StringIO()

    if self._outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=self._codec,
                               laparams=self._laparams,
                               imagewriter=self._imagewriter)
    elif self._outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=self._codec,
                              laparams=self._laparams,
                              imagewriter=self._imagewriter)
    elif self._outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=self._codec,
                               scale=self._scale,
                               layoutmode=self._layoutmode,
                               laparams=self._laparams,
                               imagewriter=self._imagewriter)
    elif self._outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=self._codec)
    else:
        if close_outfp:
            outfp.close()
        return usage()

    try:
        for fname in self._args:
            # The with block closes each input even when a page fails.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, self._pagenos,
                                              maxpages=self._maxpages,
                                              password=self._password,
                                              caching=self._caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
    finally:
        device.close()

    if as_script:
        # Only close what this method opened; sys.stdout is left usable.
        if close_outfp:
            outfp.close()
        return None
    return outfp.getvalue()
def getPDFText(pdfFilenamePath):
    """Return the PDFMiner XML rendering of a PDF, or '' when unavailable.

    A message is printed and '' returned when the document cannot be parsed
    or does not allow extraction.

    Args:
        pdfFilenamePath: Path of the PDF file.

    Returns:
        The converter output (converter created with codec 'ascii'), or ''.
    """
    # PDFs are binary: open with 'rb' (the original used text mode 'r') and
    # let the with block close the handle, which was previously leaked.
    with open(pdfFilenamePath, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        try:
            document = PDFDocument(parser)
        except Exception:
            # Deliberate best effort: any parse failure means "unreadable".
            print(pdfFilenamePath, 'is not a readable pdf')
            return ''

        if not document.is_extractable:
            print(pdfFilenamePath,
                  "Warning: could not extract text from pdf file.")
            return ''

        retstr = StringIO()
        rsrcmgr = PDFResourceManager()
        device = XMLConverter(rsrcmgr, retstr, codec='ascii',
                              laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
        finally:
            # The device was never closed before. PDFMiner's XMLConverter
            # writes its closing tag on close(), so the result is now
            # terminated.
            device.close()
        return retstr.getvalue()
def convert_pdf(format='html', codec='utf-8', password='',
                pdf_folder='/home/bichitra/Desktop/project/pdf/',
                file_name='1c1edeee-a13e-4b2e-90be-eb1dd03c3384.pdf'):
    """Convert a PDF, parse the result with BeautifulSoup and save it.

    The parsed document is written to ``<pdf_folder><file_name>.html``.

    Args:
        format: 'text', 'html' or 'xml'.
        codec: Encoding passed to the converter.
        password: Password passed to ``PDFPage.get_pages``.
        pdf_folder: Folder prefix (with trailing separator) of the PDF.
            Defaults to the previously hard-coded location.
        file_name: Name of the PDF inside ``pdf_folder``. Defaults to the
            previously hard-coded file.

    Returns:
        The ``BeautifulSoup`` object built from the converter output.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate first so a bad format fails before anything is opened.
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        raise ValueError('provide format, either text, html or xml!')

    pdf_path = pdf_folder + file_name

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = converter_cls(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        with open(pdf_path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=password, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read before closing the device, as before, so the saved document
        # is unchanged.
        text = retstr.getvalue().decode()
    finally:
        device.close()
        retstr.close()

    soup = BeautifulSoup(text, 'html.parser')
    with open(pdf_path + '.html', 'w+') as htmlfile:
        htmlfile.write(str(soup))
    return soup
def convert(fname, Converter='HTML', pages=None, write=False):
    """Convert a PDF to HTML, text or XML.

    Args:
        fname: Path of the PDF file.
        Converter: 'HTML', 'Text' or 'XML'.
        pages: Optional iterable of 1-based page numbers. Only the listed
            pages are processed; all pages when empty or ``None``.
        write: When true the result is passed to ``writeFile`` together
            with the matching extension instead of being returned.

    Returns:
        The converted text, or ``None`` when ``write`` is true.

    Raises:
        ValueError: If ``Converter`` is not one of the supported values.
    """
    # An unknown converter used to surface later as an unbound local.
    if Converter == 'HTML':
        converter_cls, suffix = HTMLConverter, '.html'
    elif Converter == 'Text':
        converter_cls, suffix = TextConverter, '.txt'
    elif Converter == 'XML':
        converter_cls, suffix = XMLConverter, '.xml'
    else:
        raise ValueError("Converter must be 'HTML', 'Text' or 'XML'")

    # PDFMiner page numbers are 0-based.
    pagenums = set(x - 1 for x in pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = converter_cls(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    try:
        try:
            # open() replaces the Python 2-only file() builtin.
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        text = output.getvalue()
    finally:
        # The original wrote `output.close` without parentheses, so the
        # buffer was never actually closed.
        output.close()

    if write:
        writeFile(fname, text, suffix)
        return None
    return text
def convert_pdf(path, format='html', codec='utf-8', password=''):
    """Convert a PDF to a text, HTML or XML string.

    Example::

        text = convert_pdf('python cv.pdf', 'text')
        with open('python cv.txt', 'w', encoding='utf-8') as f:
            f.write(text)

    Args:
        path: Path of the PDF file.
        format: 'text', 'html' or 'xml'.
        codec: Encoding passed to the converter.
        password: Password passed to ``PDFPage.get_pages``.

    Returns:
        The decoded converter output, read before the converter is closed.

    Raises:
        ValueError: If ``format`` is not one of the supported values.
    """
    # Validate first so a bad format fails before anything is opened.
    if format == 'text':
        converter_cls = TextConverter
    elif format == 'html':
        converter_cls = HTMLConverter
    elif format == 'xml':
        converter_cls = XMLConverter
    else:
        raise ValueError('provide format, either text, html or xml!')

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = converter_cls(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    try:
        # The with block closes the PDF even when a page fails.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                          password=password, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Read before closing the device, as before, so the returned text is
        # unchanged.
        text = retstr.getvalue().decode()
    finally:
        device.close()
        retstr.close()
    return text
def main(argv):
    """Command line entry point: convert PDF files to text/HTML/XML/tags.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name.

    Returns:
        100 after printing the usage text (bad options, no input files or
        an unknown output type), otherwise ``None``.
    """
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()

    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop. Setting laparams to None immediately
    # made any later -A/-V/-M/-L/-W/-F option crash on None.
    layout_enabled = True

    for (k, v) in opts:
        if k == '-d':
            debug = True
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_enabled = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            outdir = v
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    if not layout_enabled:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)

    if not outtype:
        outtype = 'text'
        # Guess the output type from the output file extension.
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   outdir=outdir, debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp)
        else:
            return usage()

        try:
            for fname in args:
                with io.open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Also reached on the usage() path and on errors, where the output
        # file used to be left open.
        if close_outfp:
            outfp.close()
def main(argv):
    """Convert the PDFs named in ``args`` with fixed default options.

    All options are hard-coded here (no command line parsing takes place),
    so the output is plain text on standard output.

    Args:
        argv: Argument vector; not read by this function.

    Returns:
        ``None``, or the result of ``usage()`` for an unknown output type.
    """
    import getopt
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    # Propagate the debug level to the PDFMiner classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'

    if outfile:
        # open() replaces the Python 2-only file() builtin.
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False

    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, codec=codec,
                               laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        # NOTE(review): `usage` is not defined in this function; presumably
        # it is module-level. Confirm.
        if close_outfp:
            outfp.close()
        return usage()

    try:
        # NOTE(review): `args` is not assigned in this function; presumably
        # it is a module-level list of input paths. Confirm.
        for fname in args:
            # The with block closes each input even when a page fails.
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
    finally:
        device.close()
        # Only close what this function opened: closing sys.stdout (as the
        # original always did) breaks every later print in the process.
        if close_outfp:
            outfp.close()
    return
def main(argv):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Options are parsed from ``argv[1:]`` with ``getopt``; the remaining
    arguments are the PDF files to convert.  Output goes to the file given
    with ``-o`` or to ``sys.stdout``.  When ``-t`` is not given, the output
    type is guessed from the output file suffix and defaults to ``text``.

    Args:
        argv: Full argument vector; ``argv[0]`` is used as the program name
            in the usage message.

    Returns:
        ``100`` (after printing the usage message) when the options are
        invalid, no input file is given or the output type is unknown;
        otherwise ``None``.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is recorded here and applied after the loop, so that a layout
    # option following -n no longer fails on ``laparams`` being None.
    layout_analysis = True
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-p':
            # Page numbers are 1-based on the command line.
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-o':
            outfile = v
        elif k == '-C':
            caching = False
        elif k == '-n':
            layout_analysis = False
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-t':
            outtype = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)
    if not layout_analysis:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            # Guess the output type from the output file suffix.
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # open() instead of the Python 2-only file() builtin.
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in args:
                # The with-block closes the input even if a page fails.
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            # Close the device before the stream it writes to.
            device.close()
    finally:
        # Never close the interpreter's stdout; just flush what was written.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return
def main(argv=None):
    """Command-line entry point: convert PDF input to text, HTML, XML or tags.

    Options are parsed with ``argparse``.  Every input file is handed to
    ``process_pdf`` together with a converter device chosen from ``-t`` or,
    when ``-t`` is absent, from the suffix of the output file name (plain
    text is the fallback).

    Args:
        argv: Argument list without the program name; ``None`` lets
            ``argparse`` read ``sys.argv[1:]``.
    """
    # PDF data is binary: prefer the byte-oriented stdin where the
    # interpreter has one.  The default is wrapped in a list because
    # ``args.file`` is iterated as a sequence of open files below.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[stdin], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    laparams = LAParams()
    if args.n:
        laparams = None
    else:
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so that an explicit 0 is not discarded.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # Guess the type from the output name; a stream without a ``name``
        # attribute simply falls through to plain text.
        outname = str(getattr(args.o, 'name', ''))
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'

    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c,
                                  laparams=laparams, imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c,
                                   scale=args.s, layoutmode=args.Y,
                                   laparams=laparams, imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            device = TextConverter(rsrcmgr, args.o, codec=args.c,
                                   laparams=laparams, imagewriter=args.O)
        try:
            # Page numbers are 1-based on the command line.
            pagenos = set(i - 1 for i in args.p)
            for fp in args.file:
                process_pdf(rsrcmgr, device, fp, pagenos,
                            maxpages=args.m, password=args.P,
                            caching=args.cache, check_extractable=True)
        finally:
            # argparse opened every input up front; release them all, but
            # leave the interpreter's stdin alone.
            for fp in args.file:
                if fp is not stdin:
                    fp.close()
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()
# Convert ``fname`` when its name ends in "pdf": run every page through an
# XMLConverter and write the result to ``<fname>.txt``.
# NOTE(review): ``fname`` is not defined in this fragment; presumably it is
# set by enclosing code such as a loop over file names -- confirm.
if fname.endswith("pdf"):
    # Set parameters
    pagenos = set()
    maxpages = 0
    password = ''
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    outfile = fname + '.txt'
    rsrcmgr = PDFResourceManager(caching=caching)
    # open() instead of the Python 2-only file() builtin; the with-blocks
    # guarantee both files are closed even when a page fails to process.
    with open(outfile, 'w') as outfp:
        device = XMLConverter(rsrcmgr, outfp, codec=codec,
                              laparams=laparams, imagewriter=imagewriter)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the device before the output stream it writes to.
            device.close()
def main(argv=None):
    """Command-line entry point: convert PDF input to text, HTML, XML or tags.

    Options are parsed with ``argparse``.  Every input file is handed to
    ``process_pdf`` together with a converter device chosen from ``-t`` or,
    when ``-t`` is absent, from the suffix of the output file name (plain
    text is the fallback).

    Args:
        argv: Argument list without the program name; ``None`` lets
            ``argparse`` read ``sys.argv[1:]``.
    """
    # PDF data is binary: prefer the byte-oriented stdin where the
    # interpreter has one.  The default is wrapped in a list because
    # ``args.file`` is iterated as a sequence of open files below.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'),
                        default=[stdin], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='',
                        help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float,
                         help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float,
                         help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float,
                         help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    laparams = LAParams()
    if args.n:
        laparams = None
    else:
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so that an explicit 0 is not discarded.
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # Guess the type from the output name; a stream without a ``name``
        # attribute simply falls through to plain text.
        outname = str(getattr(args.o, 'name', ''))
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'

    try:
        if outtype == 'xml':
            device = XMLConverter(rsrcmgr, args.o, codec=args.c,
                                  laparams=laparams, imagewriter=args.O)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, args.o, codec=args.c,
                                   scale=args.s, layoutmode=args.Y,
                                   laparams=laparams, imagewriter=args.O)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, args.o, codec=args.c)
        else:
            device = TextConverter(rsrcmgr, args.o, codec=args.c,
                                   laparams=laparams, imagewriter=args.O)
        try:
            # Page numbers are 1-based on the command line.
            pagenos = set(i - 1 for i in args.p)
            for fp in args.file:
                process_pdf(rsrcmgr, device, fp, pagenos,
                            maxpages=args.m, password=args.P,
                            caching=args.cache, check_extractable=True)
        finally:
            # argparse opened every input up front; release them all, but
            # leave the interpreter's stdin alone.
            for fp in args.file:
                if fp is not stdin:
                    fp.close()
            device.close()
    finally:
        if args.o is not sys.stdout:
            args.o.close()