def getPageLayouts(f1):
    '''Collect the layout object of every page of a PDF.

    NOTE(review): the file that is parsed is opened from ``fpath`` and
    unlocked with ``pss_wd``. Neither name is defined in this function, so
    both are presumably module-level globals -- confirm. ``f1`` is rebound
    by the ``with`` statement below, so the argument itself is never read.

    Returns:
        list: one ``device.get_result()`` value per page, in page order.
        Empty when the document does not allow text extraction.

    Raises:
        IOError: if the file cannot be opened or read.
    '''
    # Bind the result before anything can fail so the final ``return`` never
    # hits an unbound local (e.g. when the document is not extractable).
    page_layouts = []
    try:
        # The parser and doc pair form a "pipe" of sorts.
        with open(fpath, 'rb') as f1:
            parser = PDFParser(f1)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)

            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in doc.get_pages():
                    # I *think* we're actually calling on fp here, and not
                    # some stored data; the idea is that .pdf files are "too
                    # big and complicated" to load all at once, so why not
                    # just parse what you need when you need it?
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        # Call-style ``raise`` is valid on both Python 2 and Python 3.
        raise IOError("issue with loading file, please try again")
    finally:
        # Closes whichever object ``f1`` names here: the file opened above
        # (already closed by ``with``) or, if ``open`` failed, the argument.
        f1.close()
    # Returning after (not inside) ``finally`` lets the IOError propagate;
    # a ``return`` inside ``finally`` would silently discard it.
    return page_layouts
Exemple #2
0
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number for one invoice PDF and record them.

    The text of every LTTextBox / LTTextLine on every page is concatenated,
    the client is cut out of that text with ``extract_info``, the invoice
    number is cut out of the file name, and both are handed to
    ``write_excel``.

    NOTE(review): ``invoice_path``, ``client_start``, ``client_end``,
    ``invoice_start``, ``invoice_end``, ``extract_info`` and ``write_excel``
    are not defined here; presumably module-level names -- confirm.

    Args:
        pdfFile: file name of the invoice, relative to ``invoice_path``.
    """
    # Pass the parts separately so os.path.join picks the platform separator
    # (the previous single pre-concatenated argument hard-coded "\\").
    pdf_path = os.path.join(invoice_path, pdfFile)

    text_parts = []
    # ``with`` guarantees the file is closed even if parsing fails.
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize("")

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
 def ParseAllPages(self, filepath):
     """Run every page of the PDF at ``filepath`` through a page interpreter.

     Stores ``filepath`` on ``self.filepath``. The interpreter is attached to
     a plain ``PDFDevice``; nothing is returned.

     Raises:
         PDFTextExtractionNotAllowed: if the document forbids extraction.
     """
     self.filepath = filepath
     # ``with`` closes the file on every exit path, including the raise below.
     with open(filepath, 'rb') as fp:
         # Create a PDF parser object associated with the file object.
         parser = PDFParser(fp)
         # Create a PDF document object that stores the document structure.
         doc = PDFDocument()
         # Connect the parser and document objects.
         parser.set_document(doc)
         doc.set_parser(parser)
         # Supply the password for initialization.
         # (If no password is set, give an empty string.)
         password = ""
         doc.initialize(password)
         # Check if the document allows text extraction. If not, abort.
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         # Create a PDF resource manager object that stores shared resources.
         rsrcmgr = PDFResourceManager()
         # Create a PDF device object.
         device = PDFDevice(rsrcmgr)
         # Create a PDF interpreter object.
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         # Process each page contained in the document.
         for page in doc.get_pages():
             interpreter.process_page(page)
 def WithPdf(self, pdfdoc, password, fn, *args):
     """Open the pdf document, and apply the function, returning the results.

     Args:
         pdfdoc: path of the PDF file to open.
         password: document password; when truthy it also replaces
             ``self.password``. When falsy the existing ``self.password``
             is used.
         fn: callable invoked as ``fn(doc, *args)`` if the document is
             extractable.
         *args: extra positional arguments forwarded to ``fn``.

     Returns:
         The value returned by ``fn``, or None when the document is not
         extractable or an IOError occurred.
     """
     result = None
     try:
         # ``with`` closes the file even when ``fn`` or the parser raises.
         with open(pdfdoc, 'rb') as fp:
             # create a parser object associated with the file object
             parser = PDFParser(fp)
             # create a PDFDocument object that stores the document structure
             doc = PDFDocument()
             # connect the parser and document objects
             parser.set_document(doc)
             doc.set_parser(parser)
             # supply the password for initialization
             if password:
                 self.password = password
             doc.initialize(self.password)

             if doc.is_extractable:
                 # apply the function and return the result
                 result = fn(doc, *args)
     except IOError:
         # Deliberate best-effort: the file doesn't exist or a similar
         # problem occurred, so the caller simply gets None.
         pass
     return result
    def create_pages(self):
        """Parse ``self.pdf_file`` and save one ``Page`` row per parsed page.

        Pages are numbered from 1 in the order ``self._parse_pages`` yields
        them. When the document does not allow extraction no pages are
        created (previously this path failed on an unbound ``doc_pages``).
        """
        from public_project.models import Page
        # create a parser object associated with the file object
        parser = PDFParser(self.pdf_file)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument()
        # connect the parser and document objects
        parser.set_document(doc)
        doc.set_parser(parser)
        # supply the password for initialization
        pdf_pwd = ''
        doc.initialize(pdf_pwd)

        # Default to "no pages" so the loop below is safe for documents
        # that are not extractable.
        doc_pages = []
        if doc.is_extractable:
            # apply the function and return the result
            doc_pages = self._parse_pages(doc)

        for number, doc_page in enumerate(doc_pages, 1):
            page = Page(
                document=self.document,
                number=number,
                content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
            )
            page.save()
Exemple #6
0
    def pdf_function(pdf_doc, password='', *args, **kwargs):
        """Open ``pdf_doc`` and apply ``function`` to the parsed document.

        NOTE(review): ``function`` is not defined in this block; presumably
        it comes from an enclosing decorator scope -- confirm.

        Args:
            pdf_doc: path of the PDF file to open.
            password: document password ('' when there is none).
            *args, **kwargs: forwarded to ``function`` after the document.

        Returns:
            The value returned by ``function``, or None when the document is
            not extractable or an IOError occurred.
        """
        result = None
        try:
            # ``with`` closes the file even when ``function`` raises.
            with open(pdf_doc, 'rb') as fp:
                # create a parser object associated with the file object
                parser = PDFParser(fp)
                # create a PDFDocument object that stores the document structure
                doc = PDFDocument()
                # connect the parser and document objects
                parser.set_document(doc)
                doc.set_parser(parser)
                # supply the password for initialization
                doc.initialize(password)

                if doc.is_extractable:
                    # apply the function and return the result
                    result = function(doc, *args, **kwargs)
        except IOError:
            # Deliberate best-effort: the file doesn't exist or a similar
            # problem occurred, so the caller simply gets None.
            pass
        return result
Exemple #7
0
def pdf_to_csv(filename):
    """Render the PDF at ``filename`` through ``CsvConverter``.

    Each page's output is framed by ``START PAGE n`` / ``END PAGE n`` lines
    (``n`` counted from 0).

    Returns:
        The full contents of the in-memory output buffer.
    """
    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because my test documents are utf-8 (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    # ``with`` / ``finally`` release the file and the device even when a page
    # fails to parse; the device is still closed before the file, as before.
    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    return outfp.getvalue()
    def _GetFromPdf(self,pdf):
        '''Print the text of every LTTextContainer found in the PDF at ``pdf``.

        Reference documentation:
        http://www.unixuser.org/~euske/python/pdfminer/programming.html

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids extraction.
        '''
        # ``with`` closes the file on every exit path, including the raise.
        with open(pdf, 'rb') as fp:
            # Create a PDF parser from the file object.
            parser = PDFParser(fp)
            # Create a PDF document.
            doc = PDFDocument(parser)
            # Connect the parser and the document object.
            parser.set_document(doc)
            # Check whether the document allows text extraction; abort if not.
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Create a PDF resource manager to manage shared resources.
            rsrcmgr = PDFResourceManager()
            # Create a PDF device object.
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # receive the LTPage object for the page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextContainer):
                        # Single-argument call form prints identically on
                        # Python 2 and Python 3.
                        print(x.get_text())
Exemple #9
0
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path`` as produced by TextConverter.

    All pages are processed (``maxpages=0``, empty ``pagenos``) with an
    empty password and ``check_extractable=True``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    try:
        # ``open`` replaces the Python-2-only ``file`` builtin; ``with``
        # closes the handle even when a page fails to parse.
        with open(path, 'rb') as fp:
            # PDFPage.get_pages is handed the raw file object, so no separate
            # PDFParser / PDFDocument pair is built here.
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
Exemple #10
0
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF ``fname`` to ``outfp`` via ``dumpxml``.

    Args:
        outfp: writable output object.
        fname: path of the PDF file.
        objids: object ids to dump (may be empty).
        pagenos: zero-based page numbers to dump (may be empty).
        password: document password.
        dumpall: when true, dump every object with ``dumpallobjs``.
        codec: forwarded to the dump helpers; when falsy, page attributes
            are dumped instead of page content streams.

    When ``objids``, ``pagenos`` and ``dumpall`` are all empty/false the
    trailers are dumped instead. A trailing newline is written unless
    ``codec`` is 'raw' or 'binary'.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; ``with`` closes
    # the handle even when a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
Exemple #11
0
    def parse(self, path):
        """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

        Args:
            path: path of a PDF file. Directories are not supported.

        Returns:
            ``Sample(path, None, text)`` where ``text`` is the TextConverter
            output for all pages.

        Raises:
            NotImplementedError: if ``path`` is a directory.
        """
        # Reject directories before allocating any buffers or devices.
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
        # PDFs are binary: open with 'rb' so no newline translation corrupts
        # the data, and let ``with`` close the handle on every exit path.
        with open(path, 'rb') as fp:
            try:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
            finally:
                device.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
def pdf_to_text(page_object):
    """Return the text of every LTTextBox / LTTextLine in a PDF, in order.

    Args:
        page_object: object handed straight to ``PDFParser``.

    Returns:
        list: one ``get_text()`` string per matching layout element, across
        all pages.
    """
    pdf_parser = PDFParser(page_object)
    # The document object stores the document structure.
    document = PDFDocument(pdf_parser)
    # Wire the parser and the document together.
    pdf_parser.set_document(document)
    document.initialize('')
    # Shared resources live in the resource manager.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    text_content = []
    for page in PDFPage.create_pages(document):
        page_interpreter.process_page(page)
        # The aggregator hands back the layout of the page just processed.
        for element in aggregator.get_result():
            if isinstance(element, (LTTextBox, LTTextLine)):
                text_content.append(element.get_text())
    return text_content
Exemple #13
0
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the info dictionary of a PDF given by path or http(s) URL.

    Args:
        fileOrUrl: local path, or a URL starting with http:// or https://.
        textmode: when true, print one ``source:name=value`` line per info
            entry; otherwise print the info value as a single line.
        prefix: text printed in front of the value in non-text mode.
        basicauth: already-encoded credentials appended to
            ``Authorization: Basic `` for URL downloads.

    NOTE(review): ``args`` is not defined in this function; presumably a
    module-level list of command-line arguments -- confirm.
    """
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            # Release the connection even if the download fails midway.
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Close the source whether or not parsing succeeded.
        fp.close()
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.iteritems():
                print('{0}:{1}={2}'.format(fileOrUrl, name, val))
    else:
        val = doc.info
        # Unwrap a single-element list so one dict prints without brackets.
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
Exemple #14
0
	def getData(self):
		"""Read the trailer ``Info`` dictionary of ``self.fname``.

		Stores the resolved dictionary on ``self.metadata`` and ``self.raw``.

		Returns:
			"error" if the document cannot be parsed or initialised,
			"Empty metadata" if no info dictionary was found,
			"ok" on success, or the caught exception object if reading the
			trailers fails.
		"""
		doc = PDFDocument()
		# ``open`` replaces the Python-2-only ``file`` builtin.
		fp = open(self.fname, 'rb')
		parser = PDFParser(fp)
		try:
			try:
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(self.password)
			except Exception:
				return "error"
		finally:
			# Release the handles on the error path as well.
			parser.close()
			fp.close()

		# Pre-bind so a trailer without an Info entry yields None instead of
		# an unbound-name error.
		info = None
		try:
			for xref in doc.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
				self.metadata = info
				self.raw = info
			if self.raw is None:
				return "Empty metadata"
			return "ok"
		except Exception as e:
			# Report before returning; this line used to sit after the
			# ``return`` and could never run.
			print("\t [x] Error in PDF extractor, Trailer Info")
			return e
Exemple #15
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Feed the text of each page of ``f`` to ``self.parse_page``.

        A header and a footer are emitted through ``self.handler`` around
        the pages. Any exception other than KeyboardInterrupt / SystemExit
        is reported with ``self.handler.print_error`` instead of being
        raised.

        Args:
            f: object handed to ``PDFParser``.
            fpath: path passed to the handler and to ``parse_page``.
        """
        try:
            layout_params = LAParams()
            layout_params.all_texts = True
            resources = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            pdf_parser = PDFParser(f)
            document = PDFDocument(caching=True)
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)

            # Pages are numbered from 1; each page gets a fresh buffer and
            # converter so its text can be read in isolation.
            for page_num, page in enumerate(document.get_pages(), 1):
                page_buffer = StringIO()
                converter = TextConverter(resources, page_buffer, laparams=layout_params)
                PDFPageInterpreter(resources, converter).process_page(page)
                page_text = page_buffer.getvalue()
                self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_num)
                page_buffer.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
Exemple #16
0
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to lay out pages read from ``fh``.

    Args:
        fh: file object handed to ``PDFParser``.

    Returns:
        tuple: ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` is bound to it.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    # NOTE: documents that disallow extraction are deliberately not
    # rejected here; the earlier check was a no-op.
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis. word_margin is 0.0: the former 0.1
    # constructor value was overwritten right after construction.
    laparams = LAParams(line_overlap=0.3, char_margin=1.0, line_margin=0.5,
                        word_margin=0.0, boxes_flow=0.1,
                        detect_vertical=False, all_texts=False)
    # Create a PDF page aggregator object and the interpreter that feeds it.
    # (A plain PDFDevice/interpreter pair used to be created first and then
    # immediately replaced; that dead code is gone.)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
def parse_pdf(pdf_url):
    """Download a PDF and return its text lines grouped by page and text box.

    Args:
        pdf_url: URL passed to ``urllib.request.urlopen``.

    Returns:
        list: one entry per page; each entry is a list with one
        ``splitlines()`` result per non-blank LTTextBox / LTTextLine.
    """
    # ``with`` closes the HTTP response once the body has been read.
    with urllib.request.urlopen(pdf_url) as response:
        remote_file = response.read()
    memory_file = io.BytesIO(remote_file)
    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        page_lines = []
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Fetch the text once; it was previously requested up to
                # three times per element.
                text = lt_obj.get_text()
                if text.strip():
                    page_lines.append(text.splitlines())
        ret.append(page_lines)
    return ret
Exemple #18
0
	def __init__(self, filepath):
		"""Open ``filepath`` and attach an initialised PDFDocument as ``self.doc``.

		The opened file is not closed here.
		"""
		document = PDFDocument()
		self.doc = document  # the underlying pdf document
		pdf_parser = PDFParser(open(filepath, 'rb'))
		pdf_parser.set_document(document)
		document.set_parser(pdf_parser)
		document.initialize()
def getData(fileName):
    """Print selected entries of the trailer ``Info`` dictionary of a PDF.

    For each cross-reference table, the Author, Company, Producer and
    Creator entries that are present are printed.

    Returns:
        "error" if the document cannot be parsed, "Empty metadata" as soon
        as a table is reached with no info dictionary resolved so far, the
        caught exception object if reading the trailers fails, and None
        otherwise.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin.
    fp = open(fileName, 'rb')
    parser = PDFParser(fp)
    try:
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"
    finally:
        # Release the handles on the error path as well.
        parser.close()
        fp.close()

    # Pre-bind so a trailer without an Info entry yields None instead of an
    # unbound-name error.
    info = None
    try:
        for xref in doc.xrefs:
            info_ref = xref.trailer.get('Info')
            if info_ref:
                info = resolve1(info_ref)
            metadata = info
            if metadata is None:
                return "Empty metadata"
            # ``in`` replaces dict.has_key, which Python 3 removed.
            for key in ('Author', 'Company', 'Producer', 'Creator'):
                if key in metadata:
                    print(key + " " + metadata[key])
    except Exception as e:
        print("\t [x] Error in PDF extractor")
        return e
Exemple #20
0
    def get_toc(self):
        """Return the outline titles at the level chosen by ``get_level1``.

        Also sets ``self.title`` from the first info dictionary when it has
        a ``Title`` entry.

        Returns:
            list of normalised outline titles, or None when the outlines
            cannot be read or no level can be selected.
        """
        # ``with`` keeps the file open while the outlines are read and
        # closes it on every exit path.
        with open(self.pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            # title
            if doc.info:
                metadict = doc.info[0]
                if 'Title' in metadict:
                    self.title = normalize_title(metadict['Title'])

            # level 1 of toc
            try:
                select_level = self.get_level1(doc.get_outlines())
            except Exception:
                # Best-effort: no usable outline means no table of contents.
                return None
            return [normalize_toc_item(title)
                    for (level, title, dest, a, se) in doc.get_outlines()
                    if level == select_level]
Exemple #21
0
    def get_metadata(self):
        """Return metadata from both the info field (older PDFs) and XMP
        (newer PDFs).

        Every info dictionary is added; from the XMP packet only the
        ``xap``, ``pdf`` and ``dc`` sections are added when present. The
        result is also stored on ``self.metadata``.

        Returns:
            a .modules.metadata.Metadata object.
        """
        # ``with`` closes the file even when XMP parsing fails.
        with open(self.path, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            metadata = Metadata()
            for i in doc.info:
                metadata.add(i)
            if 'Metadata' in doc.catalog:
                xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmp_dict = xmp_to_dict(xmp_metadata)
                # Let's add only the most useful ones
                for section in ("xap", "pdf"):
                    if section in xmp_dict:
                        metadata.add(xmp_dict[section])
                if "dc" in xmp_dict:
                    metadata.add(xmp_dict["dc"], metadataType="dc")

        self.metadata = metadata
        return metadata
Exemple #22
0
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    ``pageno`` is the zero-based index of the destination page, or None when
    the entry has neither a destination nor a ``/GoTo`` action.

    Args:
        outfp: writable output object.
        fname: path of the PDF file.
        password: document password.
        objids, pagenos, dumpall, codec: accepted but not used by this
            function.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; ``with`` closes
    # the handle even when an outline entry cannot be resolved.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        # Map each page's object id to its zero-based position.
        pages = dict((page.pageid, pageno)
                     for (pageno, page) in enumerate(doc.get_pages()))
        for (level, title, dest, a, se) in doc.get_outlines():
            pageno = None
            if dest:
                dest = resolve1(doc.lookup_name('Dests', dest))
                if isinstance(dest, dict):
                    dest = dest['D']
                pageno = pages[dest[0].objid]
            elif a:
                action = a.resolve()
                if isinstance(action, dict):
                    subtype = action.get('S')
                    if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                        dest = action['D']
                        pageno = pages[dest[0].objid]
            outfp.write(repr((level, title, dest, pageno)) + '\n')
        parser.close()
    return
Exemple #23
0
 def prepare(self):
     """Open ``self.filename`` and attach an initialised PDFDocument.

     The document is stored on ``self.doc`` and the open file on
     ``self.source``; the file is left open.
     """
     document = self.doc = PDFDocument()
     source = self.source = open(self.filename, "rb")
     pdf_parser = PDFParser(source)
     pdf_parser.set_document(document)
     document.set_parser(pdf_parser)
     document.initialize("")
Exemple #24
0
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Interpret selected pages of a PDF and return all of its pages.

    Args:
        rsrcmgr: resource manager handed to the page interpreter.
        device: device handed to the page interpreter.
        fp: file object handed to ``PDFParser``.
        pagenums: zero-based page numbers to process; falsy means all.
        maxpages: stop after the page whose 1-based number reaches this
            value; falsy means no limit.
        password: document password ('' when there is none).

    Returns:
        dict: zero-based page number -> page object, for every page of the
        document (not only the processed ones).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    pages = dict(enumerate(doc.get_pages()))
    # Walk the keys in sorted order: plain dict iteration order is not
    # guaranteed on older interpreters, and the ``maxpages`` cut-off below
    # only makes sense when pages arrive in ascending order. This also
    # avoids the Python-2-only ``iteritems``.
    for num in sorted(pages):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(pages[num])
        if maxpages and maxpages <= num + 1:
            break
    return pages
 def parse (self):
     """Dump every object of ``self.pdf`` into an XML string on ``self.xml``.

     Also sets ``self.bin_blob`` (data found after EOF, when present),
     ``self.errors`` and ``self.bytes_read``. Objects that cannot be found
     are emitted as ``type="malformed"`` and noted via ``self.takenote``;
     objects whose dump raises are emitted as ``type="exception"``.
     """
     # ``open`` replaces the Python-2-only ``file`` builtin; ``with`` closes
     # the handle even when an unexpected error escapes the loop.
     with open(self.pdf, 'rb') as fp:
         parser = PDFParser(fp, dbg=self.debug)
         doc = PDFDocument(parser, dbg=self.debug)
         # extract blob of data after EOF (if it exists)
         if doc.found_eof and doc.eof_distance > 3:
             self.bin_blob = parser.read_from_end(doc.eof_distance)
         res = '<pdf>'
         visited = set()  # keep track of the objects already visited
         for xref in doc.xrefs:
             for objid in xref.get_objids():
                 if objid in visited:
                     continue
                 # NOTE(review): looks like leftover debug output for two
                 # specific object ids -- confirm whether it can be removed.
                 if objid == 21 or objid == 67:
                     print(objid)
                 visited.add(objid)
                 try:
                     obj = doc.getobj(objid)
                     res += '<object id="' + str(objid) + '">\n'
                     res += self.dump(obj)
                     res += '\n</object>\n\n'
                 except PDFObjectNotFound:
                     mal_obj = parser.read_n_from(xref.get_pos(objid)[1], 4096)
                     mal_obj = mal_obj.replace('<', '0x3C')
                     res += '<object id="%d" type="malformed">\n%s\n</object>\n\n' % (objid, mal_obj)
                     self.takenote(self.malformed, 'objects', objid)
                 except Exception as e:
                     # ``message`` only exists on Python 2 exceptions; fall
                     # back to str(e) elsewhere.
                     detail = getattr(e, 'message', str(e))
                     res += '<object id="%d" type="exception">\n%s\n</object>\n\n' % (objid, detail)
     res += self.dumptrailers(doc)
     res += '</pdf>'
     self.xml = res
     self.errors = doc.errors
     self.bytes_read = parser.BYTES
     return
Exemple #26
0
def pdf_isvalid(filelike):
    '''Tell whether ``filelike`` holds a PDF that pdfminer accepts.

    The stream must start with ``PDF_MAGIC`` and must initialise as an
    extractable document. The stream is rewound to offset 0 before a
    parse-based result is returned.

    @param filelike: filelike object, seekable
    @return: True if valid pdf, else False
    '''
    filelike.seek(0)
    header = filelike.read(len(PDF_MAGIC))
    if header != PDF_MAGIC:
        return False
    filelike.seek(0)

    valid = False
    try:
        pdf_parser = PDFParser(filelike)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        if document.is_extractable:
            valid = True
    except PDFException as excobj:
        logging.getLogger().warning(
            "pdf has valid header but, still not valid pdf, exception was %r" % excobj)
        valid = False

    filelike.seek(0)
    return valid
Exemple #27
0
def with_pdf (pdf_doc, pdf_pwd, fn, *args):
    """Open the pdf document, and apply the function, returning the results.

    Args:
        pdf_doc: a path, or an object with a ``read`` attribute that is used
            directly as the file.
        pdf_pwd: accepted but not used by this function.
        fn: callable invoked as ``fn(doc, *args)`` if the document is
            extractable.
        *args: extra positional arguments forwarded to ``fn``.

    Returns:
        The value returned by ``fn``, or None when the document is not
        extractable.

    Raises:
        IOError: propagated unchanged (the file doesn't exist or similar).
    """
    result = None
    # open the pdf file
    if hasattr(pdf_doc, 'read'):
        fp = pdf_doc
    else:
        fp = open(pdf_doc, 'rb')
    try:
        # create a parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument(parser)
        # connect the parser and document objects
        parser.set_document(doc)

        if doc.is_extractable:
            # apply the function and return the result
            result = fn(doc, *args)
    finally:
        # Close on failure too; errors are no longer caught just to be
        # re-raised, they simply propagate.
        fp.close()
    return result
Exemple #28
0
def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at ``path``.

    The document is initialised with an empty password and rendered through
    a ``TextConverter`` into an in-memory buffer.
    """
    resource_manager = PDFResourceManager()
    out_buffer = StringIO()
    converter = TextConverter(resource_manager, out_buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # Render each page of the document into the buffer.
        for page in document.get_pages():
            page_interpreter.process_page(page)
        extracted = out_buffer.getvalue()

    converter.close()
    out_buffer.close()
    return extracted
Exemple #29
0
def extract_text_elements_from_pdf(path, j=nulljob):
    """Opens a PDF and extract every element that is text based (LTText).

    Args:
        path: path of the PDF file.
        j: job object whose ``iter_with_progress`` wraps the page loop.

    Returns:
        tuple: ``(pages, all_elements)`` -- one ``Page(width, height)`` per
        page, and every created element with ``page`` (zero-based page
        number) and ``order`` (position within its page) set.
    """
    pages = []
    all_elements = []
    # ``with`` closes the file once all pages are read, and on errors.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialised so the progress reporter receives a finite list.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
Exemple #30
0
    def load( self, open_file ):
        """Populate ``self.text`` (and annotations) from an open PDF file.

        ``self.fields`` and ``self.text`` are reset first. ``self.text`` maps
        1-based page numbers to the value of ``self._get_text(device)``;
        pages that carry annotations are also passed to
        ``self._build_annotations``.

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids extraction.
        """
        self.fields = {}
        self.text= {}

        # Parser and document are wired to each other, then unlocked with
        # an empty password.
        pdf_parser = PDFParser(open_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        # Abort when the document does not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resources, default layout parameters, aggregator, interpreter.
        resources = PDFResourceManager()
        device = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, device)

        # Page numbers stored in self.text start at 1.
        for page_number, page in enumerate(document.get_pages(), 1):
            page_interpreter.process_page(page)
            if page.annots:
                self._build_annotations( page )
            self.text[page_number] = self._get_text( device )
Exemple #31
0
    def parse_document(self):
        """Lay out every page of ``self.pdf`` and collect objects and boxes.

        Resets ``self.res``, ``self.media_boxes`` and ``self.n``; for each
        page records the crop-box and media-box coordinates under the
        zero-based page index and extends ``self.res`` via
        ``self.get_objects``.

        Returns:
            tuple ``(self.res, self.media_boxes)``, or None when the
            document does not allow extraction.
        """
        self.res = []  # result set
        self.media_boxes = dict()  # media coordinate dictionary
        self.n = 0  # page count

        # Build the layout parameters once instead of creating a default
        # object that may be thrown away immediately.
        if constants.USE_CUSTOM_PDF_PARAMETERS:
            la_params = LAParams(detect_vertical=constants.DEFAULT_DETECT_VERTICAL,
                                 line_overlap=constants.DEFAULT_LINE_OVERLAP,
                                 line_margin=constants.DEFAULT_LINE_MARGIN,
                                 word_margin=constants.DEFAULT_WORD_MARGIN,
                                 char_margin=constants.DEFAULT_CHAR_MARGIN,
                                 boxes_flow=constants.DEFAULT_BOXES_FLOW)
        else:
            la_params = LAParams(detect_vertical=True)

        # ``with`` closes the file on every exit path.
        with open(self.pdf, "rb") as pdf:
            pdf_parser = PDFParser(pdf)
            pdf_document = PDFDocument(pdf_parser)

            if not pdf_document.is_extractable:
                # Unchanged contract: nothing is returned in this case.
                return None

            resource_manager = PDFResourceManager()
            page_aggregator = PDFPageAggregator(resource_manager,
                                                laparams=la_params)
            page_interpreter = PDFPageInterpreter(resource_manager,
                                                  page_aggregator)

            for page in PDFPage.create_pages(pdf_document):
                page_interpreter.process_page(page)
                layout = page_aggregator.get_result()
                crop_box = page.cropbox
                page_box = page.mediabox
                self.media_boxes[self.n] = {"x0": crop_box[0], "y0": crop_box[1],
                                            "x1": crop_box[2], "y1": crop_box[3],
                                            "x0page": page_box[0], "y0page": page_box[1],
                                            "x1page": page_box[2], "y1page": page_box[3]}
                self.box_id = -1
                self.res = self.get_objects(layout._objs, self.res, self.n, self.media_boxes)
                self.n += 1

        return self.res, self.media_boxes
Exemple #32
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If none is found, it'll send the images through OCR.

    Args:
        path: path of the PDF file.
        languages: forwarded to ``_extract_image_page`` for OCR.

    Returns:
        dict: ``'pages'`` holds one text entry per page; entries of the last
        info dictionary are added under their lower-cased, stripped keys
        (except ``pages``, None values and unresolved object references).
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            doc = PDFDocument(parser, '')

            result = {'pages': []}
            if doc.info:
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    if k != 'pages' and v is not None and '<PDFObjRef:' not in v:
                        # NOTE(review): string_value is applied a second
                        # time here, as before -- confirm it is idempotent
                        # and drop the extra call.
                        result[k] = string_value(v)

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    log.warning("Failed to parse PDF page: %r", ex)

                # Fall back to OCR when the page yielded (almost) no text.
                if text is None or len(text) < 3:
                    log.info("OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
        finally:
            # Close the device even when parsing or OCR raises.
            device.close()
        return result
def get_total(filename):
    """Return the largest integer found at or after the subtotal marker.

    Scanning starts at the first horizontal text box containing
    "Subtotal for all regions". From that box onwards, the first line of each
    text box is stripped of spaces and parsed as an integer; the maximum of
    the values that parse is returned.

    Returns -1 when no such integer is found.
    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    temp_total = -1

    # The context manager closes the PDF even when extraction is refused or
    # a page fails to parse.
    with open(filename, 'rb') as pdf_file:
        document = PDFDocument(PDFParser(pdf_file))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        check_total = False
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for x in device.get_result():
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = str(x.get_text())
                if "Subtotal for all regions" in results:
                    check_total = True
                if not check_total:
                    continue

                # First line of the box; a box without a newline has no
                # complete line and is skipped instead of crashing.
                match = re.search(r'(.*)\n', results, re.M | re.I)
                if match is None:
                    continue
                candidate = match.group(1).replace(" ", "").replace("\\n", "")
                try:
                    temp_num = int(candidate)
                except ValueError:
                    continue
                if temp_num > temp_total:
                    temp_total = temp_num

    return temp_total
Exemple #34
0
def extractembedded(outfp,
                    fname,
                    objids,
                    pagenos,
                    password='',
                    dumpall=False,
                    codec=None,
                    extractdir=None):
    """Write every file embedded in the PDF ``fname`` into ``extractdir``.

    The cross-reference tables are walked and each Filespec dictionary is
    extracted. ``outfp``, ``objids``, ``pagenos``, ``dumpall`` and ``codec``
    are accepted but not used by this function.

    Raises PDFValueError when a file reference is not an embedded-file stream
    and IOError when the target file already exists.
    """
    def extract1(obj):
        # 'UF' may be absent from a Filespec; fall back to 'F' instead of
        # raising KeyError on the lookup.
        filename = os.path.basename(obj.get('UF') or obj['F'])
        fileref = obj['EF']['F']
        fileobj = doc.getobj(fileref.objid)
        if not isinstance(fileobj, PDFStream):
            raise PDFValueError(
                'unable to process PDF: reference for %r is not a PDFStream' %
                (filename))
        if fileobj.get('Type') is not LITERAL_EMBEDDEDFILE:
            raise PDFValueError(
                'unable to process PDF: reference for %r is not an EmbeddedFile'
                % (filename))
        path = os.path.join(extractdir, filename)
        if os.path.exists(path):
            raise IOError('file exists: %r' % path)
        sys.stderr.write('extracting: %r\n' % path)
        with open(path, 'wb') as out:
            out.write(fileobj.get_data())
        return

    # Keep the source PDF open only while its objects are being read.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser, password)
        for xref in doc.xrefs:
            for objid in xref.get_objids():
                obj = doc.getobj(objid)
                if isinstance(obj, dict) and obj.get('Type') is LITERAL_FILESPEC:
                    extract1(obj)
    return
Exemple #35
0
def pdfminer(f):
    """Render the first page of the PDF at path ``f`` with ``HTMLConverter``.

    Only the first page is processed. The converter, after ``add_features()``
    has been called on it, is returned.

    Raises PDFTextExtractionNotAllowed when the document forbids extraction.
    """
    # NOTE(review): the handle is never closed; the returned converter keeps
    # a reference to the page, which may still read from it -- confirm before
    # adding a close().
    pdf_handle = open(f, 'rb')
    document = PDFDocument(PDFParser(pdf_handle))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources,
                                   laparams=LAParams(all_texts=True))
    interpreter = PDFPageInterpreter(resources, aggregator)
    converter = HTMLConverter(os.path.basename(f))

    # Take the first page only; later pages are never touched.
    first_page = next(iter(PDFPage.create_pages(document)), None)
    if first_page is not None:
        interpreter.process_page(first_page)
        layout = aggregator.get_result()
        converter.current_page = first_page
        converter.render(layout)

    converter.add_features()
    return converter
Exemple #36
0
    def generateFileContent(self):
        """Download the abbreviations PDF and build the dictionary text.

        Every grouped text line containing a colon contributes the part
        before the first colon (stripped, with ".." collapsed to ".") as an
        entry. The result is a comment header followed by the strings that
        ``formatEntriesForDictionary`` yields for those entries.
        """

        import tempfile
        import urllib

        abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"
        temporaryFile = tempfile.NamedTemporaryFile()
        entries = set()
        try:
            urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)

            # All page data is consumed inside the loop, so the handle can
            # be released as soon as parsing finishes.
            with open(temporaryFile.name, "rb") as fileObject:
                parser = PDFParser(fileObject)
                document = PDFDocument(parser)
                resourceManager = PDFResourceManager()
                device = PDFPageAggregator(resourceManager)
                interpreter = PDFPageInterpreter(resourceManager, device)
                params = LAParams()  # same parameters for every page
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    # Rectangles and curves carry no text.
                    objects = [
                        item for item in layout
                        if not isinstance(item, LTRect)
                        and not isinstance(item, LTCurve)
                    ]
                    for line in layout.group_objects(params, objects):
                        text = line.get_text()
                        if u":" in text:
                            entry = text.split(u":")[0]
                            entry = entry.strip()
                            entry = entry.replace(u"..", ".")
                            entries.add(entry)
        finally:
            # Closing the NamedTemporaryFile also deletes it.
            temporaryFile.close()

        parts = [
            u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n",
            u"# http://www.realacademiagalega.org/abreviaturas\n",
            u"\n",
        ]
        parts.extend(formatEntriesForDictionary(entries, u"abreviatura"))
        return u"".join(parts)
Exemple #37
0
def ParsePDF():
    """Copy the text of the first page of ``pdfpath`` into ``a.docx``.

    Each layout object of the first page that exposes ``get_text`` becomes
    one paragraph (non-breaking spaces and newlines removed) added to the
    module-level ``document``, which is then saved as "a.docx".

    Returns 1. Raises PDFTextExtractionNotAllowed when the PDF forbids text
    extraction.
    """
    # Open in binary read mode; the context manager closes the handle even
    # when extraction is refused or a page fails to parse.
    with open(pdfpath, 'rb') as pdf_file:
        # Create a PDF parser bound to the file object.
        parser = PDFParser(pdf_file)
        # The document object stores the file structure; an empty password
        # is supplied for initialisation.
        doc = PDFDocument(parser, password='')
        # Check whether the file allows text extraction.
        if not doc.is_extractable:
            print("Not Allowd Extractable")
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources; caching is disabled.
        rsrcmgr = PDFResourceManager(caching=False)
        laparams = LAParams()
        # Page aggregator device that collects layout objects.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # layout is the LTPage holding the objects parsed from this page.
            layout = device.get_result()
            for item in layout:
                if hasattr(item, "get_text"):
                    content = item.get_text().replace(u'\xa0', u'').replace('\n', '')
                    document.add_paragraph(content, style=None)
            break  # only the first page is converted
    document.save("a.docx")
    return 1
    def parse_question_file(question_file_path):
        """Collect the non-empty horizontal text boxes of a PDF.

        Returns a list with one string per text box, stripped of surrounding
        whitespace and terminated by a single newline.

        Raises PDFTextExtractionNotAllowed when the PDF forbids extraction.
        """
        collected = []
        with open(question_file_path, 'rb') as question_file:
            document = PDFDocument(PDFParser(question_file))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            resources = PDFResourceManager()
            aggregator = PDFPageAggregator(resources, laparams=LAParams())
            interpreter = PDFPageInterpreter(resources, aggregator)

            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                for element in aggregator.get_result():
                    if not isinstance(element, LTTextBoxHorizontal):
                        continue
                    stripped = element.get_text().decode().strip()
                    if stripped:
                        collected.append(stripped + '\n')
        return collected
Exemple #39
0
def get_blurb():
    """Pick a random page of a random PDF under ``/pdfs`` and return a blurb.

    Returns a ``(link, text)`` tuple: ``link`` is the PDF path with 'pdfs'
    replaced by 'view' plus a ``#page=<index>`` fragment, and ``text`` is the
    first 100 characters extracted from that page. Returns ``('', '')`` when
    no PDF is available.
    """
    pdfs = glob.glob('/pdfs/*')
    if not pdfs:
        sys.stderr.write('NO PDFS\n')
        return '', ''
    pdf = random.choice(pdfs)
    sys.stderr.write('pdf: %s\n' % pdf)
    with open(pdf, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        assert document.is_extractable
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr,
                               retstr,
                               codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(PDFPage.get_pages(f))
        # randrange excludes the upper bound; randint(0, len(pages)) could
        # return len(pages) and index one past the end of the list.
        pnum = random.randrange(len(pages))
        interpreter.process_page(pages[pnum])
        txt = retstr.getvalue()
    # NOTE(review): pnum is a zero-based index; confirm the viewer's
    # '#page=' fragment expects that rather than a one-based page number.
    return pdf.replace('pdfs', 'view') + '#page=' + str(pnum), txt[:100]
Exemple #40
0
def extract_text(in_path, out_path):
    """Convert every ``*.pdf`` matching ``in_path + '*.pdf'`` to a text file.

    For each PDF the extracted text is written to
    ``out_path + <basename without extension> + '.txt'``. A percentage
    progress indicator is printed on a single line.

    Approach taken from
    https://towardsdatascience.com/pdf-text-extraction-in-python-5b6ab9e92dd
    """
    files = glob.glob(in_path + '*.pdf')
    total = len(files)

    for i, file_path in enumerate(files):
        print(str(i / total * 100)[:4] + "%", end="\r")
        output_string = StringIO()

        # glob already returns paths that include in_path; prefixing it a
        # second time would point at a file that does not exist.
        with open(file_path, 'rb') as infile:
            parser = PDFParser(infile)
            doc = PDFDocument(parser)
            rsrcmgr = PDFResourceManager()
            device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)

        # Swap only the extension; a plain replace("pdf", "txt") would also
        # rewrite "pdf" occurring elsewhere in the file name.
        stem = os.path.splitext(os.path.basename(file_path))[0]
        out_filename = out_path + stem + ".txt"

        with open(out_filename, 'w') as outfile:
            outfile.write(output_string.getvalue())
Exemple #41
0
def extract_pages(fp, start=None, end=None):
    """Yield the LTPage layout of each selected page of an open PDF.

    Slightly modified from:
    https://euske.github.io/pdfminer/programming.html

    :param fp: binary file object positioned on a PDF
    :param start: zero-based index of the first page to yield; ``None``
        means from the first page
    :param end: zero-based index at which to stop (exclusive); ``None``
        means through the last page
    :raises PDFTextExtractionNotAllowed: when the document forbids text
        extraction (raised on first iteration, as this is a generator)
    """
    laparams = LAParams()

    parser = PDFParser(fp)
    document = PDFDocument(parser)

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    manager = PDFResourceManager()
    device = PDFPageAggregator(manager, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, device)

    for i, page in enumerate(PDFPage.create_pages(document)):
        # Each bound is checked on its own: the former combined condition
        # evaluated `i >= end` even when end was None.
        if start is not None and i < start:
            continue
        if end is not None and i >= end:
            break  # page indices only grow, nothing further can match

        interpreter.process_page(page)
        yield device.get_result()
Exemple #42
0
def get_problem_page(problem, pdf_path):
    """
    Return the PDF page on which a problem's widget is located.

    Parameters
    ----------
    problem : Problem
        Problem whose ``widget.page`` gives the zero-based page index
    pdf_path : str
        Path to the PDF file of the exam for this problem

    Returns
    -------
    page : PDFPage
        The page at index ``problem.widget.page``

    Raises
    ------
    StopIteration
        If the document has no page at that index
    """
    # NOTE(review): the handle stays open on return; the returned page may
    # still read from it -- confirm before closing it here.
    pdf_handle = open(pdf_path, 'rb')
    document = PDFDocument(PDFParser(pdf_handle))

    index = problem.widget.page
    all_pages = PDFPage.create_pages(document)
    # Skip `index` pages, then take exactly one.
    wanted = itertools.islice(all_pages, index, index + 1)
    return next(wanted)
Exemple #43
0
def parse(Path):
    """Return (and print) the text of every horizontal text box in a PDF.

    ``Path`` is handed straight to ``PDFParser``.
    NOTE(review): despite the name this looks like an open binary file
    object rather than a path -- confirm against callers.

    Raises PDFTextExtractionNotAllowed when the PDF forbids extraction.
    """
    document = PDFDocument(PDFParser(Path))

    # Bail out early when the PDF cannot be parsed for text.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    interpreter = PDFPageInterpreter(resources, aggregator)

    re_list = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        re_list.extend(
            element.get_text()
            for element in aggregator.get_result()
            if isinstance(element, LTTextBoxHorizontal)
        )
    print(re_list)
    return re_list
Exemple #44
0
def upload_file():
    """Turn an uploaded PDF into speech and return it as an mp3 attachment.

    On POST, the text of the uploaded ``file`` is extracted with pdfminer,
    synthesised with gTTS (language "pt-br"), saved as ``mp3_fp.mp3`` and
    sent back as an attachment. Any other method gets a plain-text message.
    """
    if request.method != 'POST':
        return "Method POST not found"

    output_string = StringIO()
    in_file = request.files["file"]
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
    texto = output_string.getvalue()

    # Generate the mp3.
    # NOTE(review): the fixed file name is shared by concurrent requests.
    tts = gtts.gTTS(texto, lang="pt-br")
    tts.save('mp3_fp.mp3')

    # The CORS header has to be set on the response that is actually
    # returned; it used to be added to a response object that was discarded.
    response = send_file('mp3_fp.mp3', as_attachment=True)
    response.headers["Access-Control-Allow-Origin"] = "*"
    return response
Exemple #45
0
def extract_layout_by_page(pdf_path):
    """
    Extract the LTPage layout of every page of a PDF file.

    Returns a list with one layout per page, in document order.
    Raises PDFTextExtractionNotAllowed when the PDF forbids extraction.
    """
    laparams = LAParams()

    # The context manager closes the file even when extraction is refused
    # or a page fails to parse.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())

    return layouts
Exemple #46
0
    def parse_pdf(self, source_pdf: str = None) -> None:
        """Run every page of the active PDF through ``RPAConverter``.

        The value returned by the converter's ``close()`` is stored on
        ``self.rpa_pdf_document``.

        :param source_pdf: when given, passed to
            ``switch_to_pdf_document`` before parsing
        """
        if source_pdf is not None:
            self.switch_to_pdf_document(source_pdf)

        document = PDFDocument(PDFParser(self.active_fileobject))
        pages = PDFPage.create_pages(document)
        resources = PDFResourceManager()
        layout_params = LAParams(detect_vertical=True, all_texts=True)
        converter = RPAConverter(resources, laparams=layout_params)
        interpreter = PDFPageInterpreter(resources, converter)

        # Feed each page to the converter in document order.
        for page in pages:
            interpreter.process_page(page)
        self.rpa_pdf_document = converter.close()
Exemple #47
0
def main(fname):
    """Print the snippet around "volunteer recycling" found in a PDF.

    The text of all pages (via ``parse_layout``) is concatenated and passed
    to ``find_pattern`` with a width argument of 200; the result is printed.

    Raises a bare ``Exception`` when the document forbids text extraction.
    """
    with open(fname, 'rb') as fd:
        doc = PDFDocument(PDFParser(fd))

        # Abort when the document does not allow extraction.
        if not doc.is_extractable:
            raise Exception

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, aggregator)

        page_texts = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            page_texts.append(parse_layout(aggregator.get_result()))
        all_txt = "".join(page_texts)

        snip = find_pattern(all_txt, "volunteer recycling", 200)
        print(snip)
Exemple #48
0
 def __init__(self, stream, pages=None, laparams=None, precision=0.001):
     """Open a PDF stream and prepare the pdfminer pipeline.

     :param stream: binary file-like object containing the PDF
     :param pages: stored as ``pages_to_parse``; not interpreted here
     :param laparams: ``None``, or a dict of keyword arguments for
         ``LAParams``
     :param precision: stored as ``self.precision``; not interpreted here

     ``self.metadata`` merges all document info dicts, with every value
     resolved and decoded to text (booleans are kept as they are).
     """
     self.laparams = None if laparams is None else LAParams(**laparams)
     self.stream = stream
     self.pages_to_parse = pages
     self.precision = precision
     rsrcmgr = PDFResourceManager()
     self.doc = PDFDocument(PDFParser(stream))
     self.metadata = {}
     for info in self.doc.info:
         self.metadata.update(info)
     # Only values of existing keys are replaced below, so iterating over
     # items() while assigning is safe.
     for k, v in self.metadata.items():
         if hasattr(v, "resolve"):
             v = v.resolve()  # follow indirect object references
         if isinstance(v, list):
             self.metadata[k] = list(map(decode_text, v))
         elif isinstance(v, PSLiteral):
             self.metadata[k] = decode_text(v.name)
         elif isinstance(v, bool):
             self.metadata[k] = v
         else:
             self.metadata[k] = decode_text(v)
     self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
     self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
Exemple #49
0
def Pdf2Txt(DataIO, Save_path):
    """Append the text of every horizontal text box of a PDF to a file.

    :param DataIO: binary file-like object containing the PDF
    :param Save_path: path of the text file to append to (written as UTF-8)
    :raises PDFTextExtractionNotAllowed: when the PDF forbids extraction

    A text box that cannot be written is reported with "Failed!" and
    skipped; processing continues with the next box.
    """
    import io

    # Parser bound to the input, and the document storing its structure.
    parser = PDFParser(DataIO)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager for shared resources, layout parameters, an
    # aggregating device and the interpreter that drives it.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process every page.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()  # LTPage for this page
        for x in layout:
            if not isinstance(x, LTTextBoxHorizontal):
                continue
            try:
                # Text mode with an explicit encoding works on Python 2 and
                # 3; concatenating encoded bytes with a str newline fails on
                # Python 3 for every box.
                with io.open('%s' % (Save_path), 'a', encoding='utf-8') as f:
                    f.write(x.get_text() + u'\n')
            except (IOError, OSError, UnicodeError):
                print("Failed!")
Exemple #50
0
def shan_convert(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``.

    When the document does not allow text extraction it is first re-saved
    with pikepdf next to the original (``<name>shan_temp.pdf``) and the text
    is read from that copy instead.
    NOTE(review): the copy is left on disk, as before -- confirm whether it
    should be removed.
    """
    fp = open(pdf_path, 'rb')
    try:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            temp_path = pdf_path[:-4] + "shan_temp" + ".pdf"
            with pikepdf.open(pdf_path) as temp_file:
                temp_file.save(temp_path)
            # Release the original handle before switching to the copy.
            fp.close()
            fp = open(temp_path, 'rb')

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # An empty page-number set selects every page.
            for page in PDFPage.get_pages(fp, set()):
                interpreter.process_page(page)
            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()
    finally:
        fp.close()
    return text
Exemple #51
0
    def character_extraction(self, address):
        """Pass every text box and text line of a PDF to ``fetch_chars``.

        :param address: path of the PDF file to read
        """
        # The file is closed when the block exits, on success or error.
        with open(address, 'rb') as pdf_file:
            # The document object holds the parsed content.
            document = PDFDocument(PDFParser(pdf_file), '')

            # Shared resources (fonts, images) live in the resource manager;
            # the aggregator exposes each processed page as LT objects.
            resources = PDFResourceManager()
            aggregator = PDFPageAggregator(resources, laparams=LAParams())
            interpreter = PDFPageInterpreter(resources, aggregator)

            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # Of the LT objects in the layout only text boxes and text
                # lines are of interest.
                text_items = (
                    item for item in aggregator.get_result()
                    if isinstance(item, (LTTextBox, LTTextLine))
                )
                for item in text_items:
                    self.fetch_chars(item)
Exemple #52
0
    def test_pdf(self):
        # End-to-end check of PDF capture: first through the capture library
        # object, then through the HTTP service at self.src, covering the
        # url, delay, file, format/orientation and cookie parameters.
        # The requests are issued in sequence; the final cookie assertions
        # rely on the earlier cookie request having been made first.

        # Test capture library API
        content = self.capture.pdf(url=server.base_url + self.url)
        self.check_pdf(content)

        # Test service: absolute, parent-relative and plain relative URLs
        # must all produce the default file name and a valid PDF
        for url in (server.base_url + self.url, '..' + self.url, self.url):
            result = self.fetch(self.src, params={'url': url})
            self.check_filename(result, 'screenshot.pdf')
            self.check_pdf(result.content)

        # delay=. After 500ms, page changes text and color to blue, so a
        #         600ms delay must capture the changed text
        # file=.  Changes filename
        result = self.fetch(self.src, params={'url': self.url, 'delay': 600, 'file': 'delay'})
        self.check_filename(result, 'delay.pdf')
        self.assertIn('Blueblock', normalize(get_text(result.content)))

        # --format and --orientation: the first page's MediaBox, rounded,
        # must match one of the known A3 landscape sizes
        result = self.fetch(self.src, params={
            'url': self.url, 'format': 'A3', 'orientation': 'landscape'})
        parser = PDFParser(io.BytesIO(result.content))
        page = next(PDFPage.create_pages(PDFDocument(parser)))
        self.assertIn([round(x) for x in page.attrs['MediaBox']], (
            [0, 0, 1188, 842],      # noqa: Chrome uses 1188 x 842 for A3
            [0, 0, 1191, 842],      # noqa: PhantomJS uses 1191 x 842 for A3
        ))

        # cookie=. The Cookie is printed on the screen via JS
        result = self.fetch(self.src, params={'url': self.url + '?show-cookie', 'cookie': 'a=x'})
        self.assertIn('a=x', normalize(get_text(result.content)))
        # Cookie: header is the same as ?cookie=.
        # Old request cookies vanish. Only new ones remain
        result = self.fetch(self.src, params={'url': self.url + '?show-cookie'},
                            headers={'Cookie': 'b=z'})
        result_text = normalize(get_text(result.content))
        self.assertIn('js:cookie=b=z', result_text)
        self.assertIn('server:cookie=b=z', result_text)
Exemple #53
0
def parse_file(file: Path):
    """Build the chained list of ``PDFBox`` objects for a PDF.

    On every page the horizontal text boxes that do not contain the
    watermark text are collected; the first and last of them (page header
    and footer) are dropped. Each remaining box is wrapped in a ``PDFBox``
    that also receives its page index and the previously created box.

    Raises PDFTextExtractionNotAllowed when the PDF forbids extraction.
    """
    text_boxes = []  # cleaned boxes across all pages
    with open(file, 'rb') as fp:
        doc = PDFDocument(PDFParser(fp))
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, aggregator)

        previous = None
        for page_index, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)

            # Raw boxes of this page, watermark boxes excluded.
            page_boxes = [
                item for item in aggregator.get_result()
                if isinstance(item, LTTextBoxHorizontal)
                and '猿题库' not in item.get_text()
            ]

            # Drop the page header and footer.
            for box in page_boxes[1:-1]:
                previous = PDFBox(box, page_index, previous)
                text_boxes.append(previous)
    print('parse end')
    return text_boxes
Exemple #54
0
def get_result_from_file(filename):
    """Collect the bounding box and text labels of every page of a PDF.

    Returns ``{"filename": filename, "pages": [...]}`` where each page entry
    holds its zero-based ``index``, the ``bounding_box`` from
    ``get_bounding_box`` and the ``labels`` from ``get_text_labels``.

    Raises PDFTextExtractionNotAllowed when the PDF forbids extraction.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    # The context manager closes the file on the error paths as well.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            bounding_box = get_bounding_box(layout)
            labels = get_text_labels(layout)
            result["pages"].append({
                "index": page_index,
                "bounding_box": bounding_box,
                "labels": labels
            })
    return result
Exemple #55
0
    def pdf2txt(self, path):
        """Print the text of a PDF and append it to a fixed text file.

        Every layout object exposing ``get_text`` is appended to the
        hard-coded output file and accumulated; the accumulated text is
        printed at the end. Errors for a single object are printed and the
        object is skipped.
        """
        print('解析pdf中...')
        with open(path, 'rb') as pdf_file:
            doc = PDFDocument(PDFParser(pdf_file))

            # The extractability check is intentionally not performed here.

            resources = PDFResourceManager()
            aggregator = PDFPageAggregator(resources, laparams=LAParams())
            interpreter = PDFPageInterpreter(resources, aggregator)

            result = ''
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                for element in aggregator.get_result():
                    try:
                        if not hasattr(element, "get_text"):
                            continue
                        content = element.get_text()
                        with open(
                                r'E:\pycharm_len\py_learn\learn\office\file\linux_pdf.txt',
                                'a') as out:
                            try:
                                result += content
                                out.write(content)
                            except Exception as err:
                                print('error_write', err)
                    except Exception as err:
                        print('error', err)
            print('__________' * 10)
            print(result)
def pdf2txt(filePath, outPath):
    """Append the text of every horizontal text box of a PDF to ``outPath``.

    ``filePath`` is handed straight to ``PDFParser``.
    NOTE(review): despite the name this looks like an open binary file
    object rather than a path -- confirm against callers.

    Each text box is written followed by a newline. The output is opened in
    append mode, so existing content is kept.

    Raises PDFTextExtractionNotAllowed when the PDF forbids extraction.
    """
    # Parser that reads from the input, and the document storing the data.
    parser = PDFParser(filePath)
    document = PDFDocument(parser)
    # Check whether the file allows text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager for shared resources, layout parameters, an
    # aggregating device and the interpreter that processes page content.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process every page of the document.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # LTPage object for the whole page.
        layout = device.get_result()

        # Encoding to UTF-8 and decoding straight back was a no-op, so the
        # text is used directly.
        page_text = [
            x.get_text() + '\n'
            for x in layout
            if isinstance(x, LTTextBoxHorizontal)
        ]
        if not page_text:
            continue  # nothing to write, do not create or touch the file
        # One append per page instead of re-opening the file per text box.
        with open('%s' % (outPath), 'a') as f:
            f.writelines(page_text)
Exemple #57
0
def process_pdf(title, path):
    """
    Build a DrocerDocument from the text boxes of a PDF.

    Pages and the text boxes on each page are numbered from 1; layout
    objects that are not text boxes are ignored.

    @param title string Title to apply to the document.
    @param path string Path to the input PDF.
    @returns DrocerDocument
    """
    output_document = DrocerDocument(title, path)
    with open(path, 'rb') as pdf_file:
        # set up the pdfminer pipeline (empty password)
        pdf_document = PDFDocument(PDFParser(pdf_file), '')
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        interpreter = PDFPageInterpreter(resources, aggregator)

        # process document
        numbered_pages = enumerate(PDFPage.create_pages(pdf_document), 1)
        for page_number, pdf_page in numbered_pages:
            logger.info("processing %s page number %s" % (title, page_number))
            output_page = DrocerPage(page_number)
            interpreter.process_page(pdf_page)
            text_boxes = (
                item for item in aggregator.get_result()
                if isinstance(item, LTTextBox)
            )
            for box_number, box in enumerate(text_boxes, 1):
                output_page.boxes.append(
                    DrocerBox(page_number, box_number, box.x0, box.y0,
                              box.x1, box.y1,
                              box.get_text().encode('utf8')))
            output_document.pages.append(output_page)
    return output_document
Exemple #58
0
def process_attachment(name: str, data: bytes) -> str:
    """Extract the text of an attachment, chosen by its file extension.

    ``.txt`` is decoded as UTF-8 (an undecodable payload yields ""),
    ``.docx`` goes through docx2txt, ``.pdf`` through pdfminer, ``.pptx``
    concatenates the text of every shape, and ``.xlsx`` concatenates the
    column index of every sheet followed by the list of sheet names.
    Any other extension yields "".
    """
    import os
    import tempfile

    result = ""
    if name.endswith(".txt"):
        try:
            result = data.decode("utf-8")
        except UnicodeDecodeError:
            print("unable to decode the given text by 'utf-8'")
        return result

    # A unique temporary file per call: a fixed shared path would be
    # clobbered by concurrent calls and was never cleaned up.
    fd, temp_file_path = tempfile.mkstemp()
    try:
        with os.fdopen(fd, mode='wb') as temp:
            temp.write(data)
        if name.endswith(".docx"):
            result = docx2txt.process(temp_file_path)
        elif name.endswith(".pdf"):
            output_string = StringIO()
            with open(temp_file_path, mode='rb') as pdf:
                parser = PDFParser(pdf)
                doc = PDFDocument(parser)
                resource_manager = PDFResourceManager()
                device = TextConverter(resource_manager, output_string, laparams=LAParams())
                interpreter = PDFPageInterpreter(resource_manager, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            result = output_string.getvalue()
        elif name.endswith(".pptx"):
            ppt = Presentation(temp_file_path)
            result = "".join(
                shape.text
                for slide in ppt.slides
                for shape in slide.shapes
                if hasattr(shape, "text")
            )
        elif name.endswith(".xlsx"):
            # Close the workbook before the temporary file is removed.
            with pd.ExcelFile(temp_file_path) as workbook:
                for sheet in workbook.sheet_names:
                    result += str(workbook.parse(sheet).columns)
                result += str(workbook.sheet_names)
    finally:
        try:
            os.remove(temp_file_path)
        except OSError:
            # Best effort: a leftover temporary file must not mask the
            # outcome of the extraction.
            pass
    return result
Exemple #59
0
    def get_pdf_metadata(self, pdf_file_stream):
        """Read author, title and year from a PDF's first info dictionary.

        :param pdf_file_stream: binary file-like object containing the PDF
        :returns: dict with the keys 'author', 'title' and 'year'; a field
            that is missing or empty keeps its ``UNKNOWN_*`` placeholder
        """
        metadata = {
            'author': 'UNKNOWN_AUTHOR',
            'title': 'UNKNOWN_TITLE',
            'year': 'UNKNOWN_YEAR'
        }

        pdf_parser = PDFParser(pdf_file_stream)
        pdf_doc = PDFDocument(pdf_parser)

        # A document may have no info dictionary, or one lacking some
        # fields; in both cases the placeholders must survive instead of an
        # IndexError/KeyError escaping.
        info = pdf_doc.info[0] if pdf_doc.info else {}

        for info_key, metadata_key in (('Author', 'author'),
                                       ('Title', 'title')):
            raw = info.get(info_key)
            if raw is None:
                continue
            text = make_pdf_metadata_str(raw)
            if text:
                metadata[metadata_key] = text

        raw_moddate = info.get('ModDate')
        if raw_moddate is not None:
            year = pdf_metadata_moddate_to_year(
                make_pdf_metadata_str(raw_moddate))
            if year:
                metadata['year'] = year

        return metadata
Exemple #60
0
def get_page_num(fpath):
    """Return the page count of a PDF, caching it in a JSON side file.

    The count is the ``Count`` entry of the document catalog's ``Pages``
    node. It is stored at ``<tmp path>.page_num.json`` and read back from
    there on later calls without opening the PDF.

    https://stackoverflow.com/questions/45841012/how-can-i-get-the-total-count-of-total-pages-of-a-pdf-using-pdfminer-in-python
    """
    tmp_path = get_tmp_path(fpath)
    cache_path = "{}.page_num.json".format(tmp_path)
    if os.path.isfile(cache_path):
        tmp_dict = load_general(cache_path)
        return tmp_dict['page_num']

    # The context manager closes the PDF once the count has been read.
    with open(fpath, 'rb') as fp:
        # Parser bound to the file, and the document storing its structure.
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        c = resolve1(document.catalog['Pages'])['Count']

    tmp_dict = {'page_num': c}
    dump_general(tmp_dict, cache_path)

    return c