コード例 #1
1
def getPageLayouts(f1):
    '''Parse a PDF and return the layout object of every page.

    NOTE(review): the argument ``f1`` is never read. The file that is
    actually parsed is ``fpath`` and the password is ``pss_wd``; neither name
    is defined in this function, so they are presumably module-level
    globals -- confirm against the rest of the module.

    Returns:
        A list holding ``device.get_result()`` for each page, in page order.
        The list is empty when the document does not allow text extraction.

    Raises:
        IOError: if the file cannot be opened or read.
    '''
    # Bound up front so there is always a list to return, even when the
    # document is not extractable.
    page_layouts = []
    try:
        # The parser and doc pair act as a "pipe" of sorts.
        with open(fpath, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)

            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in doc.get_pages():
                    # Presumably pages are parsed from the open file on
                    # demand rather than from data loaded up front, which is
                    # why this loop stays inside the ``with`` block.
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        # Call syntax works on Python 2 and 3. The ``with`` block has already
        # closed the file; the caller-owned ``f1`` is deliberately left alone.
        raise IOError("issue with loading file, please try again")
    # Returning here rather than from a ``finally`` clause lets the IOError
    # above reach the caller instead of being silently replaced.
    return page_layouts
コード例 #2
0
ファイル: PDF_Parser.py プロジェクト: samdavey/Random
    def load(self, open_file):
        """Parse an already opened PDF and cache its per-page text.

        ``self.fields`` and ``self.text`` are reset to empty dicts. For each
        page, the value of ``self._get_text(device)`` is stored in
        ``self.text`` under the 1-based page number; pages that carry
        annotations are first passed to ``self._build_annotations``.

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids extraction.
        """
        self.fields = {}
        self.text = {}

        # Wire the parser and the document to each other, then initialise
        # the document with an empty password.
        pdf_parser = PDFParser(open_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resources, default layout parameters, and the aggregating
        # device the interpreter renders into.
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page_number, pdf_page in enumerate(document.get_pages(), 1):
            page_interpreter.process_page(pdf_page)
            if pdf_page.annots:
                self._build_annotations(pdf_page)
            self.text[page_number] = self._get_text(aggregator)
コード例 #3
0
    def create_pages(self):
        """Parse ``self.pdf_file`` and save one ``Page`` per parsed page.

        The pages come from ``self._parse_pages(doc)`` and are numbered from
        1 in the order returned. Nothing is saved when the document does not
        allow text extraction. The method returns ``None``.
        """
        # Imported locally, as in the original; presumably to avoid a
        # circular import with the models module.
        from public_project.models import Page

        # Connect the parser and the document, then initialise the document
        # with an empty password.
        parser = PDFParser(self.pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        pdf_pwd = ''
        doc.initialize(pdf_pwd)

        # Default to "no pages" so a non-extractable document is a no-op
        # instead of a NameError in the loop below.
        doc_pages = []
        if doc.is_extractable:
            doc_pages = self._parse_pages(doc)

        for number, doc_page in enumerate(doc_pages, 1):
            page = Page(
                document=self.document,
                number=number,
                content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
            )
            page.save()
コード例 #4
0
ファイル: PdfParser.py プロジェクト: hcouch21/styloproject
    def parse(self, path):
        """Extract the text of the PDF at *path* and wrap it in a ``Sample``.

        Args:
            path: path of a PDF file. Directories are not supported.

        Returns:
            ``Sample(path, None, text)`` where *text* is everything the
            ``TextConverter`` wrote (UTF-8 codec) for all pages.

        Raises:
            NotImplementedError: if *path* is a directory.
        """
        out = StringIO.StringIO()
        if os.path.isdir(path):
            raise NotImplementedError()

        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)

        # PDFs are binary, so the file is opened in 'rb' mode; the ``with``
        # block closes the handle even when parsing raises.
        with open(path, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
        device.close()

        # Read the buffer before closing it.
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
コード例 #5
0
ファイル: MyPdfMiner.py プロジェクト: i11uminator/bookservice
 def WithPdf(self, pdfdoc, password, fn, *args):
     """Open the pdf document, apply *fn* to it, and return the result.

     Args:
         pdfdoc: path of the PDF file to open.
         password: document password; when truthy it also replaces
             ``self.password``, which is what the document is initialised
             with.
         fn: callable invoked as ``fn(doc, *args)`` when the document allows
             text extraction.
         *args: extra positional arguments forwarded to *fn*.

     Returns:
         Whatever *fn* returns, or ``None`` when the document is not
         extractable or an IOError occurred.
     """
     result = None
     try:
         # The ``with`` block closes the file even when parsing or *fn*
         # raises; previously the handle leaked in that case.
         with open(pdfdoc, 'rb') as fp:
             # connect the parser and document objects
             parser = PDFParser(fp)
             doc = PDFDocument()
             parser.set_document(doc)
             doc.set_parser(parser)
             # supply the password for initialization
             if password:
                 self.password = password
             doc.initialize(self.password)

             if doc.is_extractable:
                 # apply the function and keep its result
                 result = fn(doc, *args)
     except IOError:
         # Deliberate best-effort: the file doesn't exist or a similar
         # problem occurred, so the caller simply gets None.
         pass
     return result
コード例 #6
0
ファイル: dumppdf.py プロジェクト: Adniel/ComparePdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF *fname* to *outfp*.

    Args:
        outfp: writable file-like object receiving the dump.
        fname: path of the PDF file.
        objids: object ids passed to ``doc.getobj`` and dumped via
            ``dumpxml``.
        pagenos: container of zero-based page indexes to dump. With a
            *codec* the page's content streams are dumped, otherwise the
            page attributes.
        password: password used to initialise the document.
        dumpall: when true, ``dumpallobjs`` is called for the document.
        codec: forwarded to the dump helpers; ``'raw'`` and ``'binary'``
            suppress the trailing newline.

    When none of *objids*, *pagenos* and *dumpall* is set, the trailers are
    dumped with ``dumptrailers``.
    """
    doc = PDFDocument()
    # The ``with`` block closes the file even if a dump helper raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
コード例 #7
0
ファイル: pdftotext.py プロジェクト: mayhewsw/projects
def pdf_to_text(filename):
    """Return the text of the PDF *filename* with page markers.

    The text of each page is written between ``START PAGE <i>`` and
    ``END PAGE <i>`` lines (``i`` is zero-based). The page count is also
    printed to standard output.

    Args:
        filename: path of the PDF file.

    Returns:
        The accumulated contents of the output buffer as a string.
    """
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    # The ``with`` block closes the file even if a page fails to process.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        # Materialise the pages once: the same list serves the count and the
        # loop, instead of calling get_pages() twice.
        pages = list(doc.get_pages())
        # Single-argument call form prints the same text on Python 2 and 3.
        print("There are: " + str(len(pages)) + " pages")

        for i, page in enumerate(pages):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)

        device.close()

    return outfp.getvalue()
コード例 #8
0
ファイル: metadataPDF.py プロジェクト: TechByTom/metagoofil
	def getData(self):
		"""Load the trailer ``Info`` dictionary of ``self.fname`` into the instance.

		For every cross-reference table that has an ``Info`` entry, the
		resolved value is stored in both ``self.metadata`` and ``self.raw``
		(the last one wins).

		Returns:
			``"error"`` if the document cannot be initialised,
			``"Empty metadata"`` if ``self.raw`` is None afterwards,
			``"ok"`` otherwise, or the caught exception object if reading
			the trailers fails.
		"""
		doc = PDFDocument()
		fp = open(self.fname, 'rb')
		try:
			parser = PDFParser(fp)
			try:
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(self.password)
			except Exception:
				return "error"

			parser.close()
			try:
				for xref in doc.xrefs:
					info_ref = xref.trailer.get('Info')
					if info_ref:
						# Assign only when this trailer has an Info entry;
						# previously a first trailer without one hit an
						# unbound ``info``.
						info = resolve1(info_ref)
						self.metadata = info
						self.raw = info
				if self.raw is None:
					return "Empty metadata"
				return "ok"
			except Exception as e:
				# The exception object itself is the return value. The print
				# that used to follow the return was unreachable and is gone.
				return e
		finally:
			# Runs on every path, including the early "error" return that
			# used to leak the handle.
			fp.close()
コード例 #9
0
ファイル: pdf2text.py プロジェクト: zaim/bukutip
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Run the pages of an open PDF through a page interpreter.

    Args:
        rsrcmgr: resource manager handed to ``PDFPageInterpreter``.
        device: device handed to ``PDFPageInterpreter``.
        fp: open file object of the PDF.
        pagenums: optional container of zero-based page indexes; when
            truthy, pages not in it are skipped.
        maxpages: when truthy, processing stops after a processed page whose
            index + 1 reaches this value.
        password: password used to initialise the document.

    Returns:
        A dict mapping every zero-based page index to its page object
        (all pages, not only the processed ones).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Connect the parser and document objects, then initialise.
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = dict(enumerate(doc.get_pages()))
    # Walk the indexes in ascending order explicitly: the ``maxpages`` break
    # depends on page order, which dict iteration does not guarantee on
    # every interpreter version.
    for num in sorted(pages):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(pages[num])
        if maxpages and maxpages <= num + 1:
            break
    return pages
コード例 #10
0
ファイル: pdfmeta.py プロジェクト: kristerhedfors/bin
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the info dictionaries of a PDF given by path or http(s) URL.

    Args:
        fileOrUrl: local path, or a URL starting with ``http://`` or
            ``https://`` (downloaded fully into memory).
        textmode: when true, print one ``<fileOrUrl>:<name>=<value>`` line
            per info entry; otherwise print the info list, unwrapped when it
            has exactly one element.
        prefix: text printed before the value in non-text mode.
        basicauth: value appended to ``'Basic '`` in an Authorization header
            for URL requests.

    NOTE(review): ``args`` is not defined in this function; presumably it is
    a module-level list of command-line arguments -- confirm. When it has
    more than one element, *prefix* is replaced by ``fileOrUrl + ':'``.
    """
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            # Release the connection even if the download fails midway.
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        # Close the handle even when the document fails to initialise.
        fp.close()
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.items():
                print('{0}:{1}={2}'.format(
                    fileOrUrl, name, val
                ))
    else:
        val = doc.info
        if type(val) is list and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
コード例 #11
0
ファイル: MyPdfMiner.py プロジェクト: i11uminator/bookservice
 def ParseAllPages(self, filepath):
     """Run every page of the PDF at *filepath* through a page interpreter.

     *filepath* is remembered in ``self.filepath``. The pages are processed
     with a plain ``PDFDevice`` and the method returns ``None``.

     Raises:
         PDFTextExtractionNotAllowed: if the document forbids extraction.
     """
     self.filepath = filepath
     # The ``with`` block closes the file on every exit path; previously the
     # handle was never closed.
     with open(filepath, 'rb') as fp:
         # Connect the parser and document objects.
         parser = PDFParser(fp)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         # Supply the password for initialization.
         # (If no password is set, give an empty string.)
         password = ""
         doc.initialize(password)
         # Check if the document allows text extraction. If not, abort.
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         rsrcmgr = PDFResourceManager()
         device = PDFDevice(rsrcmgr)
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         # Process each page contained in the document.
         for page in doc.get_pages():
             interpreter.process_page(page)
コード例 #12
0
ファイル: statement2csv.py プロジェクト: jlas/misc
def pdf_to_csv(filename):
    """Convert the PDF *filename* with ``CsvConverter`` and return the output.

    The output of each page is written between ``START PAGE <i>`` and
    ``END PAGE <i>`` lines (``i`` is zero-based).

    Args:
        filename: path of the PDF file.

    Returns:
        The accumulated contents of the output buffer as a string.
    """
    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the author's test documents are utf-8
    # (note: utf-8 is the default codec)
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    # The ``with`` block closes the file even if a page fails to process.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)
        for i, page in enumerate(doc.get_pages()):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)

        device.close()

    return outfp.getvalue()
コード例 #13
0
ファイル: pdfInvoiceMiner.py プロジェクト: vinovator/Vinlab
def read_invoice_pdfminer3k(pdfFile):
    """Extract client and invoice number for one invoice PDF and record them.

    The text of all ``LTTextBox``/``LTTextLine`` objects of the PDF is
    concatenated; the client is cut out of that text and the invoice number
    out of the file name, both with ``extract_info``. Both values are printed
    and passed to ``write_excel``.

    Args:
        pdfFile: file name of the invoice inside ``invoice_path``.

    NOTE(review): ``invoice_path``, ``client_start``, ``client_end``,
    ``invoice_start`` and ``invoice_end`` are not defined in this function;
    presumably module-level settings.
    """
    # os.path.join receives the two components separately so that it picks
    # the separator (the old code hard-coded a backslash). The ``with``
    # block closes the file, which previously was never closed.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document, collecting the text
        # pieces and joining them once at the end.
        text_parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
コード例 #14
0
ファイル: dumppdf.py プロジェクト: joshmgrant/pdfminer
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one line per outline entry of the PDF *fname* to *outfp*.

    Each line is ``repr((level, title, dest, pageno))``. *pageno* is the
    zero-based index of the page the entry points to, or ``None`` when the
    entry has neither a destination nor a ``/GoTo`` action with one.

    Args:
        outfp: writable file-like object.
        fname: path of the PDF file.
        password: password used to initialise the document.
        objids, pagenos, dumpall, codec: accepted but not used by this
            function.
    """
    doc = PDFDocument()
    # The ``with`` block closes the file even if the outline walk raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's object id to its zero-based page index.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
コード例 #15
0
def getPDFMetadata(path):
    """Return the metadata of the PDF at *path* as a single dict.

    The result is the first entry of ``doc.info`` (an empty dict when there
    is none), extended with ``xmp_to_dict`` of the catalog's ``Metadata``
    stream when the catalog has one.

    Args:
        path: path of the PDF file.
    """
    # The ``with`` block closes the file, which previously was never closed.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # ``doc.info`` is a list, and the function has always returned its
        # first element. Work on that dict directly: the old code called
        # ``.update`` on the list itself, which always failed inside a bare
        # ``except``, so the XMP data was never merged.
        result = doc.info[0] if doc.info else {}

        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            # The old attempt to ``update`` with the raw XMP payload is
            # dropped: a raw string cannot update a mapping, so it could
            # never succeed.
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # Best-effort, as before: unparsable XMP must not hide the
                # info dictionary.
                pass

    return result
コード例 #16
0
ファイル: book_parser.py プロジェクト: ArcainOne/anathema
    def pdf_function(pdf_doc, password='', *args, **kwargs):
        """Open *pdf_doc*, apply ``function`` to the document, return the result.

        ``function`` is not defined in this block; it is taken from the
        enclosing scope.

        Args:
            pdf_doc: path of the PDF file.
            password: password used to initialise the document.
            *args, **kwargs: forwarded to ``function`` after the document.

        Returns:
            Whatever ``function`` returns, or ``None`` when the document is
            not extractable or an IOError occurred.
        """
        result = None
        try:
            # The ``with`` block closes the file even when parsing or
            # ``function`` raises; previously the handle leaked in that case.
            with open(pdf_doc, 'rb') as fp:
                # connect the parser and document objects
                parser = PDFParser(fp)
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
                # supply the password for initialization
                doc.initialize(password)

                if doc.is_extractable:
                    # apply the function and keep its result
                    result = function(doc, *args, **kwargs)
        except IOError:
            # Deliberate best-effort: the file doesn't exist or a similar
            # problem occurred, so the caller simply gets None.
            pass
        return result
コード例 #17
0
ファイル: autosumpdf.py プロジェクト: suriyan/autosum
def convert_pdf_to_txt(path):
    """Return the text ``TextConverter`` produces for every page of a PDF.

    Args:
        path: path of the PDF file.

    Returns:
        The contents of the converter's output buffer after all pages have
        been processed.
    """
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_file:
        # Wire parser and document together; empty password.
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)

        # Grab the text before the buffer is closed below.
        extracted = text_buffer.getvalue()

    converter.close()
    text_buffer.close()
    return extracted
コード例 #18
0
ファイル: pdfutils.py プロジェクト: emulbreh/ecs
def pdf_isvalid(filelike):
    ''' Report whether *filelike* holds a PDF that allows text extraction.

    @param filelike: filelike object, seekable
    @return: False when the leading bytes differ from PDF_MAGIC, when
        parsing raises PDFException (a warning is logged), or when the
        document is not extractable; True otherwise. After a header match
        the object is rewound to position 0 before returning.
    '''
    log = logging.getLogger()
    filelike.seek(0)

    header = filelike.read(len(PDF_MAGIC))
    if header != PDF_MAGIC:
        return False
    filelike.seek(0)

    try:
        pdf_parser = PDFParser(filelike)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        valid = True if document.is_extractable else False
    except PDFException as excobj:
        log.warning("pdf has valid header but, still not valid pdf, exception was %r" %(excobj))
        valid = False

    filelike.seek(0)
    return valid
コード例 #19
0
ファイル: pdf.py プロジェクト: staffanm/protokollen
    def get_metadata(self):
        """Return metadata from both the info field (older PDFs) and XMP
        (newer PDFs) of ``self.path``.

        Every entry of ``doc.info`` is added. From the XMP metadata only the
        ``xap``, ``pdf`` and ``dc`` sections are added (``dc`` with
        ``metadataType="dc"``). The result is also stored in
        ``self.metadata``.

        Return format is a .modules.metadata.Metadata object.
        """
        # The ``with`` block closes the file even if parsing raises.
        with open(self.path, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            metadata = Metadata()
            for i in doc.info:
                metadata.add(i)
            if 'Metadata' in doc.catalog:
                xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmp_dict = xmp_to_dict(xmp_metadata)
                # Let's add only the most useful ones
                if "xap" in xmp_dict:
                    metadata.add(xmp_dict["xap"])
                if "pdf" in xmp_dict:
                    metadata.add(xmp_dict["pdf"])
                if "dc" in xmp_dict:
                    metadata.add(xmp_dict["dc"], metadataType="dc")

        self.metadata = metadata
        return metadata
コード例 #20
0
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed to analyse the layout of a PDF.

    Args:
        fh: open file object of the PDF.

    Returns:
        A ``(doc, interpreter, device)`` tuple: the initialised document, a
        ``PDFPageInterpreter`` and the ``PDFPageAggregator`` it renders
        into (layout parameters with ``word_margin`` set to 0.0).

    Raises:
        ValueError: if the document does not allow text extraction.
    """
    # Connect the parser and document objects.
    parser = PDFParser(fh)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Shared resources for the interpreter and device.
    rsrcmgr = PDFResourceManager()

    # The plain PDFDevice/interpreter pair that used to be built here was
    # overwritten below without ever being used, so it is no longer created.
    laparams = LAParams()
    laparams.word_margin = 0.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
コード例 #21
0
def parse_pdf(pdf_url):
    """Download a PDF and return its text lines grouped by page and object.

    Args:
        pdf_url: URL of the PDF; it is read fully into memory.

    Returns:
        A list with one entry per page. Each entry is a list with one item
        per ``LTTextBox``/``LTTextLine`` whose text is not blank, and each
        item is the list of lines of that text (``str.splitlines``).
    """
    # The ``with`` block closes the HTTP response, which previously was
    # never closed.
    with urllib.request.urlopen(pdf_url) as response:
        remote_file = response.read()
    memory_file = io.BytesIO(remote_file)
    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        interpreter.process_page(page)
        page_lines = []
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Fetch the text once instead of once per use.
                text = lt_obj.get_text()
                if len(text.strip()) > 0:
                    page_lines.append(text.splitlines())
        ret.append(page_lines)
    return ret
コード例 #22
0
ファイル: pdfinfo.py プロジェクト: larscwallin/pdfdig
    def get_toc(self):
        """Return the table-of-contents titles of ``self.pdf`` at one level.

        When the first info dictionary has a ``Title``, ``self.title`` is set
        to its normalised form. The level to keep is chosen by
        ``self.get_level1`` from the document outline.

        Returns:
            A list of normalised outline titles at the selected level, or
            ``None`` when the outline cannot be read.
        """
        # The ``with`` block closes the file on every return path; previously
        # the handle was never closed.
        with open(self.pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            # title
            if doc.info:
                metadict = doc.info[0]
                if 'Title' in metadict:
                    self.title = normalize_title(metadict['Title'])

            # level 1 of toc
            try:
                outlines = doc.get_outlines()
                toc = list()
                select_level = self.get_level1(outlines)
            except Exception:
                # Best-effort, as before: no usable outline means no TOC.
                return None
            # A fresh get_outlines() call, as in the original; presumably
            # the first iterator has been consumed by get_level1.
            for (level, title, dest, a, se) in doc.get_outlines():
                if level == select_level:
                    toc.append(normalize_toc_item(title))
            return toc
コード例 #23
0
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally write it to a .txt file.

    The PDF is opened and its document initialised (empty password) in the
    constructor. The file stays open for the lifetime of the object; call
    ``close()`` when done.
    """

    def __init__(self, filename):
        """Open *filename* and initialise the PDF document."""
        self.__filename = filename

        # Kept on the instance so close() can release it; presumably the
        # document still reads pages from it in getString().
        self.__fp = open(self.__filename, 'rb')
        try:
            parser = PDFParser(self.__fp)
            self.__doc = PDFDocument()
            parser.set_document(self.__doc)
            self.__doc.set_parser(parser)
            self.__doc.initialize('')
        except Exception:
            # Do not leak the handle when the document cannot be set up.
            self.__fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self.__fp.close()

    @staticmethod
    def _txt_path(filename):
        """Return *filename* with its extension replaced by ``.txt``."""
        import os
        # splitext touches only the final extension. The old
        # str.replace(".pdf", ".txt") left names such as "a.PDF" unchanged,
        # which made writeToTxt overwrite the source PDF, and it also
        # rewrote ".pdf" occurring inside directory names.
        return os.path.splitext(filename)[0] + ".txt"

    def writeToTxt(self):
        """Write the extracted text next to the PDF, with a .txt extension.

        Characters that cannot be encoded as ASCII are replaced.
        """
        text = self.getString()
        with open(self._txt_path(self.__filename), "w") as txtFile:
            txtFile.write(text.encode('ascii','replace').decode("utf-8"))

    def getString(self):
        """Return the text ``TextConverter`` produces for all pages."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
コード例 #24
0
ファイル: pdf.py プロジェクト: hsoft/pdfmasher
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    Args:
        path: path of the PDF file.
        j: job object whose ``iter_with_progress`` wraps the page loop.

    Returns:
        A ``(pages, all_elements)`` tuple. *pages* holds one
        ``Page(width, height)`` per page. *all_elements* holds the elements
        created from each page's text boxes, each tagged with its zero-based
        ``page`` index and its ``order`` within that page.
    """
    # The ``with`` block closes the file, which previously was never closed.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = []
        all_elements = []
        # Materialised up front; presumably so the progress helper knows the
        # total page count.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
コード例 #25
0
ファイル: dumppdf.py プロジェクト: frid/PythonPool
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
  """Dump selected parts of the PDF *fname* to *outfp*.

  Args:
    outfp: writable file-like object receiving the dump.
    fname: path of the PDF file.
    objids: object ids to dump. Stream objects are written as raw data for
      codec ``'raw'`` and as decoded data for codec ``'binary'``; everything
      else goes through ``dumpxml``.
    pagenos: container of zero-based page indexes whose attributes are
      dumped.
    password: password used to initialise the document.
    dumpall: when true, ``dumpallobjs`` is called for the document.
    codec: forwarded to the dump helpers; ``'raw'`` and ``'binary'``
      suppress the trailing newline.

  When none of *objids*, *pagenos* and *dumpall* is set, the trailers are
  dumped with ``dumptrailers``.
  """
  doc = PDFDocument()
  # The ``with`` block closes the file even if a dump helper raises.
  with open(fname, 'rb') as fp:
    parser = PDFParser(doc, fp)
    doc.initialize(password)
    if objids:
      for objid in objids:
        obj = doc.getobj(objid)
        if isinstance(obj, PDFStream) and codec == 'raw':
          outfp.write(obj.get_rawdata())
        elif isinstance(obj, PDFStream) and codec == 'binary':
          outfp.write(obj.get_data())
        else:
          dumpxml(outfp, obj, codec=codec)
    if pagenos:
      for (pageno, page) in enumerate(doc.get_pages()):
        if pageno in pagenos:
          dumpxml(outfp, page.attrs)
    if dumpall:
      dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
      dumptrailers(outfp, doc)
  if codec not in ('raw', 'binary'):
    outfp.write('\n')
  return
コード例 #26
0
class PDFController(object):
    """Hold an open PDF, parse it with pdfminer and search its layout.

    Attributes set in ``__init__``: ``fd`` (open file object or None),
    ``password``, ``parsed`` (True after a successful ``parse()``),
    ``document``, ``laparams``, ``rsrcmgr``, ``device`` and ``layout``.
    """

    def __init__(self, fd=None, password=''):
        """Create the pdfminer objects and, if *fd* is given, open it."""
        self.fd = fd
        self.password = password
        self.parsed = False
        self.document = PDFDocument()
        self.laparams = LAParams()
        self.rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(self.rsrcmgr, laparams=self.laparams)
        self.layout = []

        if fd:
            self.open(fd, password)

    def open(self, fd, password=''):
        """Attach *fd*: a readable file-like object, or a path to open."""
        self.password = password
        # PDFs are binary, so a path is opened in 'rb' mode.
        self.fd = fd if hasattr(fd, 'read') else open(fd, 'rb')

    def close(self):
        """Close the attached file, if any, and mark the object unparsed."""
        if self.fd:
            self.fd.close()
            self.fd = None
        self.parsed = False

    def parse(self):
        """Initialise the document from ``self.fd`` and compute the layout.

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids extraction;
                the file is closed first.
        """
        parser = PDFParser(self.fd)
        parser.set_document(self.document)
        self.document.set_parser(parser)
        self.document.initialize(self.password)
        if not self.document.is_extractable:
            # close() also resets self.fd, so the object no longer reports
            # an open file after this failure.
            self.close()
            raise PDFTextExtractionNotAllowed

        if not self.layout:
            self.layout = self._get_layout()

        self.parsed = True

    def _get_layout(self):
        """Process every page and return the device result of the last one.

        NOTE(review): the result is overwritten on each iteration, so only
        the last page's layout is kept. Confirm whether results of all pages
        were meant to be collected before changing this, because
        ``lookup_term`` iterates the returned object directly.
        """
        layout = []
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        for page in self.document.get_pages():
            interpreter.process_page(page)
            layout = self.device.get_result()
        return layout

    def lookup_term(self, term, ignore_case=True):
        """Return the indexes of layout items whose text contains *term*.

        Items without a ``get_text`` method are skipped. With *ignore_case*
        both the term and the item text are lower-cased before comparing.
        """
        # Normalise the search term once instead of once per item.
        needle = term.lower() if ignore_case else term
        indexes = []
        for i, item in enumerate(self.layout):
            if not hasattr(item, 'get_text'):
                continue
            text = item.get_text()
            if ignore_case:
                text = text.lower()
            if needle in text:
                indexes.append(i)
        return indexes

    def __del__(self):
        # ``fd`` is None when no file was attached or close() already ran,
        # and the attribute may be missing if __init__ raised early; the old
        # unconditional self.fd.close() failed in those cases.
        fd = getattr(self, 'fd', None)
        if fd:
            fd.close()

    def __repr__(self):
        if self.fd:
            # File-like objects such as in-memory buffers have no ``name``;
            # fall back to their repr instead of raising.
            file_state = 'Open file "%s"' % getattr(self.fd, 'name', repr(self.fd))
        else:
            file_state = 'No file opened'
        parse_state = 'parsed' if self.parsed else 'not parsed'
        return '<PDFController> %s, %s' % (file_state, parse_state)
コード例 #27
0
ファイル: showcells.py プロジェクト: aliounedia/scraptils
def pdf2csv(fp):
    """Draw, for each page, the grid implied by its rectangles and lines.

    For every page the x coordinates (``hlines``) and the top-down y
    coordinates (``vlines``) of all ``LTRect``/``LTLine`` objects are
    collected, de-duplicated, sorted and passed through ``filterclose``.
    Each pair of neighbouring coordinates more than 5 units apart in both
    directions becomes an outlined rectangle in a 1-bit image that is saved
    as ``out<pageno>.png``. The coordinate lists and the page size are
    printed to standard output.

    Args:
        fp: open file object of the PDF.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Connect the parser and document objects.
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        hlines = []
        vlines = []
        for item in layout:
            # Exact type match, as in the original (subclasses are skipped).
            if type(item) not in (LTRect, LTLine):
                continue
            hlines.append(int(item.x0))
            hlines.append(int(item.x1))
            # Flip y so that the origin is at the top of the page.
            vlines.append(int(layout.height - item.y0))
            vlines.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        # Single-argument call form prints the same on Python 2 and 3.
        print(hlines)
        print(vlines)
        print((layout.width, layout.height))
        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        for top, bottom in zip(vlines, vlines[1:]):
            if not bottom - top > 5:
                continue
            for left, right in zip(hlines, hlines[1:]):
                if not right - left > 5:
                    continue
                draw.rectangle([(int(left), int(top)), (int(right), int(bottom))], outline=1)
        del draw
        # A separate name keeps the input parameter ``fp`` intact, and the
        # ``with`` block closes the PNG even if saving fails.
        with open("out%s.png" % pageno, 'wb') as out_fp:
            im.save(out_fp, "PNG")
コード例 #28
0
def open_pdf(filename, password=''):
  """Open *filename* and return its initialised ``PDFDocument``.

  The document is created with caching enabled and initialised with
  *password*. The file handle is not closed here.
  NOTE(review): presumably the returned document still reads from the
  handle -- confirm before adding a close.
  """
  pdf_file = open(filename, 'rb')
  document = PDFDocument(caching=True)
  pdf_parser = PDFParser(pdf_file)
  # Link both directions before initialising.
  pdf_parser.set_document(document)
  document.set_parser(pdf_parser)
  document.initialize(password)
  return document
コード例 #29
0
def pdf2txt(pdf_file_name):
    """Return the text of all text boxes and text lines of a PDF.

    Args:
        pdf_file_name: path of the PDF file.

    Returns:
        The concatenated text of every ``LTTextBox``/``LTTextLine`` on every
        page, or ``None`` when the file cannot be opened or the document
        cannot be initialised (both cases are logged).
    """
    # open the pdf file in read bytes mode
    try:
        fp = open(pdf_file_name , 'rb')
    except Exception:
        # log the error or warning in logfile
        logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
        logging.warning(traceback.format_exc())
        return

    try:
        # connect the parser and document objects
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # supply the password here, if the PDF is protected
        try:
            doc.initialize('')
        except Exception:
            # log the error or warning in logfile
            logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
            logging.warning(traceback.format_exc())
            return

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams = laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect the text of every page. The old code returned from inside
        # this loop, so only the first page was ever extracted.
        text_parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
        # The text extracted from the PDF file is returned as one string.
        return "".join(text_parts)
    finally:
        # Close the file on every path; previously it was never closed.
        fp.close()
コード例 #30
0
ファイル: MyPdfMiner.py プロジェクト: i11uminator/bookservice
 def GetTOC(self, doc, *args):
     """Return ``get_outlines()`` of a freshly parsed ``self.filepath``.

     NOTE(review): the *doc* argument is ignored -- the name is rebound to a
     new document built from ``self.filepath`` and ``self.password``, and
     the file opened here is not closed. Presumably the outlines are read
     from that file after this method returns; confirm before changing.
     """
     pdf_file = open(self.filepath, 'rb')
     pdf_parser = PDFParser(pdf_file)
     doc = PDFDocument()
     # Link both directions before initialising with the stored password.
     pdf_parser.set_document(doc)
     doc.set_parser(pdf_parser)
     doc.initialize(self.password)
     return doc.get_outlines()
コード例 #31
0
ファイル: plan1000.py プロジェクト: leisun123/scholar-private
def parse():
    """Extract the text of the PDF at the module-level ``path`` into ``./1.txt``.

    Every horizontal text box found on every page is printed and appended to
    ``./1.txt`` followed by a newline. The output file is opened once, in
    append mode, after the extractability check.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Keep the PDF open (binary mode) until every page has been processed,
    # then close it even if an error occurs.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        # Link the parser and the document in both directions.
        praser.set_document(doc)
        doc.set_parser(praser)
        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box.
        with open(r'./1.txt', 'a') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The aggregator returns the layout object for this page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
コード例 #32
0
def readPDF(path, topath):
    """Append the text of every horizontal text box in a PDF to a text file.

    Args:
        path: path of the PDF file to read.
        topath: path of the text file to append to.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Keep the PDF open until all pages are processed; closed on exit.
    with open(path, "rb") as f:
        parser = PDFParser(f)
        pdfFile = PDFDocument()

        # Link parser and document in both directions (the original only
        # linked one way, unlike every other snippet using this API).
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # No password is supplied.
        pdfFile.initialize()

        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Was misspelled "PDFResourceManger".
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        with open(topath, "a") as out:
            for page in pdfFile.get_pages():
                # Was misspelled "progcess_page".
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        out.write(x.get_text() + "\n")
コード例 #33
0
 def pdfparse(url, name):
     """Download a PDF, save it in the current directory and return its text.

     Args:
         url: address of the PDF, fetched with the module-level session ``s``.
         name: file name; the part before the first "." names the saved PDF.

     Returns:
         The text of all layout objects that expose ``get_text()``, with
         leading/trailing newlines stripped from each piece, concatenated.

     Raises:
         PDFTextExtractionNotAllowed: if the document forbids text extraction.
     """
     res = s.get(url, headers={"user-agent": generate_user_agent()})
     path1 = os.getcwd() + "\\%s.pdf" % name.split(".")[0]
     with open(path1, 'wb') as f:
         f.write(res.content)

     # The file must stay open while pdfminer reads pages from it; the
     # original closed it before initialize()/get_pages().
     with open(path1, 'rb') as f:
         praser = PDFParser(f)
         doc = PDFDocument()
         praser.set_document(doc)
         doc.set_parser(praser)
         doc.initialize()
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed

         rsrcmgr = PDFResourceManager()
         laparams = LAParams()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)

         pieces = []
         for page in doc.get_pages():
             interpreter.process_page(page)
             layout = device.get_result()
             for x in layout:
                 # Non-text layout objects (figures, images, lines, ...) have
                 # no get_text(); skip them instead of raising AttributeError.
                 get_text = getattr(x, 'get_text', None)
                 if get_text is None:
                     continue
                 results = get_text()
                 if results:
                     pieces.append(results.strip('\n'))
         return ''.join(pieces)
コード例 #34
0
    def readPDF(self, path, callback=None, toPath=''):
        """Read the horizontal text boxes of a PDF and hand their text on.

        Args:
            path: path of the PDF file to read.
            callback: optional callable receiving each text box's text; when
                omitted the text is printed instead.
            toPath: when non-empty, no text is delivered; only a notice is
                printed per text box (writing to a file is not implemented).

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids extraction.
        """
        # Keep the PDF open while pages are processed; closed on exit.
        with open(path, 'rb') as f:
            parser = PDFParser(f)
            pdfFile = PDFDocument()
            # Link parser and document in both directions.
            parser.set_document(pdfFile)
            pdfFile.set_parser(parser)
            # Empty password.
            pdfFile.initialize('')

            if not pdfFile.is_extractable:
                raise PDFTextExtractionNotAllowed

            manager = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(manager, laparams=laparams)
            interpreter = PDFPageInterpreter(manager, device)

            # One page at a time; only text is read, images are ignored.
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if not isinstance(x, LTTextBoxHorizontal):
                        continue
                    if toPath == '':
                        text = x.get_text()
                        if callback is not None:
                            callback(text)
                        else:
                            print(text)
                    else:
                        # Placeholder for the file-writing branch.
                        print('将PDF数据写入文件')
コード例 #35
0
 def noimgpdf_change_word(self, _path):
     """
     Extract the text of a PDF that contains no images.

     :param _path: local PDF path, or a URL containing 'http://www'
     :return: concatenated text of all horizontal text boxes, or None when
              the PDF cannot be fetched, parsed or extracted
     """
     import io

     fp = None
     try:
         if 'http://www' in _path:
             req = Request(
                 url=_path,
                 headers={'User-Agent': random.choice(self.user_agent)})
             # pdfminer seeks within its input, which an HTTP response does
             # not support, so buffer the whole download in memory first.
             response = urlopen(req)
             try:
                 fp = io.BytesIO(response.read())
             finally:
                 response.close()
         else:
             fp = open(_path, 'rb')

         praser_pdf = PDFParser(fp)
         doc = PDFDocument()
         praser_pdf.set_document(doc)
         doc.set_parser(praser_pdf)
         doc.initialize()
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed

         rsrcmgr = PDFResourceManager()
         laparams = LAParams()
         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
         interpreter = PDFPageInterpreter(rsrcmgr, device)

         pieces = []
         for page in doc.get_pages():
             interpreter.process_page(page)
             layout = device.get_result()
             pieces.extend(out.get_text() for out in layout
                           if isinstance(out, LTTextBoxHorizontal))
         return ''.join(pieces)
     except Exception:
         # Best effort: any failure is reported to the caller as None.
         # (Exception rather than a bare except, so Ctrl-C still works.)
         return None
     finally:
         if fp is not None:
             fp.close()
コード例 #36
0
def process(path):
    """Count category-word hits in the text of a PDF.

    Each whitespace-separated, lower-cased token of every horizontal text
    box is passed to ``count_word`` once per module-level word list
    (``auditor``, ``currency``, ``datesand``, ``generic``, ``genericlong``,
    ``geographic``, ``names``) and the results are summed.

    Args:
        path: path of the PDF file.

    Returns:
        ``[aud, cur, dat, gen, genlong, geo, nam]`` — the seven totals.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    aud = cur = dat = gen = genlong = geo = nam = 0

    # The file must stay open while pages are read; the original closed it
    # right after initialize(), before get_pages() was consumed.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()

            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                for part in x.get_text().lower().split():
                    aud += count_word(part, auditor)
                    cur += count_word(part, currency)
                    dat += count_word(part, datesand)
                    gen += count_word(part, generic)
                    genlong += count_word(part, genericlong)
                    geo += count_word(part, geographic)
                    nam += count_word(part, names)
    return [aud, cur, dat, gen, genlong, geo, nam]
コード例 #37
0
ファイル: test.py プロジェクト: 10000lance/pdf
def parsePDF(pdfPath, pdfPwd='', imgFolderPath='/tmp', saveImgs=False):
    """Parse a PDF and write its text to ``<imgFolderPath>/text.txt``.

    The pages are processed by ``parsePages``; each item it returns is
    written, in order, to ``text.txt`` (UTF-8, overwriting any old file).
    Nothing is written when the document does not allow extraction.

    Args:
        pdfPath: path of the PDF file.
        pdfPwd: password used to initialize the document.
        imgFolderPath: output folder; created if it does not exist.
        saveImgs: forwarded to ``parsePages``.

    Returns:
        None. An ``IOError`` (e.g. missing file) is silently ignored.
    """
    if not os.path.exists(imgFolderPath):
        os.makedirs(imgFolderPath)

    try:
        # ``with`` closes the PDF even when parsing fails part-way.
        with open(pdfPath, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()

            # Link parser and document in both directions.
            parser.set_document(doc)
            doc.set_parser(parser)

            doc.initialize(pdfPwd)

            if doc.is_extractable:
                text = parsePages(doc, imgFolderPath, saveImgs=saveImgs)

                with open('{0}/text.txt'.format(imgFolderPath),
                          'w',
                          encoding='utf-8') as f:
                    f.writelines(text)
    except IOError:
        # The file doesn't exist or a similar problem: deliberately ignored.
        pass
コード例 #38
0
def parse(pdf_path, txt_path):
    """Print and append the text of a PDF's horizontal text boxes to a file.

    Args:
        pdf_path: path of the PDF file to read.
        txt_path: path of the text file to append to; opened once, after
            the extractability check.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction
            (a notice is printed first).
    """
    # Keep the PDF open while pages are processed; closed on exit.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            print('走了')
            raise PDFTextExtractionNotAllowed

        mgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(mgr, device)

        # Open the output once instead of once per text box.
        with open(txt_path, 'a') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + "\n")
コード例 #39
0
ファイル: pdf2txt.py プロジェクト: helloGitHub1993/pdf2txt
def parse(path):
    """Print and append the text of a PDF to ``./res/<pdf name>.txt``.

    The output name is the last "/"-separated component of ``path`` with its
    extension replaced by ``.txt``. The ``res`` directory must already exist.

    Args:
        path: path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Derive the output name from the argument itself rather than from the
    # module-level ``file[i]`` the original relied on.
    # NOTE(review): assumes callers pass file[i] as ``path`` — confirm.
    base_name = path.split('/')[-1]
    res_name = './res/' + os.path.splitext(base_name)[0] + '.txt'

    # Keep the PDF open while pages are processed; closed on exit.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box.
        with open(res_name, 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
コード例 #40
0
ファイル: pdfconv.py プロジェクト: gavinlwz/pdfconv
def parse(inpath, outpath):
    """Run ``parse_section`` on the layout of every page of a PDF.

    Before parsing, ``TMPDIR`` is removed and recreated and any previous
    ``outpath`` is removed (both via the module's ``remove`` helper).

    Args:
        inpath: path of the PDF file to read.
        outpath: output path handed to ``parse_section`` for each page.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    remove(TMPDIR)  # clear the temporary directory
    os.mkdir(TMPDIR)
    remove(outpath)  # clear the previous output

    # Keep the PDF open while pages are processed; closed on exit.
    with open(inpath, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for idx, page in enumerate(doc.get_pages()):
            interpreter.process_page(page)
            layout = device.get_result()
            print("parse", idx)
            parse_section(layout, outpath)
コード例 #41
0
def pdf2txt(path):
    """Return the text of all text boxes and text lines in a PDF.

    Default ``LAParams`` are used. The original looped over parameter names
    looked up via ``locals()``; none of those names were ever local
    variables, so the loop never changed anything and has been removed.

    Args:
        path: path of the PDF file.

    Returns:
        The concatenated text of every ``LTTextBox`` / ``LTTextLine``.
    """
    # ``with`` closes the file even when parsing raises.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            pieces.extend(lt_obj.get_text() for lt_obj in layout
                          if isinstance(lt_obj, (LTTextBox, LTTextLine)))

    return ''.join(pieces)
コード例 #42
0
    def extract_text_from_pdf(self):
        """Return the text of pages ``page_beg``..``page_end`` (1-based).

        The PDF is ``self.filepath_in + '/' + self.nom_fichier``. When
        ``self.page_end`` is 0 it is set to ``self.page_beg`` (single page).
        Each page's text is preceded by an "EXTRACTION DE LA PAGE n" header
        and each text box / text line is followed by a newline.

        Raises:
            IndexError: if the requested range exceeds the document's pages.
        """
        if self.page_end == 0:
            self.page_end = self.page_beg

        # ``with`` closes the PDF once the requested pages have been read.
        with open(self.filepath_in + '/' + self.nom_fichier, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            # 2.0 by default: two chars closer than this are grouped into one.
            laparams.char_margin = 4.0
            # 0.1 by default: a wider gap between words inserts a space.
            laparams.word_margin = 0.3
            # 0.5 by default: lines closer than this are grouped together.
            laparams.line_margin = 0.5

            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            pages = list(doc.get_pages())
            pieces = []
            for i in range(self.page_beg - 1, self.page_end):
                page = pages[i]
                pieces.append("EXTRACTION DE LA PAGE " + str(i + 1) + "\n\n")
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        pieces.append(lt_obj.get_text())
                        pieces.append("\n")

        return ''.join(pieces)
コード例 #43
0
def parsePDF(pathPDF, pathText, fname):
    """Write the text of a PDF to a same-named ``.txt`` file.

    Args:
        pathPDF: directory containing the PDF.
        pathText: directory receiving the text file.
        fname: PDF file name; its last four characters are replaced by
            ``.txt`` to form the output name.

    The text of every text box / text line is written page by page, each
    page followed by a line of 100 ``=`` characters.
    """
    out_name = str(os.path.join(pathText, fname))[0:-4] + '.txt'
    pdf_name = str(os.path.join(pathPDF, fname))

    # Both files are closed on exit so the written text is always flushed.
    with open(out_name, 'w+', encoding='utf-8') as outfile, \
            open(pdf_name, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    outfile.write(lt_obj.get_text())
            outfile.write('=' * 100 + '\n')
コード例 #44
0
def parsePDFByURLandTokenize_PDFMiner(url):
    """Download a PDF, extract its text and return it word-tokenized.

    Default ``LAParams`` are used. The original looped over parameter names
    looked up via ``locals()``; none of them were ever local variables, so
    the loop never changed anything and has been removed.

    Args:
        url: address of the PDF.

    Returns:
        ``word_tokenize`` applied to the concatenated text of all text boxes
        and text lines, or None if the download yields None.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        payload = response.read()

    if payload is None:
        return None

    # pdfminer needs a seekable stream, so parse from memory.
    memory = io.BytesIO(payload)
    parser = PDFParser(memory)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = pdfminer.layout.LAParams()

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    pieces = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        pieces.extend(lt_obj.get_text() for lt_obj in layout
                      if isinstance(lt_obj, (LTTextBox, LTTextLine)))

    return word_tokenize(''.join(pieces))
コード例 #45
0
def Pdf_generation_TF(f, qaStatus=False):
    """Collect words and sentence fragments from a PDF file object.

    Args:
        f: file object handed to ``PDFParser``.
        qaStatus: when true, print a notice and return the sentence
            fragments instead of the ``calc_TF`` result.

    Returns:
        ``allSentances`` (list of str) when ``qaStatus`` is true, otherwise
        ``calc_TF(NativeallWords, num_words)`` (``calc_TF`` is defined
        outside this block).
    """
    allSentances = []
    num_words = 0
    parser = PDFParser(f)
    doc = PDFDocument()
    # Link parser and document in both directions, then initialize with an
    # empty password.
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    NativeallWords = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                num_words += len(lt_obj.get_text().split())
                # NOTE(review): the encoded result is discarded, so this line
                # has no effect on the output.
                lt_obj.get_text().encode("utf8")
                # Sentences are cut out of the object's string form rather
                # than get_text(): characters from index 55 up to just before
                # the last literal backslash-n sequence are kept.
                # NOTE(review): presumably 55 skips a fixed-width prefix of
                # str(lt_obj) — confirm against the pdfminer version in use.
                rawSentance = str(lt_obj)
                # allSentances=sentanceGernaration(rawSentance)
                endPoint = rawSentance.rfind("\\n") - 1
                rawSentance = rawSentance[55:endPoint].replace('\\n',
                                                               '').replace(
                                                                   '\\s', '')
                allSentances.extend(rawSentance.split('.'))
                NativeallWords.extend(lt_obj.get_text().split())

    # print(NativeallWords)
    # print('Sentances from pdf',allSentances)
    if qaStatus:
        print('from PDF ')
        return allSentances
    return calc_TF(NativeallWords, num_words)
コード例 #46
0
    def _make_pages(self, fp):
        """Build the pdfminer pipeline for *fp* and load all of its pages.

        Returns:
            A tuple ``(device, interpreter, pages, rsrcmgr)`` where ``pages``
            is the list of all pages of the document.
        """
        # Parser bound to the file object, plus the document that stores
        # the structure; each must be told about the other.
        pdf_parser = PDFParser(fp)
        pdf_document = PDFDocument()
        pdf_parser.set_document(pdf_document)
        pdf_document.set_parser(pdf_parser)

        # Initialize with an empty password.
        pdf_document.initialize("")

        # Shared-resource manager, layout parameters, aggregating device
        # and the interpreter that feeds pages into the device.
        resource_manager = PDFResourceManager()
        aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

        all_pages = list(pdf_document.get_pages())
        return (aggregator, page_interpreter, all_pages, resource_manager)
コード例 #47
0
def getTextFromFirstPage(filename):
    """Return the text of the first page of a PDF.

    Args:
        filename: path of the PDF file.

    Returns:
        The concatenated text of the text boxes / text lines on the first
        page, or an empty string if the document has no pages.
    """
    pieces = []
    # ``with`` closes the file even when parsing raises.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            pieces.extend(lt_obj.get_text() for lt_obj in layout
                          if isinstance(lt_obj, (LTTextBox, LTTextLine)))
            # Only the first page is wanted.
            break
    return ''.join(pieces)
コード例 #48
0
def convert_pdf_2_text(path, name):
    """Print and append the text of ``path + name`` to a sibling ``.txt`` file.

    Args:
        path: directory prefix, concatenated directly with ``name``.
        name: PDF file name; its last four characters are replaced by
            ``.txt`` to form the output name.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Keep the PDF open while pages are processed; closed on exit.
    with open(path + name, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box.
        with open(path + name[:-4] + ".txt", 'a') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results + "\n")
コード例 #49
0
def parse(out_path='/Users/liamtheron/Desktop/Deloiite/test.txt'):
    """Append the text of the PDF at the module-level ``path`` to a file.

    Args:
        out_path: text file to append to; defaults to the location the
            function previously had hard-coded.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Keep the PDF open while pages are processed; closed on exit.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box.
        with open(out_path, 'a') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        f.write(x.get_text())
                        f.write('\n')
コード例 #50
0
ファイル: Pdf2Text.py プロジェクト: bokmani/PDfTranslate
def parse(file_name, target_name):
    """Translate the text of a PDF and append it to a text file.

    The text of each horizontal text box is passed to ``translate`` and the
    result is appended to ``target_name`` followed by a newline. The
    1-based page number is printed before each page is processed.

    Args:
        file_name: path of the PDF file to read.
        target_name: path of the text file to append to.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Keep the PDF open while pages are processed; closed on exit.
    with open(file_name, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per text box.
        with open(target_name, 'a') as f:
            for page_number, page in enumerate(doc.get_pages(), start=1):
                print('page: ' + str(page_number))
                interpreter.process_page(page)
                # The layout holds the objects parsed from this page; only
                # horizontal text boxes are used here.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        translate_text = translate(results)
                        f.write(translate_text + '\n')
コード例 #51
0
def readPDF(path, topath):
    """Append the text of every horizontal text box in a PDF to a text file.

    Args:
        path: path of the PDF file to read.
        topath: path of the text file to append to.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # Keep the PDF open while pages are processed; closed on exit.
    with open(path, 'rb') as f:
        parser = PDFParser(f)
        pdfFile = PDFDocument()
        # Link parser and document in both directions.
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # No password is supplied.
        pdfFile.initialize()

        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # The original opened ``toPath``, a name that does not exist here
        # (the parameter is ``topath``), and so raised NameError.
        with open(topath, 'a') as out:
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        out.write(x.get_text() + "\n")
コード例 #52
0
ファイル: PdfToTxt.py プロジェクト: highlightyys/edgs
  def readPDF(path, toPath):
      """Append the text of every horizontal text box in a PDF to a text file.

      Args:
          path: path of the PDF file to read.
          toPath: path of the UTF-8 text file to append to.

      Raises:
          PDFTextExtractionNotAllowed: if the document forbids extraction.
      """
      # All page processing happens inside this ``with``: the original left
      # the block (closing the PDF) before the pages were read from it.
      with open(path, "rb") as f:
          parser = PDFParser(f)
          pdfFile = PDFDocument()
          # Link parser and document in both directions.
          parser.set_document(pdfFile)
          pdfFile.set_parser(parser)
          # No password is supplied.
          pdfFile.initialize()

          if not pdfFile.is_extractable:
              raise PDFTextExtractionNotAllowed

          manager = PDFResourceManager()
          laparams = LAParams()
          device = PDFPageAggregator(manager, laparams=laparams)
          interpreter = PDFPageInterpreter(manager, device)

          # Open the output once instead of once per text box.
          with open(toPath, 'a', encoding='utf-8') as out:
              for page in pdfFile.get_pages():
                  interpreter.process_page(page)
                  layout = device.get_result()
                  for x in layout:
                      if isinstance(x, LTTextBoxHorizontal):
                          out.write(x.get_text() + "\n")
コード例 #53
0
ファイル: szse_test.py プロジェクト: ziyi21/python_project
def parse(read_path):
    """Prepare a pdfminer document, interpreter and device for a PDF.

    Args:
        read_path: path of the PDF file, opened in binary mode. The handle
            is not closed here; the returned objects are built on top of it.

    Returns:
        The tuple ``(doc, interpreter, device)``.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    pdf_file = open(read_path, 'rb')
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument()
    # Link parser and document in both directions.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    # No password is supplied.
    document.initialize()

    # Guard clause: stop when the document does not allow extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resource_manager = PDFResourceManager()
    layout_params = LAParams()
    aggregator = PDFPageAggregator(resource_manager, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
    return document, page_interpreter, aggregator
コード例 #54
0
def pdf_to_string(pdf_file):
    """Return the text of all text boxes and text lines in a PDF.

    Layout analysis uses ``line_margin = 0.3`` and ``word_margin = 0.3``.

    Args:
        pdf_file: path of the PDF file.

    Returns:
        The concatenated text of every ``LTTextBox`` / ``LTTextLine``.
    """
    # ``with`` closes the file, which the original never did.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_margin = 0.3
        laparams.word_margin = 0.3
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            pieces.extend(lt_obj.get_text() for lt_obj in layout
                          if isinstance(lt_obj, (LTTextBox, LTTextLine)))

    return ''.join(pieces)
コード例 #55
0
ファイル: dumppdf.py プロジェクト: frid/PythonPool
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None):
    """Write one line per outline entry of a PDF to ``outfp``.

    Each line is ``repr((level, title, dest, pageno))`` where ``pageno`` is
    the 0-based index of the destination page, or None when the entry has
    no destination.

    Args:
        outfp: writable text stream.
        fname: path of the PDF file.
        objids, pagenos, dumpall, codec: accepted but not used here.
        password: password used to initialize the document.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; the try/finally
    # closes the handle even when the document has no outlines or is broken.
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(doc, fp)
        doc.initialize(password)
        # Map each page's id to its position in the document.
        pages = dict(
            (page.pageid, pageno)
            for (pageno, page) in enumerate(doc.get_pages()))
        for (level, title, dest, a, se) in doc.get_outlines():
            pageno = None
            if dest:
                dest = resolve1(doc.lookup_name('Dests', dest))
                if isinstance(dest, dict):
                    dest = dest['D']
                pageno = pages[dest[0].objid]
            outfp.write(repr((level, title, dest, pageno)) + '\n')
        parser.close()
    finally:
        fp.close()
    return
コード例 #56
0
def parse_pdf(path):
    """Return the value following the "准考证号:" label in a PDF.

    The first horizontal text box whose text starts with that label wins;
    the label is dropped and all newlines are removed from the remainder.

    Args:
        path: path of the PDF file.

    Returns:
        The extracted string, or None if no text box starts with the label.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids text extraction.
    """
    # ``with`` closes the PDF on every exit path, including the early return.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    if results[:5] == "准考证号:":
                        return results[5:].replace("\n", "")
    return None
コード例 #57
0
    def load_file_text(self, import_file):
        """Import one odt, docx, epub, pdf, html/htm or plain-text file.

        The extracted text is inserted into the ``source`` database table,
        appended to ``self.source``, and a confirmation line is added to
        ``self.parent_textEdit``. A warning dialog is shown and nothing is
        stored when the file cannot be read, yields no text, or has the same
        file name as an already imported source.

        Args:
            import_file: path of the file to import ("/"-separated).
        """

        text = ""

        # Import from odt
        if import_file[-4:].lower() == ".odt":
            text = self.convert_odt_to_text(import_file)
        # Import from docx
        if import_file[-5:].lower() == ".docx":
            document = opendocx(import_file)
            list_ = getdocumenttext(document)
            text = "\n".join(list_)
        # Import from epub
        if import_file[-5:].lower() == ".epub":
            book = epub.read_epub(import_file)
            for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                bytes_ = d.get_body_content()
                string = bytes_.decode('utf-8')
                text += html_to_text(string) + "\n"
        # Import from pdf
        if import_file[-4:].lower() == '.pdf':
            # ``with`` closes the PDF, which the original never did.
            with open(import_file, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
                # Potential error with an encrypted PDF.
                doc.initialize('')
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                laparams.char_margin = 1.0
                laparams.word_margin = 1.0
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    layout = device.get_result()
                    for lt_obj in layout:
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            text += lt_obj.get_text()
        # Import from html
        if import_file[-5:].lower() == ".html" or import_file[-4:].lower() == ".htm":
            with open(import_file, "r") as sourcefile:
                text = html_to_text(sourcefile.read())
            # The original always popped up "0 lines not imported" here: its
            # error counter was never incremented. That dialog is dropped.
        # Nothing extracted so far: try importing as a plain text file.
        if text == "":
            try:
                with open(import_file, "r") as sourcefile:
                    text = sourcefile.read()
                # Strip a leading byte-order mark (typical of Notepad files).
                # The original compared a 6-character slice with this
                # 1-character string, so the mark was never removed.
                if text.startswith("\ufeff"):
                    text = text[1:]
            except Exception as e:
                QtWidgets.QMessageBox.warning(None, 'Warning', "Cannot import " + str(import_file) + "\n" + str(e))
                return
        # Import of text file did not work
        if text == "":
            # No exception object exists on this path (the original referenced
            # an unbound ``e`` here), so only the file name is reported.
            QtWidgets.QMessageBox.warning(None, 'Warning', "Cannot import " + str(import_file))
            return
        # Final checks: check for duplicated filename and update model, widget and database
        nameSplit = import_file.split("/")
        filename = nameSplit[-1]
        if any(d['name'] == filename for d in self.source):
            QtWidgets.QMessageBox.warning(None, 'Duplicate file', "Duplicate filename.\nFile not imported")
            return
        entry = {'name': filename, 'id': -1, 'fulltext': text, 'mediapath': None, 'memo': "",
        'owner': self.settings['codername'], 'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        cur = self.settings['conn'].cursor()
        cur.execute("insert into source(name,fulltext,mediapath,memo,owner,date) values(?,?,?,?,?,?)",
            (entry['name'],  entry['fulltext'], entry['mediapath'], entry['memo'], entry['owner'], entry['date']))
        self.settings['conn'].commit()
        # Fetch the row id SQLite assigned to the new source.
        cur.execute("select last_insert_rowid()")
        id_ = cur.fetchone()[0]
        entry['id'] = id_
        self.parent_textEdit.append(entry['name'] + " imported.")
        self.source.append(entry)
コード例 #58
0
'''
使用pdfminer3k读取pdf文档
'''
import os
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams

IN_PUT_PATH = '肖申克的救赎.pdf'
OUT_PUT = '肖申克的救赎.txt'

# Both files are managed by one `with`, so they are closed even if parsing
# fails part-way. The PDF stays open for the whole loop because the parser
# was built on the open file object.
# The output is opened once in append mode (existing content is kept, as
# before) instead of being re-opened, and never closed, for every text
# object. UTF-8 is requested explicitly so writing does not depend on the
# platform's default encoding.
with open(IN_PUT_PATH, 'rb') as fp, open(OUT_PUT, 'a', encoding='utf-8') as f:
    parser = PDFParser(fp)  # parser associated with the PDF file
    doc = PDFDocument()  # document object that stores the PDF structure
    parser.set_document(doc)  # link parser and document in both directions
    doc.set_parser(parser)
    doc.initialize('')  # initialize the document with an empty password

    resource = PDFResourceManager()  # resource manager holding shared resources
    laparam = LAParams()  # layout-analysis parameters
    device = PDFPageAggregator(resource, laparams=laparam)  # page aggregator
    interpreter = PDFPageInterpreter(resource, device)  # page interpreter

    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()  # layout collected for the page just processed
        for out in layout:
            # Only layout objects exposing get_text() are echoed and saved.
            if hasattr(out, 'get_text'):
                text = out.get_text()
                print(text)
                f.write(text)
コード例 #59
0
def parsePdf(fp):
    '''Extract the text of every horizontal text box in a PDF.

    Args:
        fp: File object for the PDF; it is passed straight to ``PDFParser``
            and is not closed by this function.

    Returns:
        The text of all ``LTTextBoxHorizontal`` objects, in the order they
        are encountered, concatenated into one string. An empty string is
        returned when no such text box is found.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    '''
    # Build the parser/document pair and link them in both directions.
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)

    # Initialize without supplying a password.
    doc.initialize()

    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager, layout parameters, aggregating device and interpreter.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Collect every text box instead of overwriting a single variable, so the
    # result covers the whole document and is defined even when a page (or
    # the whole document) contains no text boxes.
    collected = []
    for page in doc.get_pages():
        interpreter.process_page(page)
        # Layout object holding what was parsed from this page.
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                text = x.get_text()
                print(text)
                collected.append(text)

    return "".join(collected)
コード例 #60
0
def read_pdf(resume_file_path, txt_output_path):
    """Extract the horizontal text boxes of a PDF into a text file.

    Any existing file at ``txt_output_path`` is removed first. The text of
    each ``LTTextBoxHorizontal`` is cleaned with ``bad_code_clean`` and
    ``space_process`` and then handed, together with the output file opened
    in append mode, to ``recursive_process_UnicodeEncodeError`` (presumably
    the helper that writes it -- confirm against its definition). The value
    that helper returns is printed.

    If interpreting a page raises ``KeyError``, the error is printed, the
    path is passed to ``read_failed_logs`` and the remaining pages are
    skipped.

    Args:
        resume_file_path: Path of the PDF to read.
        txt_output_path: Path of the text output file.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is not
            extractable.
    """
    # `with` closes the PDF on every exit path; previously the handle leaked
    # when extraction was refused or any helper raised.
    with open(resume_file_path, 'rb') as resume_file:
        # Build the parser/document pair and link them in both directions.
        parser = PDFParser(resume_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialize without supplying a password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregating device, interpreter.
        resource_manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource_manager, laparams=laparams)
        interpreter = PDFPageInterpreter(resource_manager, device)

        # Start from a clean output file: text is appended box by box below.
        if os.path.exists(txt_output_path):
            os.remove(txt_output_path)
            print('exist and remove')

        for page in doc.get_pages():
            try:
                interpreter.process_page(page)
            except KeyError:
                # Record the failure and give up on the rest of this PDF.
                error_info_string = str(sys.exc_info())
                print(resume_file_path + ":" + error_info_string)
                read_failed_logs(resume_file_path)
                break

            # Layout object holding what was parsed from this page.
            layout = device.get_result()

            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue

                # No explicit encoding is set on purpose: the original kept
                # the platform default and relies on the helper below.
                with open(txt_output_path, 'a') as f:
                    results = x.get_text()

                    all_lines_string = bad_code_collection_read()
                    results = bad_code_clean(all_lines_string, results)

                    results = space_process(results)

                    results = recursive_process_UnicodeEncodeError(
                        f, results)

                    print(results)