コード例 #1
1
def getPageLayouts(f1):
    '''Parse a PDF and return one layout object per page.

    NOTE(review): the ``f1`` argument is not read. The file is opened from
    the module-level name ``fpath`` and the password comes from the
    module-level name ``pss_wd``; both must be defined by the surrounding
    module -- confirm this is intended.

    Returns:
        A list holding the result of ``device.get_result()`` for every page.
        The list is empty when the document does not allow text extraction.

    Raises:
        IOError: if the file cannot be opened or read.
    '''
    # Bound up front so every exit path has a defined result.
    page_layouts = []
    try:
        # The parser and doc pair form a "pipe" of sorts.
        with open(fpath, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)

            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # Pages are processed while the file is still open
                # (presumably they are parsed from it on demand).
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        raise IOError("issue with loading file, please try again")
    # No ``return`` inside ``finally``: that would silently discard any
    # exception raised above. The ``with`` block already closes the file.
    return page_layouts
コード例 #2
0
ファイル: pdf.py プロジェクト: MikaYuoadas/Docbucket
class Pdf(object):
    """Wrapper around a pdfminer document built from an open PDF file."""

    def __init__(self, pdf_file):
        """Attach a parser to *pdf_file* and initialize the document.

        *pdf_file* is handed straight to ``PDFParser`` (presumably a file
        object opened in binary mode -- confirm against callers).
        """
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # initialize() has to be *called*, and only once the parser is
        # attached; the empty string means "no password".
        self._doc.initialize('')

    @property
    def pages(self):
        """Number of pages yielded by the document."""
        return sum(1 for _ in self._doc.get_pages())

    def to_text(self):
        """Run every page through a TextConverter and return the text.

        The converter output is decoded as UTF-8, ignoring undecodable bytes.
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
コード例 #3
0
ファイル: pdftotext.py プロジェクト: mayhewsw/projects
def pdf_to_text(filename):
    """Convert the PDF at *filename* to text, with page markers.

    Prints the page count, then writes ``START PAGE n`` / ``END PAGE n``
    lines (0-based) around the converted text of each page.

    Returns:
        The full contents of the output buffer.
    """
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    try:
        # ``with`` closes the file even when parsing fails part-way.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            interpreter = PDFPageInterpreter(rsrc, device)

            # Materialize once: the list serves both the count and the loop.
            pages = list(doc.get_pages())
            print("There are: " + str(len(pages)) + " pages")

            for i, page in enumerate(pages):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()

    return outfp.getvalue()
コード例 #4
0
ファイル: dumppdf.py プロジェクト: frid/PythonPool
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
  """Dump selected parts of the PDF file *fname* to *outfp*.

  - each object id in *objids* is written raw (codec 'raw'), decoded
    (codec 'binary') when it is a PDFStream, otherwise via dumpxml;
  - the attributes of every page whose 0-based index is in *pagenos*;
  - all objects when *dumpall* is true;
  - the trailers when none of the above was requested.

  A trailing newline is written unless *codec* is 'raw' or 'binary'.
  """
  doc = PDFDocument()
  fp = open(fname, 'rb')
  try:
    # The parser is constructed with the document and the file together.
    parser = PDFParser(doc, fp)
    doc.initialize(password)
    if objids:
      for objid in objids:
        obj = doc.getobj(objid)
        if isinstance(obj, PDFStream) and codec == 'raw':
          outfp.write(obj.get_rawdata())
        elif isinstance(obj, PDFStream) and codec == 'binary':
          outfp.write(obj.get_data())
        else:
          dumpxml(outfp, obj, codec=codec)
    if pagenos:
      for (pageno,page) in enumerate(doc.get_pages()):
        if pageno in pagenos:
          dumpxml(outfp, page.attrs)
    if dumpall:
      dumpallobjs(outfp, doc, codec=codec)
    if (not objids) and (not pagenos) and (not dumpall):
      dumptrailers(outfp, doc)
  finally:
    # Close the input even when dumping raises.
    fp.close()
  if codec not in ('raw','binary'):
    outfp.write('\n')
  return
コード例 #5
0
ファイル: PDF_Parser.py プロジェクト: samdavey/Random
    def load( self, open_file ):
        """Read annotations and text from every page of *open_file*.

        Resets ``self.fields`` and ``self.text``, then stores the value of
        ``self._get_text(device)`` for each page in ``self.text`` under its
        1-based page number. Pages that carry annotations are handed to
        ``self._build_annotations``.

        Raises:
            PDFTextExtractionNotAllowed: if the document is not extractable.
        """
        self.fields = {}
        self.text = {}

        # Parser and document are wired to each other before initialization;
        # the empty string means "no password".
        pdf_parser = PDFParser(open_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resources, layout parameters, aggregator and interpreter.
        resources = PDFResourceManager()
        layout_params = LAParams()
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page_number, page in enumerate(document.get_pages(), 1):
            page_interpreter.process_page(page)
            if page.annots:
                self._build_annotations(page)
            self.text[page_number] = self._get_text(aggregator)
コード例 #6
0
ファイル: PdfParser.py プロジェクト: hcouch21/styloproject
    def parse(self, path):
        """Extract the text of the PDF at *path* and wrap it in a Sample.

        Returns:
            ``Sample(path, None, text)`` where *text* is the converter output.

        Raises:
            NotImplementedError: if *path* is a directory.
        """
        # Directory
        if os.path.isdir(path):
            raise NotImplementedError()

        # File
        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
        # PDF data is binary; text mode can corrupt it on some platforms.
        fp = open(path, 'rb')
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
        finally:
            # Release the converter and the input even when parsing fails.
            device.close()
            fp.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
コード例 #7
0
ファイル: dumppdf.py プロジェクト: Adniel/ComparePdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of the PDF file *fname* to *outfp* as XML.

    - every object whose id is in *objids*;
    - for every page whose 0-based index is in *pagenos*: its content
      streams when *codec* is set, otherwise its attributes;
    - all objects when *dumpall* is true;
    - the trailers when none of the above was requested.

    A trailing newline is written unless *codec* is 'raw' or 'binary'.
    """
    doc = PDFDocument()
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno,page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    finally:
        # Close the input even when dumping raises.
        fp.close()
    if codec not in ('raw','binary'):
        outfp.write('\n')
    return
コード例 #8
0
def extractContent(file):
    """Return the text of the PDF at path *file*.

    Progress is printed to stdout: a start marker, one line per page
    (0-based index) and a final ``EOF``.
    """
    print("extractContent")

    fp = open(file, 'rb')
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # The document is initialized before its pages are read; the empty
        # string means "no password".
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        outfp = StringIO.StringIO()

        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for i, page in enumerate(doc.get_pages()):
                print("page=" + str(i))
                if page is not None:
                    interpreter.process_page(page)
            print("EOF")
        finally:
            device.close()
    finally:
        # Close the input even when conversion fails.
        fp.close()

    return outfp.getvalue()
コード例 #9
0
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally save it next to it."""

    def __init__(self, filename):
        """Open *filename* and build an initialized document from it."""
        self.__filename = filename

        # NOTE(review): this handle is never closed; presumably the parser
        # keeps reading from it when pages are requested later -- confirm
        # before adding a close here.
        fp = open(self.__filename, 'rb')
        parser = PDFParser(fp)
        self.__doc = PDFDocument()
        parser.set_document(self.__doc)
        self.__doc.set_parser(parser)
        self.__doc.initialize('')

    def writeToTxt(self):
        """Write the extracted text to ``<filename without extension>.txt``."""
        import os

        text = self.getString()
        # Swap only the real extension. A plain ``replace(".pdf", ".txt")``
        # leaves names such as "X.PDF" unchanged, which would open the source
        # PDF itself for writing, and it also rewrites ".pdf" inside
        # directory names.
        txt_path = os.path.splitext(self.__filename)[0] + ".txt"
        with open(txt_path, "w") as txtFile:
            txtFile.write(text.encode('ascii','replace').decode("utf-8"))

    def getString(self):
        """Run every page through a TextConverter and return the buffer."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self.__doc.get_pages():
            interpreter.process_page(page)
        return string.getvalue()
コード例 #10
0
ファイル: autosumpdf.py プロジェクト: suriyan/autosum
def convert_pdf_to_txt(path):
    """Return the text extracted from the PDF file at *path*."""
    resources = PDFResourceManager()
    text_buffer = StringIO()
    layout_params = LAParams()
    converter = TextConverter(resources, text_buffer, laparams=layout_params)

    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)

        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        # Feed every page of the document to the converter.
        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)

        # The buffer is read before the converter and buffer are closed.
        extracted = text_buffer.getvalue()

    converter.close()
    text_buffer.close()

    return extracted
コード例 #11
0
ファイル: MyPdfMiner.py プロジェクト: i11uminator/bookservice
 def ParseAllPages(self, filepath):
     """Run every page of the PDF at *filepath* through the interpreter.

     Stores *filepath* on ``self.filepath``. Pages are processed with a
     plain ``PDFDevice``; nothing is returned.

     Raises:
         PDFTextExtractionNotAllowed: if the document is not extractable.
     """
     self.filepath = filepath
     # ``with`` guarantees the file is closed, including on errors.
     with open(filepath, 'rb') as fp:
         # Create a PDF parser object associated with the file object.
         parser = PDFParser(fp)
         # Create a PDF document object that stores the document structure.
         doc = PDFDocument()
         # Connect the parser and document objects.
         parser.set_document(doc)
         doc.set_parser(parser)
         # Supply the password for initialization.
         # (If no password is set, give an empty string.)
         password = ""
         doc.initialize(password)
         # Check if the document allows text extraction. If not, abort.
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         # Create a PDF resource manager object that stores shared resources.
         rsrcmgr = PDFResourceManager()
         # Create a PDF device object.
         device = PDFDevice(rsrcmgr)
         # Create a PDF interpreter object.
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         # Process each page contained in the document.
         for page in doc.get_pages():
             interpreter.process_page(page)
コード例 #12
0
def parse_pdf(pdf_url):
    """Download the PDF at *pdf_url* and return its text, page by page.

    Returns:
        A list with one entry per page. Each entry is a list holding, for
        every LTTextBox / LTTextLine with non-blank text, the result of
        ``get_text().splitlines()``.
    """
    pdf_bytes = urllib.request.urlopen(pdf_url).read()
    pdf_parser = PDFParser(io.BytesIO(pdf_bytes))
    document = PDFDocument()
    pdf_parser.set_document(document)
    #Warning sometimes, error in pdf?
    document.set_parser(pdf_parser)
    document.initialize('')

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    text_types = (LTTextBox, LTTextLine)
    pages_text = []
    for page in document.get_pages():
        page_lines = []
        pages_text.append(page_lines)
        page_interpreter.process_page(page)
        for lt_obj in aggregator.get_result():
            if not isinstance(lt_obj, text_types):
                continue
            text = lt_obj.get_text()
            # Skip objects whose text is empty or whitespace only.
            if text.strip():
                page_lines.append(text.splitlines())
    return pages_text
コード例 #13
0
ファイル: dumppdf.py プロジェクト: joshmgrant/pdfminer
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write one ``repr((level, title, dest, pageno))`` line per outline entry.

    *pageno* is the 0-based index of the page the entry points to, or None
    when no destination could be resolved. *objids*, *pagenos*, *dumpall*
    and *codec* are accepted but not used here.
    """
    doc = PDFDocument()
    fp = file(fname, 'rb')
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize(password)

    # Map each page's id to its position in the document.
    page_index = {}
    for (position, pdf_page) in enumerate(doc.get_pages()):
        page_index[pdf_page.pageid] = position

    for (level,title,dest,a,se) in doc.get_outlines():
        pageno = None
        if dest:
            # Named destination: resolve it through the 'Dests' name tree.
            dest = resolve1( doc.lookup_name('Dests', dest) )
            if isinstance(dest, dict):
                dest = dest['D']
            pageno = page_index[dest[0].objid]
        elif a:
            # Action: only '/GoTo' actions that carry a 'D' entry are used.
            action = a.resolve()
            if isinstance(action, dict):
                subtype = action.get('S')
                is_goto = subtype and repr(subtype) == '/GoTo'
                if is_goto and action.get('D'):
                    dest = action['D']
                    pageno = page_index[dest[0].objid]
        outfp.write(repr((level,title,dest,pageno))+'\n')

    parser.close()
    fp.close()
    return
コード例 #14
0
ファイル: statement2csv.py プロジェクト: jlas/misc
def pdf_to_csv(filename):
    """Convert the PDF at *filename* with CsvConverter and return the output.

    ``START PAGE n`` / ``END PAGE n`` lines (0-based) are written around the
    converted content of each page.
    """
    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    resources = PDFResourceManager()
    csv_buffer = StringIO()
    # utf-8 because the author's test documents are utf-8
    # (note from the original: utf-8 is the default codec)
    converter = CsvConverter(resources, csv_buffer, codec="utf-8",
                             laparams=LAParams())

    document = PDFDocument()
    pdf_file = open(filename, 'rb')
    pdf_parser = PDFParser(pdf_file)
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    page_interpreter = PDFPageInterpreter(resources, converter)
    page_index = 0
    for page in document.get_pages():
        csv_buffer.write("START PAGE %d\n" % page_index)
        if page is not None:
            page_interpreter.process_page(page)
        csv_buffer.write("END PAGE %d\n" % page_index)
        page_index += 1

    converter.close()
    pdf_file.close()

    return csv_buffer.getvalue()
コード例 #15
0
ファイル: pdfInvoiceMiner.py プロジェクト: vinovator/Vinlab
def read_invoice_pdfminer3k(pdfFile):
    """Read one invoice PDF and record its client and invoice number.

    *pdfFile* is a file name inside the module-level ``invoice_path``
    directory. The client is extracted from the PDF text, the invoice number
    from the file name; both are printed and passed to ``write_excel``.
    """
    text_parts = []
    # os.path.join inserts the platform's separator instead of a hard-coded
    # backslash; ``with`` closes the file even if parsing fails.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())

    # Joined once at the end instead of repeated string concatenation.
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
コード例 #16
0
ファイル: iocp.py プロジェクト: sebdraven/ioc_parser
    def parse_pdf_pdfminer(self, f, fpath):
        """Convert each page of the PDF file object *f* and parse its text.

        Every page is converted to text separately and passed, UTF-8
        encoded, to ``self.parse_page`` together with *fpath* and the
        1-based page number. The handler's header and footer are printed
        around the run. Any exception other than KeyboardInterrupt /
        SystemExit is reported through ``self.handler.print_error``.
        """
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            parser = PDFParser(f)
            doc = PDFDocument(caching=True)

            parser.set_document(doc)
            doc.set_parser(parser)
            # The document is initialized before its pages are read; the
            # empty string means "no password".
            doc.initialize('')
            for page_num, page in enumerate(doc.get_pages(), 1):
                # A fresh buffer and converter per page keeps pages separate.
                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                self.parse_page(fpath, bytes(data,'UTF-8'), page_num)
                retstr.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
コード例 #17
0
ファイル: pdf.py プロジェクト: hsoft/pdfmasher
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    Progress is reported through ``j.iter_with_progress``.

    Returns:
        ``(pages, all_elements)``: one ``Page(width, height)`` per page, and
        the elements of all pages in order. Each element carries the 0-based
        ``page`` it came from and its ``order`` within that page.
    """
    pages = []
    all_elements = []
    # ``with`` closes the file once all pages are processed, or on error.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialized up front and handed to the progress iterator.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements.extend(elements)
    return pages, all_elements
コード例 #18
0
ファイル: pdf2text.py プロジェクト: zaim/bukutip
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Process pages of the PDF file object *fp* with *device*.

    Args:
        rsrcmgr: resource manager passed to the page interpreter.
        device: device that receives the interpreted pages.
        fp: file object the parser reads from.
        pagenums: optional collection of 0-based page indexes to process;
            when empty or None every page is eligible.
        maxpages: processing stops after the page with index
            ``maxpages - 1``; a false value disables the limit.
        password: document password ('' for none).

    Returns:
        A dict mapping the 0-based index of every page in the document to
        its page object.

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    pages = dict(enumerate(doc.get_pages()))
    # Iterate in page order explicitly: the ``maxpages`` cut-off below only
    # makes sense when pages arrive in ascending order, which a plain dict
    # does not guarantee on every interpreter. Keys are unique ints, so the
    # sort never compares page objects.
    for num, page in sorted(pages.items()):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(page)
        if maxpages and maxpages <= num + 1:
            break
    return pages
コード例 #19
0
class PDFController(object):
    """Hold a PDF file, parse its layout and search it for text."""

    def __init__(self, fd=None, password=''):
        """Create the pdfminer objects and optionally open *fd*.

        Args:
            fd: a file object (anything with ``read``) or a path; optional.
            password: document password ('' for none).
        """
        self.fd = fd
        self.password = password
        self.parsed = False
        self.document = PDFDocument()
        self.laparams = LAParams()
        self.rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(self.rsrcmgr, laparams=self.laparams)
        self.layout = []

        if fd:
            self.open(fd, password)

    def open(self, fd, password=''):
        """Remember *password* and use *fd*, opening it when it is a path."""
        self.password = password
        # PDFs are binary data, so paths are opened in binary mode.
        self.fd = fd if hasattr(fd, 'read') else open(fd, 'rb')

    def close(self):
        """Close the file, if any, and mark the controller as not parsed."""
        if self.fd:
            self.fd.close()
            self.fd = None
        self.parsed = False

    def parse(self):
        """Parse the opened file and compute the layout if not done yet.

        Raises:
            PDFTextExtractionNotAllowed: if the document is not extractable
                (the file is closed first).
        """
        parser = PDFParser(self.fd)
        parser.set_document(self.document)
        self.document.set_parser(parser)
        self.document.initialize(self.password)
        if not self.document.is_extractable:
            self.fd.close()
            raise PDFTextExtractionNotAllowed

        if not self.layout:
            self.layout = self._get_layout()

        self.parsed = True

    def _get_layout(self):
        """Process every page and return the layout of the last one.

        NOTE(review): the result is overwritten on every iteration, so only
        the final page's layout is kept -- confirm whether all pages were
        meant to be collected.
        """
        layout = []
        interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
        for page in self.document.get_pages():
            interpreter.process_page(page)
            layout = self.device.get_result()
        return layout

    def lookup_term(self, term, ignore_case=True):
        """Return the indexes of layout items whose text contains *term*.

        Items without a ``get_text`` method are skipped. With *ignore_case*
        both the term and the item text are lower-cased before comparing.
        """
        # Normalize the search term once rather than once per item.
        needle = term.lower() if ignore_case else term
        indexes = []
        for i, v in enumerate(self.layout):
            if not hasattr(v, 'get_text'):
                continue
            text = v.get_text()
            if ignore_case:
                text = text.lower()
            if needle in text:
                indexes.append(i)
        return indexes

    def __del__(self):
        # ``fd`` is None when nothing was opened or after close(); it can be
        # missing entirely if __init__ never ran.
        fd = getattr(self, 'fd', None)
        if fd is not None:
            fd.close()

    def __repr__(self):
        return '<PDFController> %s, %s' % ('Open file "%s"' % self.fd.name if self.fd else 'No file opened',
                                             'not parsed' if not self.parsed else 'parsed')
コード例 #20
0
ファイル: showcells.py プロジェクト: aliounedia/scraptils
def pdf2csv(fp):
    """Draw the cell grid found on each page of a PDF into ``out<N>.png``.

    For every page the x and y coordinates of all LTRect / LTLine objects
    are collected, passed through ``filterclose`` and printed. A rectangle
    is then drawn for every pair of neighbouring grid lines that are more
    than 5 units apart in both directions, and the image is saved as
    ``out<pageno>.png`` (0-based page number).

    Raises:
        PDFTextExtractionNotAllowed: if the document is not extractable.
    """
    # Wire the parser and the document together; '' means "no password".
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()

        xs = []
        ys = []
        for item in layout:
            if type(item) not in (LTRect, LTLine):
                continue
            xs.append(int(item.x0))
            xs.append(int(item.x1))
            # y values are measured from the top of the page.
            ys.append(int(layout.height - item.y0))
            ys.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(xs)))
        vlines = filterclose(sorted(set(ys)))
        print(hlines)
        print(vlines)
        print((layout.width, layout.height))

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        for row in range(len(vlines) - 1):
            # Skip gaps too small to be a cell.
            if vlines[row + 1] - vlines[row] <= 5:
                continue
            for col in range(len(hlines) - 1):
                if hlines[col + 1] - hlines[col] <= 5:
                    continue
                top_left = (int(hlines[col]), int(vlines[row]))
                bottom_right = (int(hlines[col + 1]), int(vlines[row + 1]))
                draw.rectangle([top_left, bottom_right], outline=1)
        del draw

        out_png = open("out%s.png" % pageno, 'wb')
        im.save(out_png, "PNG")
        out_png.close()
コード例 #21
0
def pdf2txt(pdf_file_name):
    """Return the text of all text boxes and text lines in a PDF.

    The text of every page is concatenated in page order. (Previously the
    function returned from inside the page loop, so only the first page was
    ever read.)

    Returns:
        The extracted text, or None when the file cannot be opened or the
        document cannot be initialized; both failures are logged.
    """
    # Open the pdf file in read bytes mode.
    try:
        fp = open(pdf_file_name , 'rb')
    except Exception:
        # Log the error or warning in the logfile.
        logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
        logging.warning(traceback.format_exc())
        return

    try:
        # Create a parser object which is associated with the file object.
        parser = PDFParser(fp)

        # Create a PDFDocument object that stores the document structure.
        doc = PDFDocument()

        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Supply the password here, if the PDF is protected.
        try:
            doc.initialize('')
        except Exception:
            # Log the error or warning in the logfile.
            logging.info("WARNING found while opening the PDF file '" + pdf_file_name + "' of the format Textbox")
            logging.warning(traceback.format_exc())
            return

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()

        # Set parameters for analysis.
        laparams = LAParams()

        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams = laparams)

        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect the text of every page, then join once at the end.
        text_parts = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
        return "".join(text_parts)
    finally:
        # Close the input on every exit path.
        fp.close()
コード例 #22
0
ファイル: pdf2csv.py プロジェクト: GustavePate/distark
def pdf2csv(pdf):
    """Write the table cells of the PDF at path *pdf* to stdout as CSV.

    For every page the x and y coordinates of all LTRect objects are
    collected, printed and passed through ``filterclose``. For every pair of
    neighbouring grid lines more than 10 units apart, the text returned by
    ``get_region`` is whitespace-normalized and added to the row; one row is
    written per horizontal band.
    """
    fp = open(pdf, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    writer = UnicodeWriter(sys.stdout)
    for pageno, page in enumerate(doc.get_pages()):
        print("traitement page %s" % pageno)
        interpreter.process_page(page)
        layout = device.get_result()
        print(layout)

        xs = []
        ys = []
        for item in layout:
            if type(item) is not LTRect:
                continue
            xs.append(int(item.x0))
            xs.append(int(item.x1))
            # y values are measured from the top of the page.
            ys.append(int(layout.height - item.y0))
            ys.append(int(layout.height - item.y1))

        print(xs)
        print(ys)

        hlines = filterclose(sorted(set(xs)))
        vlines = filterclose(sorted(set(ys)))
        for band in range(len(vlines) - 1):
            # Skip gaps too small to be a row.
            if vlines[band + 1] - vlines[band] <= 10:
                continue
            row = []
            for col in range(len(hlines) - 1):
                if hlines[col + 1] - hlines[col] <= 10:
                    continue
                # get_region takes a 1-based page number; the cell is
                # shrunk by one unit on the left and right edges.
                cell_text = get_region(pdf,
                                       pageno + 1,
                                       hlines[col] + 1,
                                       vlines[band],
                                       hlines[col + 1] - 1,
                                       vlines[band + 1])
                row.append(' '.join(cell_text.split()))
            writer.writerow(row)
    fp.close()
コード例 #23
0
ファイル: core.py プロジェクト: SkyTruth/fracfocustools
    def parse_pdf (self):
        """Build ``self.report`` from the PDF bytes in ``self.raw_pdf``.

        The pages are walked in reverse document order twice: once to pick
        the sheet type (Sheet2 when any page contains "Job Start Date:",
        otherwise Sheet1) and once to feed every page layout to the sheet.
        Exceptions are logged through ``self.logger`` instead of raised.

        Returns:
            ``self.report``, or None when ``self.logger.has_error()`` is
            true afterwards.
        """
        self.report = Report (self.logger)
        pdf_stream = StringIO(self.raw_pdf)
        parser = PDFParser(pdf_stream)
        doc = PDFDocument()
        parser.set_document(doc)
        try:
            doc.set_parser(parser)
            doc.initialize('')
            if not doc.is_extractable:
                raise RuntimeError("PDFTextExtractionNotAllowed")

            rsrcmgr = PDFResourceManager()
            # Defaults noted by the original author: char_margin 1.0,
            # word_margin 0.2, line_margin 0.3, line_overlap 0.5.
            laparams = LAParams(char_margin=0.01,
                                word_margin=0.2,
                                line_margin=0.3,
                                line_overlap=0.5)
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            pages = list(doc.get_pages())
            pages.reverse()

            # Detect pdf format
            sheet = None
            for page in pages:
                interpreter.process_page(page)
                if self.find_pdf_text(device.get_result(), "Job Start Date:"):
                    sheet = Sheet2()
                    break
            if sheet is None:
                sheet = Sheet1()

            # Stack the page layouts, offsetting each by the pages before it.
            page_y_offset = 0
            for page in pages:
                interpreter.process_page(page)
                layout = device.get_result()
                sheet.add_ltcontainer (layout, page_y_offset)
                page_y_offset += layout.y1

            self.report.extract_data (sheet)
        except Exception:
            self.logger.error('%s' % traceback.format_exc())

        if self.logger.has_error():
            return None
        return self.report
コード例 #24
0
ファイル: pdf.py プロジェクト: oubiwann/tharsk
class PDFScraper(object):
    """Convert a PDF file to text through ``converterClass``.

    The ``isLineStart``, ``cleanTerm`` and ``preProcessLine`` hooks are
    passed to the converter; the defaults here are no-ops.
    """

    converterClass = TabbedConverter

    def __init__(self, filename, skipStartsWith=None, skipIn=None):
        """Create the converter and interpreter for *filename*.

        Args:
            filename: path of the PDF; it is opened later, in ``prepare``.
            skipStartsWith: passed to the converter as ``skip_startswith``
                (an empty list when omitted).
            skipIn: passed to the converter as ``skip_in`` (an empty list
                when omitted).
        """
        self.filename = filename
        rsrc = PDFResourceManager()
        self.outfp = StringIO()
        self.converter = self.converterClass(
            rsrc,
            self.outfp,
            codec="utf-8",
            laparams=LAParams(),
            skip_startswith=skipStartsWith or [],
            skip_in=skipIn or [],
            isLineStart=self.isLineStart,
            cleanTerm=self.cleanTerm,
            preProcessLine=self.preProcessLine,
        )
        self.interpreter = PDFPageInterpreter(rsrc, self.converter)

    def isLineStart(self, line):
        """Hook given to the converter; this default always returns False."""
        return False

    def cleanTerm(self, line):
        """Hook given to the converter; this default returns *line* as is."""
        return line

    def preProcessLine(self, line):
        """Hook given to the converter; this default returns *line* as is."""
        return line

    def prepare(self):
        """Open the source file and build the initialized document."""
        self.doc = PDFDocument()
        self.source = open(self.filename, "rb")
        parser = PDFParser(self.source)
        parser.set_document(self.doc)
        self.doc.set_parser(parser)
        self.doc.initialize("")

    def finish(self):
        """Close the converter and the source file."""
        self.converter.close()
        self.source.close()

    def postProcess(self):
        """Return everything the converter wrote to the output buffer."""
        return self.outfp.getvalue()

    def run(self):
        """Convert every page and return the post-processed output."""
        self.prepare()
        try:
            for page in self.doc.get_pages():
                if page is not None:
                    self.interpreter.process_page(page)
        finally:
            # Release the converter and the file even when a page fails.
            self.finish()
        return self.postProcess()
コード例 #25
0
ファイル: pdfutils.py プロジェクト: emulbreh/ecs
def pdf_page_count(filelike):
    """Return the number of pages of a PDF document.

    *filelike* is rewound to position 0 both before parsing and before
    returning.
    """
    filelike.seek(0)
    pdf_parser = PDFParser(filelike)
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    page_total = 0
    for _page in document.get_pages():
        page_total += 1

    filelike.seek(0)
    return page_total
コード例 #26
0
ファイル: classes.py プロジェクト: StumpyFrostreaver/slate
class PDF(list):
    """A list holding one entry per page of a PDF, as produced by the interpreter."""

    def __init__(
        self,
        file,
        password='',
        just_text=1,
        check_extractable=True,
        char_margin=1.0,
        line_margin=0.1,
        word_margin=0.1,
    ):
        """
        Parse ``file`` and append the result of processing each page.

        Pages are only processed when ``check_extractable`` is false or the
        document reports itself as extractable. When ``just_text`` is truthy
        the parsing objects are dropped afterwards via ``_cleanup``.
        """
        self.parser = PDFParser(file)
        self.laparams = LAParams(
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
        )

        # The two supported library flavours build the document differently.
        if not PYTHON_3:
            self.doc = PDFDocument(self.parser, password)
        else:
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(password)

        may_extract = not check_extractable or self.doc.is_extractable
        if may_extract:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(
                self.resmgr, outfp=StringIO(), laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)

            pages = (
                self.doc.get_pages() if PYTHON_3
                else PDFPage.create_pages(self.doc)
            )
            for current_page in pages:
                self.append(self.interpreter.process_page(current_page))
            self.metadata = self.doc.info

        if just_text:
            self._cleanup()

    def _cleanup(self):
        """
        Drop the references to the device, document, parser, resource
        manager and interpreter so only the list contents remain.
        """
        for attribute in ('device', 'doc', 'parser', 'resmgr', 'interpreter'):
            setattr(self, attribute, None)

    def text(self, clean=True):
        """
        Join all entries into a single string.

        Options:

          :clean:
            Replace newlines with spaces and normalise whitespace.
        """
        joined = ''.join(self)
        if not clean:
            return joined
        return utils.normalise_whitespace(joined.replace('\n', ' '))
コード例 #27
0
def getTableOfContents(path, pageNum):
    """Return ``getParsedPage(doc, pageNum)`` when the PDF has a page at that index.

    Pages are counted from zero. ``None`` is returned when the document has
    no page with index ``pageNum``.
    """
    pdf_file = open(path, 'rb')
    pdf_parser = PDFParser(pdf_file)
    doc = PDFDocument()
    pdf_parser.set_document(doc)
    doc.set_parser(pdf_parser)
    # NOTE(review): the document is never initialize()d and the file handle
    # is never closed here; confirm whether getParsedPage() relies on that.

    # Walk the pages only until the requested index is reached.
    page_found = any(
        pageNumber == pageNum
        for pageNumber, _page in enumerate(doc.get_pages())
    )
    if page_found:
        return getParsedPage(doc, pageNum)
    return None
コード例 #28
0
ファイル: pdfToText.py プロジェクト: cdsulliv/ResumeParsing
def pdf_to_text(filename):
    """Extract the text of a PDF, rebuilding lines from character positions.

    Every character on a page is bucketed by the integer part of its negated
    vertical coordinate; within a bucket characters are ordered by their
    horizontal coordinate and written as one line.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The text written by the converter for all pages.
    """
    from cStringIO import StringIO
    from collections import defaultdict
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class Converter(TextConverter):
        def end_page(self, i):
            """Write the page line by line; return the last line written."""
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the third and fourth bbox components are used.
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)

            # Start from an empty value so a page without any characters
            # no longer fails on an unbound name at the return below.
            last_line = ""
            for y in sorted(lines):
                row = lines[y]
                last_line = "".join(row[x] for x in sorted(row))
                self.outfp.write(last_line)
                self.outfp.write("\n")

            return last_line

    # The following part is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the test documents are utf-8.
    device = Converter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            for page in doc.get_pages():
                if page is not None:
                    interpreter.process_page(page)
        finally:
            # Close the device even when a page fails; the file is closed
            # right after by the surrounding with-block.
            device.close()

    return outfp.getvalue()
コード例 #29
0
ファイル: ParseMalaysiaAstro.py プロジェクト: n00de/parsers
 def get_pdf_num_page(self, pdf):
     """
         Return the number of pages in the PDF file at path ``pdf``.

         Returns ``None`` when the path does not exist. The file is closed
         before returning.
     """
     if not os.path.exists(pdf):
         return None
     with open(pdf, 'rb') as fp:
         parser = PDFParser(fp)
         doc = PDFDocument()
         parser.set_document(doc)
         doc.set_parser(parser)
         # Count while iterating instead of materialising every page.
         return sum(1 for _ in doc.get_pages())
コード例 #30
0
ファイル: update.py プロジェクト: yuecen/krtc
def data_extraction(filename):
    """Pull the ridership columns out of ``<filename>.pdf``.

    Each horizontal text box is treated as one column whose first line is
    its header. Columns are recognised by that header; the three ridership
    columns additionally have commas removed and each entry stripped.

    Args:
        filename: Path of the PDF without the ``.pdf`` extension.

    Returns:
        A dict with the keys ``date``, ``day``, ``red_line_people``,
        ``orange_line_people`` and ``total_people``. Each value is the list
        of lines of the matching column (header included), or an empty list
        when the column was not found.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBoxHorizontal

    date, day, red_line_people, orange_line_people, total_people = [], [], [], [], []

    # The with-block guarantees the PDF is closed, also on the raise below.
    with open(filename + '.pdf', 'rb') as fp:
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for box in device.get_result():
                if not isinstance(box, LTTextBoxHorizontal):
                    continue
                # Collapse blank lines so every line is one cell.
                text = re.sub(r'\n\s*\n', '\n', box.get_text()).strip()
                first_value = str(text.split('\n')[0]).strip()
                if first_value == '營運日':
                    date = text.split('\n')
                elif first_value == '星期':
                    day = text.split('\n')
                elif first_value == '紅線運量(人次)':
                    red_line_people = [v.strip() for v in text.replace(',', '').split('\n')]
                elif first_value == '橘線運量(人次)':
                    orange_line_people = [v.strip() for v in text.replace(',', '').split('\n')]
                elif first_value == '總運量(人次)':
                    total_people = [v.strip() for v in text.replace(',', '').split('\n')]

    return {'date': date,
            'day': day,
            'red_line_people': red_line_people,
            'orange_line_people': orange_line_people,
            'total_people': total_people}
コード例 #31
0
def parseCRF(strFilePath, strFileName, boolControlVersion, strVersion):
    """Parse a CRF PDF into a DataFrame of page questions.

    Every page layout is handed to ``createPageQuestions`` together with its
    1-based page number and the two version arguments; the items it returns
    for all pages are collected into one DataFrame.

    Args:
        strFilePath: Directory containing the file (joined with a backslash).
        strFileName: Name of the PDF file.
        boolControlVersion: Passed through to ``createPageQuestions``.
        strVersion: Passed through to ``createPageQuestions``.

    Returns:
        ``pd.DataFrame`` built from the collected items.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # The with-block closes the PDF on every exit path, including the raise.
    with open(strFilePath + "\\" + strFileName, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        listCRF = []
        for page_number, page in enumerate(doc.get_pages(), 1):
            interpreter.process_page(page)
            layout = device.get_result()
            listCRF.extend(createPageQuestions(layout, page_number,
                                               boolControlVersion,
                                               strVersion))

        return pd.DataFrame(listCRF)
コード例 #32
0
class PDF2Word:
    """Copy the horizontal text boxes of a PDF into a Word document.

    The PDF is opened on construction and stays open until ``close`` is
    called, because ``pdf_to_word`` pulls the pages from the document later.
    Instances can be used as context managers to close the file reliably.
    """

    def __init__(self, pdf_path):
        """Open ``pdf_path`` in binary mode and attach a parser and document."""
        # Keep the handle so it can be closed; it used to be dropped here.
        self._fp = open(pdf_path, 'rb')
        try:
            # Create a parser on the file object.
            parser = PDFParser(self._fp)
            # Create the document and connect it with the parser.
            self.doc = PDFDocument()
            parser.set_document(self.doc)
            self.doc.set_parser(parser)
            # Initialise without an explicit password.
            self.doc.initialize()
        except Exception:
            self._fp.close()
            raise

    def close(self):
        """Close the underlying PDF file."""
        self._fp.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        self.close()

    # PDF to Word
    def pdf_to_word(self, sve_path):
        """Write one paragraph per horizontal text box and save to ``sve_path``.

        Raises:
            PDFTextExtractionNotAllowed: If the document forbids extraction.
        """
        if not self.doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, aggregating device and interpreter for the pages.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        document = Document()
        # Handle one page at a time.
        for page in self.doc.get_pages():
            interpreter.process_page(page)
            # Fetch the layout result for the page just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    document.add_paragraph(x.get_text())
        document.save(sve_path)
コード例 #33
0
def process(path):
    """Count word-list hits in the horizontal text boxes of a PDF.

    Each text box is lower-cased and split on whitespace; every token is
    checked with ``count_word`` against the module-level word lists.

    Args:
        path: Path of the PDF file.

    Returns:
        ``[master, total, nega, posi, unce, liti, cons, supe, inte]`` where
        ``total`` is the summed character length of the text boxes and the
        other entries are hit counts per word list.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    nega = posi = unce = liti = cons = supe = inte = master = total = 0

    # The file must stay open while pages are interpreted: the parser was
    # built on this handle and the pages are only fetched further down.
    # It was previously closed right after initialize().
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()

            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text().lower()
                total += len(results)
                for part in results.split():
                    if count_word(part, word) > 0:
                        master += 1
                    if count_word(part, negative):
                        nega += 1
                    if count_word(part, positive):
                        posi += 1
                    if count_word(part, uncertainty):
                        unce += 1
                    if count_word(part, litigious):
                        liti += 1
                    if count_word(part, constraining):
                        cons += 1
                    if count_word(part, superfluous):
                        supe += 1
                    if count_word(part, interesting):
                        inte += 1
    return [master, total, nega, posi, unce, liti, cons, supe, inte]
コード例 #34
0
def readPdf(path, toPath):
    """Print the horizontal text boxes of a PDF and append them to a file.

    Args:
        path: Path of the PDF to read.
        toPath: Text file that receives each text box followed by a newline
            (opened in append mode, UTF-8).

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Open the PDF in binary mode; the with-block closes it on every exit.
    with open(path, "rb") as pdf_in:
        # Create the parser and the document and connect the two.
        parser = PDFParser(pdf_in)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)

        # Initialise without an explicit password.
        pdfFile.initialize()

        # Stop when the document does not allow text extraction.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # Handle one page per iteration.
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Append this box to toPath. The output handle has its
                    # own name so it no longer rebinds the input handle.
                    with open(toPath, "a", encoding="utf-8") as out:
                        text = x.get_text()
                        print(text)
                        out.write(text + "\n")
コード例 #35
0
def parse(pdf_path='test.pdf', txt_path='test.txt'):
    """Print the text of a PDF and append it to a text file.

    Args:
        pdf_path: PDF to read. Defaults to 'test.pdf'.
        txt_path: File each text fragment is appended to, followed by a
            newline. Defaults to 'test.txt'.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Open the PDF in binary mode; the with-block closes it on every exit.
    with open(pdf_path, 'rb') as fn:
        parser = PDFParser(fn)
        # The parser is attached through set_document()/set_parser() below,
        # so it is not passed to the constructor.
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise with an empty password.
        doc.initialize("")
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        laparams = LAParams()
        # The aggregator collects the layout objects of each page.
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            try:
                for out in layout:
                    # Only objects that expose get_text() carry text.
                    if hasattr(out, "get_text"):
                        print(out.get_text())
                        with open(txt_path, 'a') as f:
                            f.write(out.get_text() + '\n')
            except UnicodeEncodeError as ue:
                # Report the error and move on to the next page.
                print("异常:" + str(ue))
コード例 #36
0
def get_text_from_pdf(filename):
    """Return the concatenated, stripped text of all text boxes in a PDF.

    Args:
        filename: Path of the PDF without the ``.pdf`` extension.

    Returns:
        The stripped text of every ``LTTextBox`` in page order, joined
        without a separator.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams, LTTextBox
    from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

    path = filename + ".pdf"

    # The with-block closes the PDF on every exit path, including the raise.
    with open(path, 'rb') as fp:
        # Create the parser and the document and connect the two.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without an explicit password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect the pieces and join once instead of growing a string.
        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Fetch the layout result for the page just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBox):
                    pieces.append(x.get_text().strip())
    return ''.join(pieces)
コード例 #37
0
def readPDF(path, out_path=r'pdf.txt'):
    """Append the horizontal text boxes of a PDF to a text file.

    Args:
        path: Path of the PDF to read.
        out_path: Text file that receives each text box followed by a
            newline (append mode, UTF-8). Defaults to 'pdf.txt'.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Open the PDF in binary mode; the with-block closes it on every exit.
    with open(path, "rb") as pdf_in:
        # Create the parser and the document and connect the two.
        parser = PDFParser(pdf_in)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)

        # Initialise without an explicit password.
        pdfFile.initialize()

        # Stop when the document does not allow text extraction.
        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # Handle one page per iteration.
        for page in pdfFile.get_pages():
            interpreter.process_page(page)
            # Fetch the layout result for the page just processed.
            layout = device.get_result()
            for x in layout:
                # Only horizontal text boxes are written out.
                if isinstance(x, LTTextBoxHorizontal):
                    # The output handle has its own name so it no longer
                    # rebinds the input handle.
                    with open(out_path, 'a', encoding='utf-8') as out:
                        out.write(x.get_text() + "\n")
コード例 #38
0
ファイル: myselector.py プロジェクト: xfzhu2003/github
    def pdfparse(self, url):
        """Download the PDF at ``url`` and return its text.

        The text of every layout object exposing ``get_text`` is appended
        with leading and trailing newlines stripped. Returns ``None`` when
        ``url`` is falsy. Any exception is printed and ``None`` is returned.
        """
        try:
            if not url:
                return None

            response = s.get(url, headers={"user-agent": generate_user_agent()})
            response.encoding = 'utf-8'

            # Buffer the download in memory and rewind it for the parser.
            f = BytesIO()
            f.write(response.content)
            f.seek(0)

            pdf_parser = PDFParser(f)
            doc = PDFDocument()
            pdf_parser.set_document(doc)
            doc.set_parser(pdf_parser)
            doc.initialize()

            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager, aggregating device and interpreter.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            text = ''
            # Handle one page per iteration.
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Fetch the layout result for the page just processed.
                for item in device.get_result():
                    if not hasattr(item, 'get_text'):
                        continue
                    fragment = item.get_text()
                    if fragment:
                        text += fragment.strip('\n')
            f.close()
            return text
        except Exception as e:
            print(e)
コード例 #39
0
ファイル: pdf2txt.py プロジェクト: itismejy/QHacks-2019
def pdf_to_csv(filename):
    """Convert a PDF to semicolon-separated text, one output line per row.

    Every character on a page is bucketed by the integer part of its negated
    vertical coordinate; within a bucket the characters are ordered by their
    horizontal coordinate and joined with ``;``. Each page is wrapped in
    ``START PAGE n`` / ``END PAGE n`` marker lines.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The complete text written to the output buffer.
    """
    from collections import defaultdict

    class CsvConverter(TextConverter):
        def end_page(self, i):
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    # Only the third and fourth bbox components are used.
                    (_, _, x, y) = child.bbox
                    lines[int(-y)][x] = child._text.encode(self.codec)

            # NOTE(review): joining encoded values with a text separator
            # presumably assumes Python 2 strings - confirm before porting.
            for y in sorted(lines):
                row = lines[y]
                self.outfp.write(";".join(row[x] for x in sorted(row)))
                self.outfp.write("\n")

    # The following part is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the test documents are utf-8.
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    with open(filename, 'rb') as fp:
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
        finally:
            # Close the device even when a page fails; the file is closed
            # right after by the surrounding with-block.
            device.close()

    return outfp.getvalue()
コード例 #40
0
def parse(pdf_path='Django-日志配置.md.pdf', docx_path='demo1.docx'):
    """Add the text of a PDF to the module-level ``document`` and save it.

    Every layout object exposing ``get_text`` becomes one 'ListBullet'
    paragraph, with non-breaking spaces replaced by ordinary spaces. The
    document is saved once after all pages were processed instead of after
    every layout object.

    Args:
        pdf_path: PDF to read. Defaults to the previously hard-coded file.
        docx_path: Where the document is saved. Defaults to 'demo1.docx'.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Open the PDF in binary mode; the with-block closes it on every exit.
    with open(pdf_path, 'rb') as fn:
        # Create the parser and the document and connect the two.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise with an empty password.
        doc.initialize("")
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        laparams = LAParams()
        # The aggregator collects the layout objects of each page.
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for out in layout:
                # Only objects that expose get_text() carry text.
                if hasattr(out, "get_text"):
                    # '\xa0' is the non-breaking space.
                    content = out.get_text().replace(u'\xa0', u' ')
                    document.add_paragraph(
                        content, style='ListBullet'  # unordered-list style
                    )

        # Saving once is enough; each save rewrote the whole file.
        document.save(docx_path)
コード例 #41
0
ファイル: task1.py プロジェクト: tedinGH/lvyaoyu
def process_pdf(filePath):
    """Return the invoice number found in a PDF, or ``None``.

    The text of all horizontal text boxes is concatenated and searched for
    ``INVOICE NO. <digits>``.

    Args:
        filePath: Path of the PDF file.

    Returns:
        The digits of the first match as a string, or ``None`` when the
        text contains no match.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Read the PDF in binary mode; the with-block closes it on every exit.
    with open(filePath, 'rb') as fp:
        parser = PDFParser(fp)
        # Create the document and connect it with the parser.
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise without an explicit password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr=rsrcmgr, device=device)

        pieces = []
        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Fetch the layout result for the page just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    pieces.append(x.get_text())

    # An explicit no-match check replaces the bare except that used to
    # hide the AttributeError raised by calling .group() on None.
    invoice_no = re.search(r"INVOICE NO. (\d+)", ''.join(pieces))
    if invoice_no is None:
        return None
    return invoice_no.group(1)
コード例 #42
0
ファイル: dumppdf.py プロジェクト: authetic-x/Web_Scraping
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline (bookmarks) of a PDF to ``outfp`` as XML.

    For every outline entry an ``<outline>`` element with its level and
    title is written, followed by the resolved destination and the page
    number when they could be determined. Nothing is written when the
    document has no outlines.

    Args:
        outfp: Writable object receiving the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec: Accepted but not used here.
        password: Password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # The parser and the file are now closed even when an error escapes.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's object id to its zero-based page number.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))

            def resolve_dest(dest):
                # Named destinations are looked up in the document first;
                # a dictionary destination keeps its target under 'D'.
                if isinstance(dest, str):
                    dest = resolve1(doc.get_dest(dest))
                elif isinstance(dest, PSLiteral):
                    dest = resolve1(doc.get_dest(dest.name))
                if isinstance(dest, dict):
                    dest = dest['D']
                return dest

            try:
                outlines = doc.get_outlines()
                outfp.write('<outlines>\n')
                for (level, title, dest, a, se) in outlines:
                    pageno = None
                    if dest:
                        dest = resolve_dest(dest)
                        pageno = pages[dest[0].objid]
                    elif a:
                        # Fall back to a GoTo action carrying a destination.
                        action = a.resolve()
                        if isinstance(action, dict):
                            subtype = action.get('S')
                            if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                                dest = resolve_dest(action['D'])
                                pageno = pages[dest[0].objid]
                    s = e(title).encode('utf-8', 'xmlcharrefreplace')
                    outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                    if dest is not None:
                        outfp.write('<dest>')
                        dumpxml(outfp, dest)
                        outfp.write('</dest>\n')
                    if pageno is not None:
                        outfp.write('<pageno>%r</pageno>\n' % pageno)
                    outfp.write('</outline>\n')
                outfp.write('</outlines>\n')
            except PDFNoOutlines:
                pass
        finally:
            parser.close()
コード例 #43
0
ファイル: util.py プロジェクト: nefi10/pdfminer3k-1
def pages_from_pdf(path, **laparams):
    """Lay out every page of the PDF at ``path`` and return the results.

    Keyword arguments are forwarded to ``LAParams`` together with
    ``all_texts=True``. The result is a list with one aggregator result
    per page, in page order.
    """
    pdf_file = open(path, 'rb')
    document = PDFDocument(caching=True)
    pdf_parser = PDFParser(pdf_file)
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize()

    resources = PDFResourceManager()
    layout_params = LAParams(all_texts=True, **laparams)
    aggregator = PDFPageAggregator(resources, laparams=layout_params)
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    def layout_of(page):
        # Interpret the page, then collect what the aggregator gathered.
        page_interpreter.process_page(page)
        return aggregator.get_result()

    return [layout_of(page) for page in document.get_pages()]
コード例 #44
0
def pdf2txt(pdfname, txtname):
    """Write the text of a PDF to a text file.

    The stripped text of every text box or text line is written followed by
    a single space. Any error is printed and does not propagate.

    Args:
        pdfname: Path of the PDF to read.
        txtname: Path of the text file to (over)write.

    Returns:
        True if at least one text object was written, otherwise False
        (including when an error occurred before any text was written).
    """
    btxt = False
    try:
        # Both files are managed by with-blocks. The previous code only read
        # the ``closed`` attribute and therefore never closed either file.
        with open(pdfname, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()

            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Report the file name on the console.
            print("pdf2txt %s..." % pdfname)

            # Open the text file for the output.
            with open(txtname, 'w') as fptxt:
                # Walk the document page by page.
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    layout = device.get_result()
                    # Walk the page object by object.
                    for lt_obj in layout:
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            spagetxt = lt_obj.get_text().strip() + " "
                            # NOTE(review): the old emptiness check ran after
                            # the space was appended and so never filtered
                            # anything; that behaviour is kept as is.
                            btxt = True
                            fptxt.write(spagetxt)
                            print("Palabra", spagetxt)
                        elif isinstance(lt_obj, LTFigure):
                            print("LTFigure, pte implementar!")

        print("end")
    except Exception as e:
        print("Error: %s" % (e))
    return btxt
コード例 #45
0
def parser_pdf_file(pdf_file_path):
    """Count pattern matches and characters in the text boxes of a PDF.

    Each horizontal text box is stripped, printed, and tested against the
    module-level ``d_pattern`` and ``x_pattern``.

    Args:
        pdf_file_path: Path of the PDF file.

    Returns:
        ``(count, word_count)`` where ``word_count`` is the summed character
        length of the stripped text boxes and ``count`` is the number of
        boxes matching ``d_pattern``, or the number matching ``x_pattern``
        when no box matched ``d_pattern``.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    d_count = 0
    x_count = 0
    word_count = 0

    # The with-block closes the PDF on every exit path, including the raise.
    with open(pdf_file_path, 'rb') as read_pdf:
        parser_pdf = PDFParser(read_pdf)
        # The parser is attached through set_document()/set_parser() below,
        # so it is not passed to the constructor.
        pdf_document = PDFDocument()
        parser_pdf.set_document(pdf_document)
        pdf_document.set_parser(parser_pdf)
        # Initialise without an explicit password.
        pdf_document.initialize()

        if not pdf_document.is_extractable:
            raise PDFTextExtractionNotAllowed

        pdf_manager = PDFResourceManager()
        pdf_laparams = LAParams()
        pdf_device = PDFPageAggregator(pdf_manager, laparams=pdf_laparams)
        pdf_interpreter = PDFPageInterpreter(pdf_manager, pdf_device)

        # Handle one page per iteration.
        for each_page in pdf_document.get_pages():
            pdf_interpreter.process_page(each_page)
            layout = pdf_device.get_result()
            for each_info in layout:
                if not isinstance(each_info, LTTextBoxHorizontal):
                    continue
                result = each_info.get_text().strip()
                word_count += len(result)

                if d_pattern.findall(result):
                    d_count += 1
                if x_pattern.findall(result):
                    x_count += 1
                print(result)
                print("======")

    if d_count == 0:
        return x_count, word_count
    return d_count, word_count
コード例 #46
0
def process_pdf(filePath):
    """Extract the table before 'TOTAL INTERNATIONAL' from a PDF.

    The text of all horizontal text boxes is concatenated. The section
    between a line of equals signs and ' TOTAL INTERNATIONAL' is split
    into rows, each row into whitespace-separated fields, and the fields
    in column positions 3 to 12 are converted to float64.

    Args:
        filePath: Path of the PDF file.

    Returns:
        A DataFrame with one row per table line.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
        AttributeError: If the section markers are not found in the text.
    """
    # Read the PDF in binary mode; the with-block closes it on every exit.
    with open(filePath, 'rb') as fp:
        parser = PDFParser(fp)
        # Create the document and connect it with the parser.
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise without an explicit password.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr=rsrcmgr, device=device)

        pieces = []
        # Handle one page per iteration.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Fetch the layout result for the page just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    pieces.append(x.get_text())

    result = ''.join(pieces)
    text = re.search("===================\n(.*\n)+ TOTAL INTERNATIONAL",
                     result)
    section = text.group()
    # Raw strings keep the same patterns without invalid escape sequences.
    data = re.findall(r' (\d+.+?\d+)\n', section)
    print(len(data))
    rows = [re.findall(r'\S+', entry) for entry in data]
    df = DataFrame(data=rows)
    for i in df.columns[3:13]:
        df[i] = df[i].astype('float64')
    return df
コード例 #47
0
def readPdf(self, path, callback=None, topath = ""):
    """Walk every horizontal text box of a PDF and hand its text to a callback.

    :param path: path of the PDF file to read
    :param callback: optional callable invoked with the text of each
        horizontal text box when ``topath`` is empty
    :param topath: output path; when non-empty only a placeholder message is
        printed (writing is not implemented in this block)
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    # Open the PDF in binary mode; closed automatically on exit
    with open(path, "rb") as f:
        # Parser and document, linked in both directions
        parser = PDFParser(f)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)

        # Initialise with an empty password
        pdfFile.initialize("")

        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter
        manage = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manage, laparams = laparams)
        interpreter = PDFPageInterpreter(manage, device)

        # Process one page at a time
        for page in pdfFile.get_pages():
            # was misspelled `progcess_page` / `get_reault`
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                # was `toPath`, which is not the parameter name
                if topath == "":
                    text = x.get_text()
                    if callback is not None:
                        callback(text)
                    else:
                        print("处理文件")
                else:
                    print("写文件   toPath 写入文件的路径")
コード例 #48
0
ファイル: import_much.py プロジェクト: cocpy/mrword
def pdf_read(file):
    """Read a PDF file and pass the text of each horizontal text box to ``common_handle``.

    :param file: path of the PDF file to read
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """

    # pip install pdfminer3k
    from pdfminer.pdfparser import PDFParser, PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LTTextBoxHorizontal, LAParams
    from pdfminer.pdfinterp import PDFTextExtractionNotAllowed

    # The context manager closes the file even if parsing raises
    with open(file, 'rb') as fp:
        # Parser and document, linked in both directions
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise with the default (empty) password
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process one page at a time
        for page in doc.get_pages():
            interpreter.process_page(page)
            # Layout object holding the items parsed from this page
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    results = x.get_text()
                    common_handle(results)
コード例 #49
0
def pdf_page_content(pdf_file, page_num):
    '''
    Yield every horizontal text box found on one page of a PDF.

    Because this is a generator, nothing (including argument validation)
    runs until the first item is requested.

    :param pdf_file: path of the PDF file
    :param page_num: 1-based page number
    :return: generator of LTTextBoxHorizontal layout objects
    :raises ValueError: if page_num is not positive
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    '''
    if page_num <= 0:
        raise ValueError(
            'page_num must be more than zero, but the number your given is %s'
            % page_num)
    # The file stays open while the generator is being consumed and is
    # closed when it finishes or is closed by the caller.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        # Document object, linked with the parser in both directions
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise with the default (empty) password
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter
        rsrcmagr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmagr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmagr, device)
        for index, page in enumerate(doc.get_pages()):
            if index != page_num - 1:
                continue
            interpreter.process_page(page)
            # Layout object holding the items parsed from this page
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    yield x
            # Requested page handled; no need to walk the remaining pages
            break
コード例 #50
0
def parse():

    '''Print the non-blank horizontal text boxes of one fixed page of the PDF at ``text_path``.

    The page is selected by the hard-coded index 47 (0-based) into the
    document's page list. ``text_path`` is read from the enclosing scope.

    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    :raises IndexError: if the document has fewer than 48 pages
    '''
    # The context manager closes the file even if parsing raises
    with open(text_path,'rb') as fp:
        # Parser and document, linked in both directions
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise with the default (empty) password
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr,laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr,device)

        # Only the page at index 47 is processed
        page = list(doc.get_pages())[47]

        interpreter.process_page(page)
        # Layout object holding the items parsed from this page
        layout = device.get_result()
        for x in layout:
            if isinstance(x,LTTextBoxHorizontal):
                results = x.get_text().strip()
                # Skip boxes that contain only whitespace
                if results:
                    print(results)
コード例 #51
0
    def _pdf_to_text(self, pdf_path, text_path):
        """
        Extract the text of a PDF and store it in a text file.

        The text of every LTTextBox / LTTextLine layout object on every page
        is concatenated, stripped and written to ``text_path``.

        :param pdf_path: path to the input PDF
        :param text_path: path to the output text
        :return: True if non-empty text was extracted and stored, False if
            extraction failed or produced no text
        """
        pieces = []
        num_pages = 0
        doc = PDFDocument()
        res_mgr = PDFResourceManager()

        device = PDFPageAggregator(res_mgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(res_mgr, device)

        try:
            with open(pdf_path, 'rb') as fp:
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize('')
                for page in doc.get_pages():
                    self._logger.debug('Processing page {}'.format(num_pages +
                                                                   1))
                    interpreter.process_page(page)
                    layout = device.get_result()
                    for lt_obj in layout:
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            pieces.append(lt_obj.get_text())
                    num_pages += 1
                text = ''.join(pieces)
                self._logger.info('Done, extracted {} pages'.format(num_pages))
                self._logger.debug('Storing result in {}'.format(text_path))
                with open(text_path, 'w') as text_fp:
                    text_fp.write(text.strip())
        except Exception:
            # Best-effort: report failure to the caller, but do not swallow
            # KeyboardInterrupt / SystemExit as the former bare except did.
            self._logger.warning(
                'Extracting text from {} failed'.format(pdf_path))
            return False
        finally:
            # close resources before exiting
            device.close()
        # A real bool, as documented, rather than the text length
        return bool(text)
コード例 #52
0
ファイル: transURL.py プロジェクト: vivian219/interSeeker
def parse_pdf(url):
    """
    Fetch a PDF via ``get_pdf`` and return its text with whitespace collapsed.

    ``get_pdf(url)`` is called first and the file ``1.pdf`` in the current
    directory is then read (presumably written there by ``get_pdf`` —
    confirm against that helper).

    :param url: value passed to ``get_pdf``
    :return: text of all horizontal text boxes with newlines removed and
        runs of whitespace collapsed to single spaces; an empty string when
        the document does not allow extraction
    """
    get_pdf(url)
    pieces = []
    # Binary read; the context manager closes the file even on errors
    with open('1.pdf', 'rb') as fp:
        # Parser and document, linked in both directions
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)

        # Initialise with the default (empty) password
        doc.initialize()

        # Non-extractable documents simply yield empty content
        if doc.is_extractable:
            # Resource manager, layout aggregator and page interpreter
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process one page at a time
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout object holding the items parsed from this page
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        pieces.append(x.get_text())
    content = "".join(pieces)
    content = " ".join(content.replace("\n", "").strip().split())
    return content
コード例 #53
0
def onePdfToTxt(filepath, outpath):
    """Write the text of every text-bearing layout object of a PDF to a UTF-8 file.

    Each piece of text is also printed. Any exception (including a document
    that does not allow extraction) is caught and printed rather than
    propagated.

    :param filepath: path of the input PDF
    :param outpath: path of the output text file (overwritten)
    """
    try:
        # Both handles are closed on every exit path, including errors
        with open(filepath, 'rb') as fp, \
                open(outpath, 'w', encoding='utf-8') as outfp:
            # Parser and document, linked in both directions
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            # Initialise with an empty password
            doc.initialize("")
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager, layout parameters, aggregator, interpreter
            resource = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(resource, laparams=laparams)
            interpreter = PDFPageInterpreter(resource, device)
            # Process one page at a time
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout object holding the items parsed from this page
                layout = device.get_result()
                for out in layout:
                    # Only objects exposing get_text() carry text
                    if hasattr(out, "get_text"):
                        text = out.get_text()
                        print(text)
                        outfp.write(text + '\n')
    except Exception as e:
        print(e)
コード例 #54
0
def parse(pdf_path):
    """Return page numbers selected by comparing the first two text boxes of consecutive pages.

    For every page the text of the horizontal text boxes whose ``index`` is
    0 and 1 is remembered. From the second page on, page ``n - 1`` is
    appended when the index-1 text differs from the previously seen one, and
    page ``n`` is appended when the current index-1 text equals the previous
    index-0 text.

    :param pdf_path: path of the PDF file
    :return: list of 1-based page numbers (may contain repeats)
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    # Binary read; the context manager closes the file even on errors
    with open(pdf_path, 'rb') as fp:
        # Parser and document, linked in both directions
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        num_page = 0
        text0_now, text0_last, text1_now, text1_last = "", "", "", ""
        page_list = []
        for page in doc.get_pages():
            num_page += 1
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    if num_page == 1:
                        # First page: nothing to shift into the *_last slots
                        if x.index == 0:
                            text0_now = x.get_text()
                        if x.index == 1:
                            text1_now = x.get_text()
                    else:
                        # Later pages: keep the previous value before
                        # overwriting. The *_last values only move when a
                        # box with that index exists on the page.
                        if x.index == 0:
                            text0_last = text0_now
                            text0_now = x.get_text()
                        if x.index == 1:
                            text1_last = text1_now
                            text1_now = x.get_text()
            if num_page != 1:
                if text1_now != text1_last:
                    page_list.append(num_page - 1)  # previous page
                if text1_now == text0_last:
                    page_list.append(num_page)  # current page
    return page_list
コード例 #55
0
def readPdf(path, toPath):
    """Append the text of every horizontal text box of a PDF to a text file.

    Each piece of text is printed and written to ``toPath`` followed by a
    newline. The output file is opened once in append mode after the
    extractability check, so it is created even when the document contains
    no text boxes.

    :param path: path of the input PDF
    :param toPath: path of the text file to append to
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    # Binary read; the context manager closes the input even on errors
    with open(path, "rb") as pdf_fp:
        # Parser and document, linked in both directions
        parser = PDFParser(pdf_fp)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise (no password) only after the parser is attached, in
        # the same order the sibling examples use
        pdfFile.initialize()

        if not pdfFile.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        # One output handle for the whole run instead of one per text box
        with open(toPath, "a") as out_fp:
            # Process one page at a time
            for page in pdfFile.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        str1 = x.get_text()
                        print(str1)
                        out_fp.write(str1 + "\n")
コード例 #56
0
ファイル: pdf_converter.py プロジェクト: gajanlee/paper-kit
def get_pdf_content(pdf_path):
    """Return the text of a PDF with page numbers and the repeated first entry removed.

    The text of each horizontal text box is passed through
    ``replace_invisible`` and collected. After each page, a trailing
    all-digit entry (treated as a page number) is replaced by a single
    space. Finally every empty entry and every entry equal to the first
    collected entry is dropped, and the rest is joined with newlines.

    :param pdf_path: path of the PDF file
    :return: the cleaned text, or an empty string if nothing was collected
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    print(pdf_path)
    with open(pdf_path, "rb") as file:
        parser = PDFParser(file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize()

        # Documents that forbid text extraction are rejected
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        interpreter = PDFPageInterpreter(rsrcmgr, device)

        content_recoder = []
        # process by page
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()

            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    content_recoder.append(replace_invisible(x.get_text()))

            # Page done: blank out a trailing page number. The explicit
            # guard replaces a bare `except: pass` that hid every error.
            last = content_recoder[-1] if content_recoder else None
            if isinstance(last, str) and last.isdigit():
                content_recoder[-1] = " "

        if content_recoder:
            # Drop empty entries and every copy of the first entry
            first = content_recoder[0]
            content_recoder = [c for c in content_recoder if c and c != first]

        return "\n".join(content_recoder)
コード例 #57
0
    def convert(infile):
        """Return the concatenated text of all LTTextBox / LTTextLine objects in a PDF.

        :param infile: object handed directly to ``PDFParser`` (presumably
            an open binary file — confirm against callers)
        :return: the extracted text as one string
        """
        pdf_parser = PDFParser(infile)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        # Empty password
        document.initialize('')
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        fragments = []
        for pdf_page in doc_pages(document) if False else document.get_pages():
            page_interpreter.process_page(pdf_page)
            for item in aggregator.get_result():
                if isinstance(item, (LTTextBox, LTTextLine)):
                    fragments.append(item.get_text())
        return ''.join(fragments)
コード例 #58
0
def parse():
    '''Append the text of every horizontal text box of the PDF at ``text_path`` to ``new.txt``.

    Each piece of text is also printed. ``text_path`` is read from the
    enclosing scope. The output file is opened once in append mode after the
    extractability check, so it is created even when the document contains
    no text boxes.

    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    '''
    # Binary read; the context manager closes the input even on errors
    with open(text_path, 'rb') as fp:
        # Parser reads from the file; the document object holds the data.
        # They are linked in both directions.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise with the default (empty) password
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter
        rm = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rm, laparams=laparams)
        interpreter = PDFPageInterpreter(rm, device)

        # One output handle for the whole run instead of one per text box
        with open(r'new.txt', 'a') as f:
            # Process one page at a time
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout object holding the items parsed from this page
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results)
コード例 #59
0
ファイル: pdf2txt.py プロジェクト: lileieiei/-
def parse(path,name):
    """Append the text of every horizontal text box of a PDF to ``<name>.txt``.

    The output file lives in a fixed, hard-coded directory. Each piece of
    text is also printed. The output file is opened once in append mode
    after the extractability check, so it is created even when the document
    contains no text boxes.

    :param path: path of the input PDF
    :param name: base name (without extension) of the output text file
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    # Binary read; the context manager closes the input even on errors
    with open(path, 'rb') as fp:
        # Parser and document, linked in both directions
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)

        # Initialise with the default (empty) password
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # The output path does not depend on the page, so build it once
        name_txt=name+".txt"
        outpath=os.path.join(r"C:\Users\Administrator\Desktop\mission\indexnu\doing\txt",name_txt)
        # One handle for the whole run; the with-block closes it, so no
        # explicit close() is needed afterwards
        with open(outpath, 'a',errors="ignore") as f:
            # Process one page at a time
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout object holding the items parsed from this page
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        results = x.get_text()
                        print(results)
                        f.write(results)
コード例 #60
0
    def extract_papaername(self, path):
        """Guess the paper title from the first text boxes of a PDF.

        The text of every horizontal text box is collected with newlines
        removed. Among the first ten entries, the entry following the one
        containing '文章编号' is taken as the title; if no such marker is
        present, the first entry is used. When the candidate has fewer than
        four non-space characters, the next entry is used instead if there
        is one.

        Pages that fail to process are reported with ``print`` and skipped.

        :param path: path of the PDF file
        :return: the title with '\\uf02a' and '*' removed; an empty string
            if the marker is the last of the first ten entries; None if no
            text was collected
        :raises PDFTextExtractionNotAllowed: if the document forbids extraction
        """
        contents = []
        # Binary read; the context manager closes the file even on errors
        with open(path, 'rb') as fp:
            praser = PDFParser(fp)
            doc = PDFDocument()
            praser.set_document(doc)
            doc.set_parser(praser)
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in doc.get_pages():
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    for x in layout:
                        if isinstance(x, LTTextBoxHorizontal):
                            content = x.get_text().replace('\n', '')
                            contents.append(content)
                except Exception as e:
                    # Best-effort: a broken page must not abort the rest
                    print(e)
                    print('document error...')
        if not contents:
            return None

        head = contents[:10]
        # Default to the first entry; otherwise the entry after the marker
        title_indx = 0
        for indx, line in enumerate(head):
            if '文章编号' in line:
                title_indx = indx + 1
                break
        # Marker was the last inspected entry: nothing follows it in `head`
        # (this used to raise IndexError)
        if title_indx >= len(head):
            return ''
        title = head[title_indx]
        # Too short to be a title: fall back to the next entry when present
        if len(title.replace(' ', '')) < 4 and title_indx + 1 < len(head):
            title = head[title_indx + 1]

        return title.replace('\uf02a', '').replace('*', '')