コード例 #1
1
def getPageLayouts(f1):
    """Collect the layout object of every page of a PDF.

    NOTE(review): despite the parameter, the PDF is opened from the name
    ``fpath`` and unlocked with ``pss_wd``; neither is defined in this
    function, so both presumably live at module scope -- confirm.  ``f1``
    itself is never read.

    Returns:
        A list holding one ``device.get_result()`` value per page.  The list
        is empty when the document does not allow text extraction.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    # Bound up front so every path has something to return.
    page_layouts = []
    try:
        with open(fpath, 'rb') as pdf_file:
            # The parser and document reference each other, forming a
            # "pipe" of sorts.
            parser = PDFParser(pdf_file)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)

            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in doc.get_pages():
                    # Presumably pages are parsed from the open file on
                    # demand rather than from preloaded data, so this loop
                    # stays inside the ``with`` block.
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        # Call-style raise is valid on Python 2 and 3.
        raise IOError("issue with loading file, please try again")
    # The return used to sit in ``finally``, which silently discarded any
    # exception in flight (including the IOError raised just above).
    return page_layouts
コード例 #2
0
def parsePDFtoTXT(pdf_path):
    """Append the text of every page of a PDF to ``pdfoutput.txt``.

    For each page the layout object is printed, then its ``str()`` form
    followed by the text of every ``LTTextBoxHorizontal`` on the page is
    appended to ``pdfoutput.txt`` (UTF-8).

    Args:
        pdf_path: path of the PDF file to read.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # ``with`` guarantees the PDF handle is closed even when parsing fails;
    # the original never closed it.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument()
        parser.set_document(document)
        document.set_parser(parser)
        document.initialize()
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in document.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            print(layout)
            parts = [str(layout)]
            parts.extend(x.get_text() for x in layout
                         if isinstance(x, LTTextBoxHorizontal))
            with open('pdfoutput.txt', 'a', encoding='utf-8') as f:
                f.write(''.join(parts))
コード例 #3
0
 def readPdf(self, path, callback=None, toPath=""):
     """Read the horizontal text boxes of a PDF and print or save them.

     Args:
         path: path of the PDF file to read.
         callback: optional callable invoked with each text box's text;
             only used when ``toPath`` is empty.
         toPath: when non-empty, each text box is appended to this file
             (followed by a newline) instead of being passed to ``callback``.

     Raises:
         PDFTextExtractionNotAllowed: if the document forbids extraction.
     """
     # The original bound the PDF handle to ``f``, rebound that name to the
     # output file, and never closed the PDF handle.
     with open(path, "rb") as pdf_fp:
         parser = PDFParser(pdf_fp)
         pdfFile = PDFDocument()
         parser.set_document(pdfFile)
         # pdfminer's documented order is set_parser() then initialize();
         # the original called them the other way round.
         pdfFile.set_parser(parser)
         pdfFile.initialize()
         if not pdfFile.is_extractable:
             raise PDFTextExtractionNotAllowed

         manager = PDFResourceManager()
         laparams = LAParams()
         device = PDFPageAggregator(manager, laparams=laparams)
         interpreter = PDFPageInterpreter(manager, device)
         for page in pdfFile.get_pages():
             interpreter.process_page(page)
             for x in device.get_result():
                 if not isinstance(x, LTTextBoxHorizontal):
                     continue
                 # ``text`` rather than ``str`` so the builtin is not shadowed.
                 text = x.get_text()
                 if toPath == "":
                     # Process each block of text.
                     if callback is not None:
                         callback(text)  # run the user-supplied function
                     print(text)
                 else:
                     with open(toPath, "a") as out:
                         print(text)
                         out.write(text + "\n")
コード例 #4
0
def process(path):
    """Count category words in the horizontal text boxes of a PDF.

    Each text box's lower-cased text is passed to ``count_word`` together
    with one of the word collections ``negative``, ``positive``,
    ``uncertainty``, ``litigious``, ``constraining``, ``superfluous`` and
    ``interesting`` (all defined outside this function).

    Args:
        path: path of the PDF file to read.

    Returns:
        ``[nega, posi, unce, liti, cons, supe, inte]`` -- the accumulated
        counts, in that order.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    nega = posi = unce = liti = cons = supe = inte = 0

    # The original closed the file right after initialize(), before
    # get_pages() had read any page from it.  Keep it open for the whole
    # extraction and let ``with`` close it afterwards.
    with open(path, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text().lower()
                nega += count_word(results, negative)
                posi += count_word(results, positive)
                unce += count_word(results, uncertainty)
                liti += count_word(results, litigious)
                cons += count_word(results, constraining)
                supe += count_word(results, superfluous)
                inte += count_word(results, interesting)
    return [nega, posi, unce, liti, cons, supe, inte]
コード例 #5
0
def get_abstract(path):
    """Return the first long text box at or after the word "abstract".

    Text boxes are scanned in page order.  Once a box containing
    "abstract" (case-insensitive) has been seen, the first box longer than
    500 characters -- that box itself included -- is returned with
    hyphenated line breaks ("-\\n") and newlines removed.

    Args:
        path: path of the PDF file to read.

    Returns:
        The cleaned-up text, or ``""`` when no such box is found or the
        document does not allow text extraction.
    """
    # ``with`` closes the file on every exit path, including the early
    # return below; the original never closed it.
    with open(path, mode="rb") as fr:
        praser = PDFParser(fr)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            return ""

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        seen_abstract = False
        for page in doc.get_pages():
            interpreter.process_page(page)
            for x in device.get_result():
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                # A plain substring test replaces re.findall on a literal.
                if "abstract" in results.lower():
                    seen_abstract = True
                if seen_abstract and len(results) > 500:
                    return results.replace("-\n", "").replace("\n", "")
    return ""
コード例 #6
0
ファイル: pdfwindows.py プロジェクト: ruoshengyuan/testone
def parse(oldpath, filepath):
    """Return the text boxes found on the first page of a PDF.

    Args:
        oldpath: directory containing the PDF.
        filepath: file name, joined onto ``oldpath``.

    Returns:
        A list with the text of each ``LTTextBoxHorizontal`` on the first
        page (leading/trailing newlines stripped), or ``False`` when
        anything goes wrong; the error is printed prefixed with "a".
    """
    try:
        filepath1 = os.path.join(oldpath, filepath)
        # ``with`` closes the handle on success and on failure; the
        # original leaked it on every path.
        with open(filepath1, 'rb') as fp:
            praser_pdf = PDFParser(fp)
            doc = PDFDocument()
            praser_pdf.set_document(doc)
            doc.set_parser(praser_pdf)
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Only the first page is examined.
            page = next(doc.get_pages())
            interpreter.process_page(page)
            layout = device.get_result()
            return [out.get_text().strip("\n") for out in layout
                    if isinstance(out, LTTextBoxHorizontal)]
    except Exception as e:
        # Deliberate best effort: callers receive False instead of an error.
        print("a", str(e))
        return False
コード例 #7
0
ファイル: dumppdf.py プロジェクト: zjx1015288314/EasyTrans
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of a PDF to ``outfp``.

    Args:
        outfp: writable file-like object receiving the dump.
        fname: path of the PDF file.
        objids: object ids to dump with ``dumpxml``.
        pagenos: container of zero-based page numbers to dump.
        password: password passed to ``doc.initialize``.
        dumpall: when true, ``dumpallobjs`` is called for the document.
        codec: passed through to the dump helpers; when set, page content
            streams are dumped instead of page attributes.

    When none of ``objids``, ``pagenos`` and ``dumpall`` is given, the
    trailers are dumped instead.
    """
    doc = PDFDocument()
    # ``with`` closes the file even if a dump helper raises; the original
    # only reached fp.close() on success.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                dumpxml(outfp, doc.getobj(objid), codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno not in pagenos:
                    continue
                if codec:
                    for obj in page.contents:
                        dumpxml(outfp, stream_value(obj), codec=codec)
                else:
                    dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
コード例 #8
0
ファイル: MyPdfMiner.py プロジェクト: i11uminator/bookservice
 def ParseAllPages(self, filepath):
     """Run every page of a PDF through a ``PDFPageInterpreter``.

     Stores ``filepath`` on ``self.filepath``.  Pages are processed with a
     plain ``PDFDevice``; nothing is returned.

     Args:
         filepath: path of the PDF file to process.

     Raises:
         PDFTextExtractionNotAllowed: if the document forbids extraction.
     """
     self.filepath = filepath
     # ``with`` closes the PDF when processing ends or fails; the original
     # never closed it.
     with open(filepath, 'rb') as fp:
         # Create a PDF parser object associated with the file object.
         parser = PDFParser(fp)
         # Create a PDF document object that stores the document structure.
         doc = PDFDocument()
         # Connect the parser and document objects.
         parser.set_document(doc)
         doc.set_parser(parser)
         # Supply the password for initialization.
         # (If no password is set, give an empty string.)
         password = ""
         doc.initialize(password)
         # Check if the document allows text extraction. If not, abort.
         if not doc.is_extractable:
             raise PDFTextExtractionNotAllowed
         # Create a PDF resource manager object that stores shared resources.
         rsrcmgr = PDFResourceManager()
         # Create a PDF device object.
         device = PDFDevice(rsrcmgr)
         # Create a PDF interpreter object.
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         # Process each page contained in the document.
         for page in doc.get_pages():
             interpreter.process_page(page)
コード例 #9
0
def parse(path):
    """Extract the text of a PDF into the text file chosen by ``set_path``.

    ``set_path(path)`` (defined elsewhere) yields the PDF path and the
    output path.  Nothing is done if the output file already exists.
    Otherwise the text of every ``LTTextBoxHorizontal`` is appended to the
    output file, one box per line group, encoded as UTF-8.

    Args:
        path: value handed to ``set_path``; it is also printed.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    print(path)
    # Separate names for the path and the handle; the original reused
    # ``fp`` for both.
    pdf_path, pf = set_path(path)
    if os.path.exists(pf):
        return
    with open(pdf_path, "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            texts = [x.get_text() for x in device.get_result()
                     if isinstance(x, LTTextBoxHorizontal)]
            # Open the output once per page rather than once per text box.
            # As before, the file is only created when there is text.
            if texts:
                with open(pf, "a", encoding="utf-8") as f:
                    f.writelines(text + "\n" for text in texts)
コード例 #10
0
ファイル: __init__.py プロジェクト: Polyconseil/dokang_pdf
 def harvest_file(self, path):
     """Extract the title and text content of a PDF.

     Args:
         path: path of the PDF file.

     Returns:
         A dict with the keys ``'title'``, ``'content'`` and ``'kind'``
         (always ``'PDF'``).  The title is ``''`` when the document has no
         info dictionary or no ``Title`` entry.
     """
     with open(path, 'rb') as fp:
         # FIXME: how do we know which encoding to use? Should we
         # use 'chardet' to detect it?
         encoding = 'utf-8'
         parser = PDFParser(fp)
         if HAS_PDFMINER_3K:
             doc = PDFDocument()
             parser.set_document(doc)
             doc.set_parser(parser)
         else:
             doc = PDFDocument(parser)
         # ``doc.info`` is indexed like a list; guard against it being
         # empty instead of failing with IndexError.
         info = doc.info[0] if doc.info else {}
         title = info.get('Title', '')
         if isinstance(title, PDFObjRef):
             title = title.resolve()
         if isinstance(title, bytes):
             # This may not be necessary with pdfminer3k.
             try:
                 title = title.decode(encoding)
             except UnicodeDecodeError:
                 logger.warning('Could not correctly decode title of "%s".', path)
                 title = title.decode(encoding, 'ignore')
         # Rewind so extract_content reads the file from the beginning.
         fp.seek(0)
         content = extract_content(fp, encoding).strip()
         try:
             content = content.decode(encoding)
         except UnicodeDecodeError:
             logger.warning('Could not correctly decode content of "%s".', path)
             content = content.decode(encoding, 'ignore')
     return {
         'title': title,
         'content': content,
         'kind': 'PDF',
     }
コード例 #11
0
ファイル: pdftotext.py プロジェクト: mayhewsw/projects
def pdf_to_text(filename):
    """Convert a PDF to text with START/END markers around each page.

    Args:
        filename: path of the PDF file.

    Returns:
        The contents of the output buffer: for every page a
        "START PAGE n" line, the text written by ``TextConverter`` (UTF-8
        codec) and an "END PAGE n" line, with ``n`` starting at 0.
    """
    from cStringIO import StringIO
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFDocument, PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = TextConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    # ``with`` closes the file even if a page fails to process.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)

        # Materialise the pages once; the original walked get_pages() twice
        # (once to count, once to process).
        pages = list(doc.get_pages())
        # Single-argument call form prints the same on Python 2 and 3.
        print("There are: " + str(len(pages)) + " pages")

        for i, page in enumerate(pages):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)

        device.close()

    return outfp.getvalue()
コード例 #12
0
def extractContent(file):
    """Return the text of a PDF as produced by ``TextConverter``.

    Progress is printed: "extractContent" at the start, "page=<n>" for
    each page and "EOF" at the end.

    Args:
        file: path of the PDF file.

    Returns:
        The contents of the output buffer (written with the UTF-8 codec).
    """
    # Single-argument print calls behave the same on Python 2 and 3.
    print("extractContent")

    # ``with`` closes the file even if a page fails to process.
    with open(file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # The other pdfminer snippets in this file initialize the document
        # with an empty password before reading pages; the original
        # skipped that step here.
        doc.initialize('')

        rsrcmgr = PDFResourceManager()
        codec = 'UTF-8'
        laparams = LAParams()
        outfp = StringIO.StringIO()

        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # NOTE(review): the is_extractable check was disabled in the
        # original and is deliberately left off here.

        for i, page in enumerate(doc.get_pages()):
            print("page=" + str(i))
            if page is not None:
                interpreter.process_page(page)
        print("EOF")
        device.close()

    return outfp.getvalue()
コード例 #13
0
def extractText(file_name):
    """Extract the text of a PDF file.

    Layout analysis runs with ``char_margin`` and ``word_margin`` both set
    to 1.0.  The text of every top-level ``LTTextBox`` or ``LTTextLine``
    is concatenated in page order.

    Args:
        file_name: path of the PDF file.

    Returns:
        The concatenated text (``''`` when nothing is found).
    """
    # ``with`` closes the handle; the original never did.
    with open(file_name, 'rb') as connection:
        parser = PDFParser(connection)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect pieces and join once instead of repeated ``+=``.
        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    pieces.append(lt_obj.get_text())
    return ''.join(pieces)
コード例 #14
0
    def text(self) -> List[Tuple[int, str]]:
        """Return the horizontal-text-box content of each page.

        The PDF at ``self._path`` is read page by page.  For each page the
        stripped text of its ``LTTextBoxHorizontal`` objects is joined
        without a separator.

        Returns:
            A list of ``(zero-based page index, page text)`` tuples.

        Raises:
            Exception: if the document does not allow text extraction.
        """

        from pdfminer.converter import PDFPageAggregator
        from pdfminer.layout import LTTextBoxHorizontal, LAParams
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.pdfparser import PDFParser, PDFDocument

        with open(self._path, 'rb') as stream:
            document = PDFDocument()

            pdf_parser = PDFParser(stream)
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
            document.initialize()
            if not document.is_extractable:
                raise Exception('The Pdf text extraction is not allowed when procesing ' + self._path)

            resources = PDFResourceManager()
            aggregator = PDFPageAggregator(resources, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resources, aggregator)

            pages_text = []
            page_number = 0
            for page in document.get_pages():
                page_interpreter.process_page(page)

                chunks = []
                for item in aggregator.get_result():
                    if isinstance(item, LTTextBoxHorizontal):
                        chunks.append(item.get_text().strip())
                pages_text.append((page_number, ''.join(chunks)))
                page_number += 1

        return pages_text
コード例 #15
0
ファイル: main.py プロジェクト: Apoorva412/TeamStark
def fetch_pdf_urls(file_name):
    """Collect the URIs of link annotations in a PDF.

    Args:
        file_name: path of the PDF file.

    Returns:
        A list of the ``URI`` values found in annotation actions, or ``''``
        if any error occurs (the error is logged).
    """
    try:
        links = []
        # ``with`` closes the file on the error path too; the original only
        # closed it on success.
        with open(file_name, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            # fetches URLs
            for page in doc:
                if 'Annots' not in page.attrs:
                    continue
                link_object_list = page.attrs['Annots']
                # Due to implementation of pdfminer the link_object_list can
                # either be the list directly or a PDF Object reference
                if not isinstance(link_object_list, list):
                    link_object_list = link_object_list.resolve()
                for link_object in link_object_list:
                    if not isinstance(link_object, dict):
                        link_object = link_object.resolve()
                    # Not every annotation has an action with a URI.  The
                    # original indexed ['A']['URI'] directly, so a single
                    # such annotation raised KeyError and discarded all
                    # links already collected.
                    action = link_object.get('A')
                    if action is not None and not isinstance(action, dict):
                        action = action.resolve()
                    uri = action.get('URI') if action else None
                    if uri:
                        links.append(uri)
        return links

    except Exception as e:
        # Best effort by design: callers get '' rather than an exception.
        logging.error('Error while fetching URLs : ' + str(e))
        return ''
コード例 #16
0
def getPDFMetadata(path):
    """Return the metadata of a PDF as a single dict.

    Starts from the first entry of ``doc.info`` and, when the catalog has
    a ``Metadata`` stream, merges in whatever ``xmp_to_dict`` (defined
    elsewhere) returns for it.

    Args:
        path: path of the PDF file.

    Returns:
        A dict of metadata; empty when the document has no info entry and
        no usable XMP metadata.
    """
    # ``with`` closes the file; the original never did.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()

        # ``doc.info`` is indexed like a list.  The original called
        # ``.update()`` on that list, which always failed inside a bare
        # ``except``, so XMP data was never merged.  Work on a copy of the
        # first entry instead, and tolerate an empty list.
        result = dict(doc.info[0]) if doc.info else {}

        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            # The raw XMP payload is not a mapping and cannot be merged
            # with update(); only the parsed form is merged.
            try:
                result.update(xmp_to_dict(metadata))
            except Exception:
                # Deliberate best effort: unparsable XMP must not hide the
                # info-dictionary metadata.
                pass

    return result
コード例 #17
0
ファイル: main2.py プロジェクト: sheep9159/PDF_keyword_find
def readPdf(dir_and_name, pdf_file, num):
    """Copy a PDF into ``result/`` if it mentions "Airbus" or "airbus".

    Args:
        dir_and_name: path of the PDF on disk; this is what gets copied.
        pdf_file: open binary file object for the same PDF; this is what
            gets parsed.
        num: number used for the copy's file name (``result/<num>.pdf``).

    Returns:
        ``num + 1`` if the file was copied, otherwise ``num`` unchanged.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    parser = PDFParser(pdf_file)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)

    doc.initialize()

    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in doc.get_pages():
        interpreter.process_page(page)
        for x in device.get_result():
            if not isinstance(x, LTTextBoxHorizontal):
                continue
            text = x.get_text()
            if 'Airbus' in text or 'airbus' in text:
                file_target = 'result' + '/' + str(num) + '.' + 'pdf'
                shutil.copyfile(dir_and_name, file_target)
                # Stop at the first hit.  The original ``break`` only left
                # the inner loop, so the same PDF was copied again (under a
                # new number) for every further matching page.
                return num + 1

    return num
コード例 #18
0
    def create_pages(self):
        """Parse ``self.pdf_file`` and save one ``Page`` per parsed page.

        The document is initialized with an empty password.  If it allows
        extraction, ``self._parse_pages(doc)`` supplies the page contents
        and each is saved as a ``Page`` numbered from 1 and attached to
        ``self.document``.  If extraction is not allowed, no pages are
        created.
        """

        from public_project.models import Page
        # create a parser object associated with the file object
        parser = PDFParser(self.pdf_file)
        # create a PDFDocument object that stores the document structure
        doc = PDFDocument()
        # connect the parser and document objects
        parser.set_document(doc)
        doc.set_parser(parser)
        # supply the password for initialization
        pdf_pwd = ''
        doc.initialize(pdf_pwd)

        # Bound up front: the original left ``doc_pages`` undefined for a
        # non-extractable document and then failed in the loop below.
        doc_pages = []
        if doc.is_extractable:
            # apply the function and return the result
            doc_pages = self._parse_pages(doc)

        for number, doc_page in enumerate(doc_pages, 1):
            page = Page(
                document=self.document,
                number=number,
                content=smart_unicode(doc_page, encoding='utf-8', strings_only=False, errors='strict'),
            )
            page.save()
コード例 #19
0
ファイル: pdf2text.py プロジェクト: zaim/bukutip
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    """Run selected pages of a PDF through a page interpreter.

    Args:
        rsrcmgr: resource manager handed to ``PDFPageInterpreter``.
        device: device handed to ``PDFPageInterpreter``.
        fp: open binary file object for the PDF.
        pagenums: optional container of zero-based page numbers; when
            given, other pages are skipped.
        maxpages: processing stops once page number ``maxpages - 1`` has
            been processed; a falsy value disables the limit.
        password: password passed to ``doc.initialize``.

    Returns:
        A dict mapping zero-based page number to page object for ALL pages
        of the document, not only the processed ones.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    pages = dict(enumerate(doc.get_pages()))
    # Walk the pages in explicit numeric order: the ``maxpages`` cut-off
    # depends on it, and ``sorted`` (unlike ``iteritems``) works on both
    # Python 2 and 3.
    for num in sorted(pages):
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(pages[num])
        if maxpages and maxpages <= num + 1:
            break
    return pages
コード例 #20
0
ファイル: pdf.py プロジェクト: MikaYuoadas/Docbucket
class Pdf(object):
    """Thin wrapper around a pdfminer document built from an open file."""

    def __init__(self, pdf_file):
        """Parse ``pdf_file`` (an open binary file object) into a document."""
        parser = PDFParser(pdf_file)
        self._doc = PDFDocument()
        parser.set_document(self._doc)
        self._doc.set_parser(parser)
        # The original wrote ``self._doc.initialize`` without parentheses
        # (a no-op) and placed it before set_parser(); actually call it,
        # after the parser has been attached.
        self._doc.initialize()

    @property
    def pages(self):
        """Number of pages in the document."""
        return len(tuple(self._doc.get_pages()))

    def to_text(self):
        """Return the document text, decoded as UTF-8 (errors ignored).

        Layout analysis runs with vertical-text detection and ``all_texts``
        enabled and a word margin of 0.4.
        """
        rsrcmgr = PDFResourceManager()
        output = StringIO()
        laparams = LAParams()
        laparams.detect_vertical = True
        laparams.all_texts = True
        laparams.word_margin = 0.4
        device = TextConverter(rsrcmgr, output, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in self._doc.get_pages():
            interpreter.process_page(page)
        return output.getvalue().decode('utf-8', 'ignore')
コード例 #21
0
def parse_pdf(pdf_url):
    """Download a PDF and return its text lines grouped by page and box.

    Args:
        pdf_url: URL of the PDF to fetch.

    Returns:
        A list with one entry per page.  Each entry is a list holding, for
        every top-level ``LTTextBox``/``LTTextLine`` whose text is not
        blank, the list of lines of that object's text.
    """
    # Close the HTTP response as soon as the body is read; the original
    # never closed it.
    with urllib.request.urlopen(pdf_url) as response:
        remote_file = response.read()
    memory_file = io.BytesIO(remote_file)
    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        page_lines = []
        ret.append(page_lines)
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Fetch the text once instead of on every use.
                text = lt_obj.get_text()
                if text.strip():
                    page_lines.append(text.splitlines())
    return ret
コード例 #22
0
def parse():
    """Extract the text of ``20150623043633273.pdf`` into ``2015g.txt``.

    The text of every ``LTTextBoxHorizontal`` is printed and appended to
    ``2015g.txt`` followed by a newline.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # ``with`` closes the PDF handle; the original never did.
    with open('20150623043633273.pdf', 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)

        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()

            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # The original opened this file without a mode (i.e.
                    # read-only) and then wrote to it, which always fails.
                    # Append mode matches the one-open-per-box structure.
                    with open(r'2015g.txt', 'a', encoding='utf-8') as f:
                        results = x.get_text()
                        print(results)
                        f.write(results + '\n')
コード例 #23
0
ファイル: pdfutils.py プロジェクト: emulbreh/ecs
def pdf_isvalid(filelike):
    """Tell whether a seekable file-like object holds a usable PDF.

    The stream must start with ``PDF_MAGIC`` and must then parse into a
    document that reports itself as extractable.

    @param filelike: filelike object, seekable
    @return: True if valid pdf, else False
    """
    log = logging.getLogger()
    filelike.seek(0)

    header = filelike.read(len(PDF_MAGIC))
    if header != PDF_MAGIC:
        # Wrong magic bytes: reject without attempting to parse.
        return False
    filelike.seek(0)

    try:
        pdf_parser = PDFParser(filelike)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        valid = True if document.is_extractable else False
    except PDFException as excobj:
        log.warning("pdf has valid header but, still not valid pdf, exception was %r" %(excobj))
        valid = False

    # Leave the stream rewound for the caller.
    filelike.seek(0)
    return valid
コード例 #24
0
ファイル: dumppdf.py プロジェクト: joshmgrant/pdfminer
def dumpoutline(outfp, fname, objids, pagenos, password='',
                dumpall=False, codec=None):
    """Write the outline (bookmarks) of a PDF to ``outfp``.

    One line is written per outline entry: the ``repr`` of
    ``(level, title, dest, pageno)``, where ``pageno`` is the zero-based
    page number of the destination or ``None`` when it was not resolved.

    Args:
        outfp: writable file-like object.
        fname: path of the PDF file.
        objids, pagenos, dumpall, codec: not used by this function.
        password: password passed to ``doc.initialize``.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; ``with`` and
    # ``finally`` release the handles even when a step raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)
            # Map each page's object id to its zero-based page number.
            pages = dict((page.pageid, pageno)
                         for (pageno, page) in enumerate(doc.get_pages()))
            for (level, title, dest, a, se) in doc.get_outlines():
                pageno = None
                if dest:
                    dest = resolve1(doc.lookup_name('Dests', dest))
                    if isinstance(dest, dict):
                        dest = dest['D']
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if subtype and repr(subtype) == '/GoTo' and action.get('D'):
                            dest = action['D']
                            pageno = pages[dest[0].objid]
                outfp.write(repr((level, title, dest, pageno)) + '\n')
        finally:
            parser.close()
    return
コード例 #25
0
ファイル: pdf.py プロジェクト: staffanm/protokollen
    def get_metadata(self):
        """Return metadata from both the info field and XMP.

        Every entry of ``doc.info`` is added to a new ``Metadata`` object.
        When the catalog has a ``Metadata`` stream, its ``xap``, ``pdf``
        and ``dc`` sections (as returned by ``xmp_to_dict``) are added too.
        The result is also stored on ``self.metadata``.

        Returns:
            The populated ``Metadata`` object.
        """
        # ``with`` closes the file even when parsing or XMP decoding raises.
        with open(self.path, 'rb') as file_pointer:
            parser = PDFParser(file_pointer)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            metadata = Metadata()
            for i in doc.info:
                metadata.add(i)
            if 'Metadata' in doc.catalog:
                xmp_metadata = resolve1(doc.catalog['Metadata']).get_data()
                xmp_dict = xmp_to_dict(xmp_metadata)
                # Let's add only the most useful ones
                if "xap" in xmp_dict:
                    metadata.add(xmp_dict["xap"])
                if "pdf" in xmp_dict:
                    metadata.add(xmp_dict["pdf"])
                if "dc" in xmp_dict:
                    metadata.add(xmp_dict["dc"], metadataType="dc")

        self.metadata = metadata
        return metadata
コード例 #26
0
ファイル: PDF_Parser.py プロジェクト: samdavey/Random
    def load( self, open_file ):
        """Parse an open PDF file into ``self.text`` (and annotations).

        ``self.fields`` and ``self.text`` are reset to empty dicts.  Each
        page is then interpreted; pages with annotations are passed to
        ``self._build_annotations`` and the value of
        ``self._get_text(device)`` is stored in ``self.text`` under the
        one-based page number.

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids extraction.
        """
        self.fields = {}
        self.text = {}

        # Wire the parser and the document to each other, then unlock the
        # document with an empty password.
        pdf_parser = PDFParser(open_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Shared resources, layout parameters, aggregating device and the
        # interpreter that feeds it.
        resources = PDFResourceManager()
        layout_params = LAParams()
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        page_number = 0
        for current_page in document.get_pages():
            page_number += 1
            page_interpreter.process_page(current_page)
            if current_page.annots:
                self._build_annotations(current_page)
            self.text[page_number] = self._get_text(aggregator)
コード例 #27
0
ファイル: pdfmeta.py プロジェクト: kristerhedfors/bin
def get_pdf_metadata(fileOrUrl, textmode=False, prefix='', basicauth=None):
    """Print the info metadata of a PDF given by path or http(s) URL.

    Args:
        fileOrUrl: local path, or a URL starting with http:// or https://.
        textmode: when true, print one "<fileOrUrl>:<name>=<value>" line
            per metadata entry; otherwise print the info value as a whole,
            preceded by ``prefix``.
        prefix: text printed before the value in non-text mode.
        basicauth: optional value sent as "Authorization: Basic <value>"
            when fetching a URL.
    """
    # NOTE(review): ``args`` is not defined in this function; presumably a
    # module-level list of command-line arguments -- confirm.
    if len(args) > 1:
        prefix = fileOrUrl + ':'
    if fileOrUrl.startswith(('http://', 'https://')):
        request = urllib2.Request(fileOrUrl)
        if basicauth:
            request.add_header('Authorization', 'Basic ' + basicauth)
        fobj = urllib2.urlopen(request)
        try:
            pdfdata = fobj.read()
        finally:
            fobj.close()
        fp = StringIO.StringIO(pdfdata)
    else:
        fp = open(fileOrUrl, 'rb')
    # Close the handle even when parsing raises.
    try:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
    finally:
        fp.close()
    # Single-argument print calls behave the same on Python 2 and 3.
    if textmode:
        for obj in doc.info:
            for (name, val) in obj.iteritems():
                print('{0}:{1}={2}'.format(
                    fileOrUrl, name, val
                ))
    else:
        val = doc.info
        if isinstance(val, list) and len(val) == 1:
            val = val[0]
        print(prefix + str(val))
コード例 #28
0
ファイル: metadataPDF.py プロジェクト: TechByTom/metagoofil
	def getData(self):
		"""Load the trailer Info dictionary of ``self.fname``.

		On success the resolved Info dictionary is stored on both
		``self.metadata`` and ``self.raw``; when several xrefs carry one,
		the last one wins.

		Returns:
			"error" if the document cannot be parsed or initialized,
			"Empty metadata" if no Info dictionary is found, "ok"
			otherwise.  If reading the trailers raises, the exception
			object itself is returned.
		"""
		doc = PDFDocument()
		# ``open`` replaces the Python-2-only ``file`` builtin.
		fp = open(self.fname, 'rb')
		try:
			parser = PDFParser(fp)
			try:
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(self.password)
			except Exception:
				# The original returned here without closing anything.
				return "error"
			finally:
				parser.close()
		finally:
			fp.close()
		try:
			# Start from None so an xref without an Info entry no longer
			# trips over an unbound ``info``.
			info = None
			for xref in doc.xrefs:
				info_ref = xref.trailer.get('Info')
				if info_ref:
					info = resolve1(info_ref)
			self.metadata = info
			self.raw = info
			if self.raw is None:
				return "Empty metadata"
			return "ok"
		except Exception as e:
			# The original had a print after this return; it was unreachable.
			return e
コード例 #29
0
ファイル: pdfinfo.py プロジェクト: larscwallin/pdfdig
    def get_toc(self):
        """Return the outline titles at the level picked by ``get_level1``.

        Also sets ``self.title`` from the document's ``Title`` info entry
        when one is present.

        Returns:
            A list of normalized outline titles, or ``None`` when the
            outlines cannot be read or the level cannot be determined.
        """
        # ``with`` closes the PDF on every exit path; the original never
        # closed it.
        with open(self.pdf, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            # title
            if doc.info:
                metadict = doc.info[0]
                if 'Title' in metadict:
                    self.title = normalize_title(metadict['Title'])

            # level 1 of toc
            try:
                outlines = doc.get_outlines()
                select_level = self.get_level1(outlines)
            except Exception:
                # Narrowed from a bare ``except`` so KeyboardInterrupt and
                # SystemExit are no longer swallowed.
                return None
            # get_outlines() is called again on purpose: the first result
            # was handed to get_level1 above.
            return [normalize_toc_item(title)
                    for (level, title, dest, a, se) in doc.get_outlines()
                    if level == select_level]
コード例 #30
0
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed for page-layout analysis.

    Args:
        fh: open binary file object for the PDF.

    Returns:
        A ``(doc, interpreter, device)`` tuple, where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` feeds pages into it.

    Raises:
        ValueError: if the document does not allow text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize("")
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # The original first built a plain PDFDevice and interpreter here and
    # immediately overwrote both; that dead pair is dropped.

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
コード例 #31
0
ファイル: iocp.py プロジェクト: sebdraven/ioc_parser
    def parse_pdf_pdfminer(self, f, fpath):
        """Extract each page's text with pdfminer and feed it to parse_page.

        The handler's header is printed first.  Every page is converted to
        text (with ``all_texts`` enabled) and passed, UTF-8 encoded,
        together with its one-based page number to ``self.parse_page``.
        The footer is printed last.  Any exception other than
        KeyboardInterrupt/SystemExit is reported through
        ``self.handler.print_error`` instead of propagating.

        Args:
            f: open binary file object for the PDF.
            fpath: path used for reporting and passed to ``parse_page``.
        """
        try:
            laparams = LAParams()
            laparams.all_texts = True
            rsrcmgr = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            parser = PDFParser(f)
            doc = PDFDocument(caching=True)

            parser.set_document(doc)
            doc.set_parser(parser)
            # enumerate replaces the hand-maintained counter; the unused
            # ``pagenos`` set is dropped.
            for page_num, page in enumerate(doc.get_pages(), 1):
                retstr = StringIO()
                try:
                    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    data = retstr.getvalue()
                    self.parse_page(fpath, bytes(data, 'UTF-8'), page_num)
                finally:
                    # Release the per-page buffer even when a page fails.
                    retstr.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
コード例 #32
0
ファイル: MyPdfMiner.py プロジェクト: i11uminator/bookservice
 def WithPdf(self, pdfdoc, password, fn, *args):
     """Open the pdf document, and apply the function, returning the results.

     Args:
         pdfdoc: path of the PDF file.
         password: when truthy it replaces ``self.password``; the document
             is always initialized with ``self.password``.
         fn: callable invoked as ``fn(doc, *args)`` if the document allows
             extraction.
         *args: extra positional arguments for ``fn``.

     Returns:
         Whatever ``fn`` returns, or ``None`` when the document is not
         extractable or an ``IOError`` occurs.
     """
     result = None
     try:
         # ``with`` closes the file even if initialize() or fn() raises
         # something other than IOError; the original leaked it then.
         with open(pdfdoc, 'rb') as fp:
             # create a parser object associated with the file object
             parser = PDFParser(fp)
             # create a PDFDocument object that stores the document structure
             doc = PDFDocument()
             # connect the parser and document objects
             parser.set_document(doc)
             doc.set_parser(parser)
             # supply the password for initialization
             if password:
                 self.password = password
             doc.initialize(self.password)

             if doc.is_extractable:
                 # apply the function and return the result
                 result = fn(doc, *args)
     except IOError:
         # the file doesn't exist or similar problem
         pass
     return result
コード例 #33
0
ファイル: pdf.py プロジェクト: hsoft/pdfmasher
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every text-based element (LTText) from it.

    Args:
        path: Path of the PDF file.
        j: Job object providing ``iter_with_progress(iterable, message)``;
            used to report progress while the pages are read.

    Returns:
        A ``(pages, all_elements)`` tuple: one ``Page`` per PDF page (built
        from the layout width and height) and the flat list of elements,
        each tagged with its ``page`` number and its ``order`` in the page.
    """
    pages = []
    all_elements = []
    # Keep the file open for the whole extraction and close it afterwards
    # (it was previously never closed).
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements.extend(elements)
    return pages, all_elements
コード例 #34
0
ファイル: pdfInvoiceMiner.py プロジェクト: vinovator/Vinlab
def read_invoice_pdfminer3k(pdfFile):
    """Read one invoice PDF and record its client and invoice number.

    The file is looked up inside the module-level ``invoice_path``. The
    client name is cut out of the extracted text, the invoice number out of
    the file name, and both are printed and passed to ``write_excel``.

    Args:
        pdfFile: File name of the invoice, relative to ``invoice_path``.
    """
    text_parts = []
    # os.path.join inserts the platform separator (the old code hard-coded a
    # backslash); the context manager closes the file after extraction.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    text_parts.append(lt_obj.get_text())
    invoice_text = "".join(text_parts)

    # Extract client info from the string extracted from the PDF.
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract the invoice number from the PDF file name.
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice number on to the Excel writer.
    write_excel(client, invoice_no)
コード例 #35
0
class PdfSerializer(object):
    """Extract the text of a PDF file and optionally save it as ``.txt``."""

    def __init__(self, filename):
        """Open ``filename`` and initialise a pdfminer document for it.

        The file handle is kept on the instance because ``getString`` reads
        pages from the document later; call ``close()`` when done.
        """
        self.__filename = filename

        self.__fp = open(self.__filename, 'rb')
        try:
            parser = PDFParser(self.__fp)
            self.__doc = PDFDocument()
            parser.set_document(self.__doc)
            self.__doc.set_parser(parser)
            self.__doc.initialize('')
        except Exception:
            # Do not leak the handle when the document cannot be set up.
            self.__fp.close()
            raise

    def close(self):
        """Close the underlying PDF file handle."""
        self.__fp.close()

    def writeToTxt(self):
        """Write the extracted text next to the PDF with a ``.txt`` extension.

        Characters outside ASCII are replaced by ``?``.
        """
        import os

        text = self.getString()
        # Swap only the file extension. A plain str.replace(".pdf", ".txt")
        # left names such as "x.PDF" unchanged (so the PDF itself was
        # overwritten) and also rewrote ".pdf" inside directory names.
        root, ext = os.path.splitext(self.__filename)
        if ext.lower() == ".txt":
            # Never write over the input file.
            root = self.__filename
        with open(root + ".txt", "w") as txtFile:
            txtFile.write(text.encode('ascii', 'replace').decode("utf-8"))

    def getString(self):
        """Return the text of all pages as a single string."""
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        string = StringIO()
        device = TextConverter(rsrcmgr, string, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in self.__doc.get_pages():
                interpreter.process_page(page)
            return string.getvalue()
        finally:
            device.close()
コード例 #36
0
ファイル: book_parser.py プロジェクト: ArcainOne/anathema
    def pdf_function(pdf_doc, password='', *args, **kwargs):
        """Open ``pdf_doc`` and apply the wrapped ``function`` to the document.

        Args:
            pdf_doc: Path of the PDF file.
            password: Password used to initialise the document.
            *args, **kwargs: Forwarded to ``function`` after the document.

        Returns:
            The result of ``function(doc, *args, **kwargs)``, or ``None``
            when the document is not extractable or an ``IOError`` occurred.
        """
        result = None
        try:
            # The context manager closes the file even if ``function`` raises.
            with open(pdf_doc, 'rb') as fp:
                # Create the parser / document pair and connect them.
                parser = PDFParser(fp)
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
                # Supply the password for initialization.
                doc.initialize(password)

                if doc.is_extractable:
                    result = function(doc, *args, **kwargs)
        except IOError:
            # Deliberate best effort: a missing or unreadable file yields None.
            pass
        return result
コード例 #37
0
def getData(fileName):
    """Print selected ``Info`` metadata fields of a PDF file.

    For every cross-reference table that has an ``Info`` entry, the Author,
    Company, Producer and Creator fields are printed when present.

    Args:
        fileName: Path of the PDF file.

    Returns:
        ``"error"`` when the parser and document cannot be connected,
        ``"Empty metadata"`` when an ``Info`` reference resolves to ``None``,
        the caught exception when reading the metadata fails, and ``None``
        otherwise.
    """
    doc = PDFDocument()
    fp = open(fileName, 'rb')
    try:
        parser = PDFParser(fp)
        try:
            parser.set_document(doc)
            doc.set_parser(parser)
        except Exception:
            return "error"

        try:
            for xref in doc.xrefs:
                info_ref = xref.trailer.get('Info')
                if not info_ref:
                    # No Info entry in this table. The old code then used an
                    # unbound or stale ``info`` value here.
                    continue
                metadata = resolve1(info_ref)
                if metadata is None:
                    return "Empty metadata"
                for field in ('Author', 'Company', 'Producer', 'Creator'):
                    if field in metadata:
                        print(field + " " + metadata[field])
        except Exception as e:
            print("\t [x] Error in PDF extractor")
            return e
        finally:
            parser.close()
    finally:
        # Close only after the metadata has been resolved; the file used to
        # be closed before the Info references were read.
        fp.close()
コード例 #38
0
ファイル: autosumpdf.py プロジェクト: suriyan/autosum
def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF at ``path`` as one string."""
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())

    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument(caching=True)
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        # Feed each page through the interpreter; the converter writes the
        # page text into the buffer.
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        for pdf_page in document.get_pages():
            page_interpreter.process_page(pdf_page)
        extracted = text_buffer.getvalue()

    converter.close()
    text_buffer.close()
    return extracted
コード例 #39
0
ファイル: PdfParser.py プロジェクト: hcouch21/styloproject
    def parse(self, path):
        """Extract the text of the PDF at ``path`` and wrap it in a ``Sample``.

        Args:
            path: Path of a PDF file. Directories are not supported.

        Returns:
            ``Sample(path, None, text)`` where ``text`` is the UTF-8 encoded
            text produced by the converter.

        Raises:
            NotImplementedError: If ``path`` is a directory.
        """
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
        # PDFs are binary: open in 'rb' (the old ``file(path)`` call used text
        # mode) and make sure the handle is closed again.
        fp = open(path, 'rb')
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
            device.close()
        finally:
            fp.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
コード例 #40
0
ファイル: dumppdf.py プロジェクト: Adniel/ComparePdf
def dumppdf(outfp, fname, objids, pagenos, password='',
            dumpall=False, codec=None):
    """Dump selected parts of a PDF file to ``outfp`` as XML.

    Args:
        outfp: Writable stream that receives the dump.
        fname: Path of the PDF file.
        objids: Object ids to dump individually (may be empty).
        pagenos: Zero-based page numbers to dump (may be empty).
        password: Password used to initialise the document.
        dumpall: When true, dump every object of the document.
        codec: Stream codec passed to ``dumpxml``. When set, the content
            streams of the selected pages are dumped instead of the page
            attributes.

    When ``objids``, ``pagenos`` and ``dumpall`` are all falsy, the document
    trailers are dumped instead.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; the context
    # manager closes the handle even when dumping raises.
    with open(fname, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        if objids:
            for objid in objids:
                obj = doc.getobj(objid)
                dumpxml(outfp, obj, codec=codec)
        if pagenos:
            for (pageno, page) in enumerate(doc.get_pages()):
                if pageno in pagenos:
                    if codec:
                        for obj in page.contents:
                            obj = stream_value(obj)
                            dumpxml(outfp, obj, codec=codec)
                    else:
                        dumpxml(outfp, page.attrs)
        if dumpall:
            dumpallobjs(outfp, doc, codec=codec)
        if (not objids) and (not pagenos) and (not dumpall):
            dumptrailers(outfp, doc)
    if codec not in ('raw', 'binary'):
        outfp.write('\n')
    return
コード例 #41
0
ファイル: statement2csv.py プロジェクト: jlas/misc
def pdf_to_csv(filename):
    """Convert the PDF at ``filename`` to CSV text using ``CsvConverter``.

    The output of each page is wrapped in ``START PAGE n`` / ``END PAGE n``
    marker lines (``n`` is zero-based).

    Returns:
        The complete converter output as a string.
    """
    # The following is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    # utf-8 because the test documents are utf-8 (it is also the default).
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())

    doc = PDFDocument()
    # The context manager closes the input even when conversion raises.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        interpreter = PDFPageInterpreter(rsrc, device)
        for i, page in enumerate(doc.get_pages()):
            outfp.write("START PAGE %d\n" % i)
            if page is not None:
                interpreter.process_page(page)
            outfp.write("END PAGE %d\n" % i)

        device.close()

    return outfp.getvalue()
コード例 #42
0
ファイル: showcells.py プロジェクト: aliounedia/scraptils
def pdf2csv(fp):
    """Render the cell grid found on each page of a PDF to ``out<N>.png``.

    For every page the x and y coordinates of all ``LTRect`` / ``LTLine``
    objects are collected, de-duplicated, sorted and passed through
    ``filterclose``. Every pair of neighbouring coordinates that is more than
    5 units apart in both directions is drawn as a rectangle outline on a
    1-bit image the size of the page.

    Args:
        fp: Open binary file object of the PDF.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Create the parser / document pair and connect them.
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization (empty string: no password).
    doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        hlines = []
        vlines = []
        for item in layout:
            # Exact type match on purpose: subclasses are not collected.
            if type(item) not in (LTRect, LTLine):
                continue
            hlines.append(int(item.x0))
            hlines.append(int(item.x1))
            # Flip y so that the origin is the top-left corner of the image.
            vlines.append(int(layout.height - item.y0))
            vlines.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        print((layout.width, layout.height))
        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        # Walk neighbouring coordinate pairs; gaps of 5 or less are skipped.
        for top, bottom in zip(vlines, vlines[1:]):
            if not bottom - top > 5:
                continue
            for left, right in zip(hlines, hlines[1:]):
                if not right - left > 5:
                    continue
                draw.rectangle(
                    [(int(left), int(top)), (int(right), int(bottom))],
                    outline=1)
        del draw
        # A separate name keeps the ``fp`` argument from being rebound to the
        # output file.
        with open("out%s.png" % pageno, 'wb') as png_file:
            im.save(png_file, "PNG")
コード例 #43
0
ファイル: miner.py プロジェクト: ReimuYk/Group16-PaperClip
def parse(path):
    """Parse the PDF at ``path`` into a ``pdf`` object of pages/boxes/lines.

    For every page a new page is started on the result and its size is set
    from ``getPageSize(page)``. Every horizontal text box starts a new box,
    and every line in it starts a new line that is passed to ``divideWord``.

    Args:
        path: Path of the PDF file.

    Returns:
        The populated ``pdf`` object.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Open in binary mode; the context manager closes the file afterwards
    # (it was previously never closed).
    with open(path, 'rb') as fp:
        # Create the parser / document pair and connect them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # No password is supplied.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        paper = pdf()
        # Process one page at a time.
        for page in doc.get_pages():
            print(page)
            paper.newPage()
            size = getPageSize(page)
            paper.setSize(size)
            interpreter.process_page(page)
            # LTPage object holding the objects parsed from this page.
            layout = device.get_result()
            for box in layout:
                if isinstance(box, LTTextBoxHorizontal):
                    paper.newBox()
                    for line in box:
                        paper.newLine()
                        paper.divideWord(line)
        return paper
コード例 #44
0
def MapFactory(map_path):
    """Build the map object matching the image stored in a PDF.

    The stream object with index ``_PDF_OBJ_INDEX_`` is read from the PDF.
    Its ``Width`` / ``Height`` select one of the ``MapA*`` classes, which is
    instantiated with a PPM image built from the stream data. A stream whose
    height is 1 is handled by ``_ProcessWeirdPDF``, which supplies both the
    data and the real height.

    Args:
        map_path: Path of the PDF file.

    Returns:
        An instance of the matching map class, or ``None`` when the file
        cannot be opened or parsed, the object is not a suitable stream, or
        no map class matches its dimensions.
    """
    try:
        map_file = open(map_path, "rb")
    except Exception:
        return None

    # Close the file once the image data has been read (it used to leak).
    with map_file:
        document = PDFDocument()

        try:
            parser = PDFParser(map_file)
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize("")
        except Exception:
            return None

        obj = document.getobj(_PDF_OBJ_INDEX_)
        if not obj or not isinstance(obj, PDFStream):
            return None

        for required_key in ("Width", "Height", "ColorSpace"):
            if required_key not in obj:
                return None

        width = obj["Width"]
        height = obj["Height"]

        if height == 1:
            data, height = _ProcessWeirdPDF(document)
        else:
            data = obj.get_data()

        # Candidates in the same order as the former if/elif chain.
        candidates = (
            MapA4Portrait, MapA4Landscape,
            MapA3Portrait, MapA3Landscape,
            MapA2Portrait, MapA2Landscape,
            MapA1Portrait, MapA1Landscape,
        )
        for map_class in candidates:
            if width == map_class.WIDTH and height == map_class.HEIGHT:
                return map_class(_MakePPMImage(width, height, data), map_path)
        return None
コード例 #45
0
def get_doc_pages(filename):
    """Return the result of ``get_pages()`` for the PDF at ``filename``.

    The file is opened here and intentionally left open: it is not closed
    before the pages are handed back to the caller.
    """
    pdf_file = open(filename, 'rb')
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument()
    # Connect parser and document, then initialise without a password.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize()
    page_iter = document.get_pages()
    return page_iter
コード例 #46
0
def parsePDFfile(filepath):
    """Parse the PDF at ``filepath`` and return its ``PDFDocument``.

    The file is closed before returning, also when the parser or document
    setup raises.

    NOTE(review): the document is returned after its file has been closed;
    confirm callers only use data that was already loaded during setup.
    """
    with open(filepath, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        doc.set_parser(parser)
    return doc
コード例 #47
0
def read_pdf(pdf, fileName):
    """Extract two labelled values from the first page of a PDF.

    Only the first page is processed. The text of every horizontal text box
    is collected; for each text containing one of the two field labels, the
    value is taken either from the middle line of that text (when it holds
    exactly two newlines) or from the next text box. When exactly two values
    are found, ``[fileName, value1, value2]`` is appended to the module-level
    ``input`` list.

    Args:
        pdf: Open binary file object of the PDF.
        fileName: Name recorded as the first element of the result row.
    """
    # Create the parser / document pair and connect them.
    parser = PDFParser(pdf)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # The document has no password: initialise with an empty string.
    doc.initialize("")
    # Resource manager, layout parameters, aggregator and page interpreter.
    resource = PDFResourceManager()
    laparam = LAParams()
    device = PDFPageAggregator(resource, laparams=laparam)
    interpreter = PDFPageInterpreter(resource, device)

    for page in doc.get_pages():
        interpreter.process_page(page)
        # LTPage object holding the objects parsed from this page.
        layout = device.get_result()
        lines = []
        for box in layout:
            if isinstance(box, LTTextBoxHorizontal):
                # Round-trip through the escape codecs so that literal \uXXXX
                # sequences in the text become real characters.
                text = box.get_text().encode('raw_unicode_escape').decode(
                    'unicode_escape')
                print(box)
                lines.append(text)

        if lines:
            info = [fileName]
            for pos, line in enumerate(lines):
                if '经营者姓名' in line or '身份证号码' in line:
                    if line.count('\n') == 2:
                        # Exactly two newlines: the value is the middle line.
                        matches = re.findall(".*\n(.*)\n.*", line)
                        if matches:
                            print('过滤取数据 = ' + matches[0])
                            info.append(matches[0])
                    elif pos + 1 < len(lines):
                        # Otherwise the value is the next text box.
                        print('取下一个下标数据 = ' + lines[pos + 1].strip())
                        info.append(lines[pos + 1].strip())

            if len(info) == 3:
                input.append(info)
        # Only the first page is of interest.
        break
コード例 #48
0
def from_pdf_to_txt(read_file, write_file, page_start=0, page_end=0):
    """Append the text of a page range of a PDF to a UTF-8 text file.

    :param read_file: str. Path of the source ".pdf" file.
    :param write_file: str. Path of the target ".txt" file (opened in append
        mode; it is only created when there is text to write).
    :param page_start: int. Zero-based index of the first page to convert.
    :param page_end: int. Index one past the last page; 0 means "to the end".
    :return: None
    :raises PDFTextExtractionNotAllowed: if the PDF forbids text extraction.
    """
    # The context manager closes the source file on every exit path,
    # including the "extraction not allowed" error.
    with open(read_file, 'rb') as origin_pdf_file:
        # Create the parser / document pair and connect them to each other.
        parser = PDFParser(origin_pdf_file)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # No password is supplied.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter.
        srcmgr = PDFResourceManager()
        device = PDFPageAggregator(srcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(srcmgr, device)

        pages = list(doc.get_pages())
        if page_end == 0:
            page_end = len(pages)

        for i in range(page_start, page_end):
            interpreter.process_page(pages[i])

            # LTPage object holding the objects parsed from this page; only
            # horizontal text boxes carry the text we want.
            layout = device.get_result()
            chunks = [
                x.get_text() + '\n'
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            # Open the output once per page instead of once per text box.
            if chunks:
                with open(write_file, 'a', encoding='utf-8') as f:
                    f.writelines(chunks)
コード例 #49
0
def parsePDF(pdfFile):
    """Return the text of every horizontal text box in a PDF.

    Args:
        pdfFile: Path of the PDF file.

    Returns:
        A list of strings whose first element is always ``'.'``, followed by
        the text of each horizontal text box in page order; ``None`` when the
        document does not allow text extraction.
    """
    # Open in binary mode; the context manager closes the file afterwards
    # (it was previously never closed).
    with open(pdfFile, 'rb') as fp:
        # Create the parser / document pair and connect them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # Collected text; seeded with a single '.' entry.
        textlist = ['.']

        # No password is supplied.
        doc.initialize()

        # Give up when the document does not allow text extraction.
        if not doc.is_extractable:
            return None

        # Resource manager, layout aggregator and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the objects parsed from this page.
            layout = device.get_result()
            textlist.extend(
                x.get_text()
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            )

        device.close()

    return textlist
コード例 #50
0
def dumpoutline(outfp,
                fname,
                objids,
                pagenos,
                password='',
                dumpall=False,
                codec=None):
    """Write the outline (bookmarks) of a PDF to ``outfp`` as XML.

    Each outline entry is written with its level and title, followed by its
    resolved destination and the zero-based page number when available.
    Nothing is written when the document has no outlines.

    Args:
        outfp: Writable stream that receives the XML.
        fname: Path of the PDF file.
        objids, pagenos, dumpall, codec: Accepted but not used here.
        password: Password used to initialise the document.
    """
    doc = PDFDocument()
    # ``open`` replaces the Python-2-only ``file`` builtin; the try/finally
    # closes the handle even when dumping raises.
    fp = open(fname, 'rb')
    try:
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize(password)
        # Map page object id -> zero-based page number.
        pages = dict(
            (page.pageid, pageno)
            for (pageno, page) in enumerate(doc.get_pages()))

        def resolve_dest(dest):
            # Named destinations (string or literal) are looked up in the
            # document; dictionary destinations carry the target under 'D'.
            if isinstance(dest, str):
                dest = resolve1(doc.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(doc.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            return dest

        try:
            outlines = doc.get_outlines()
            outfp.write('<outlines>\n')
            for (level, title, dest, a, se) in outlines:
                pageno = None
                if dest:
                    dest = resolve_dest(dest)
                    pageno = pages[dest[0].objid]
                elif a:
                    action = a.resolve()
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        if (subtype and repr(subtype) == '/GoTo'
                                and action.get('D')):
                            dest = resolve_dest(action['D'])
                            pageno = pages[dest[0].objid]
                s = e(title).encode('utf-8', 'xmlcharrefreplace')
                outfp.write('<outline level="%r" title="%s">\n' % (level, s))
                if dest is not None:
                    outfp.write('<dest>')
                    dumpxml(outfp, dest)
                    outfp.write('</dest>\n')
                if pageno is not None:
                    outfp.write('<pageno>%r</pageno>\n' % pageno)
                outfp.write('</outline>\n')
            outfp.write('</outlines>\n')
        except PDFNoOutlines:
            pass
        parser.close()
    finally:
        fp.close()
    return
コード例 #51
0
ファイル: test.py プロジェクト: quyf88/ocr
def pdf_is_text(file_path):
    """Report whether a PDF is text-based; currently always returns False.

    The detection logic below the early ``return`` is disabled (marked TODO)
    and never runs. It would count the characters in the horizontal text
    boxes of the first three pages and treat the PDF as image-based when the
    largest count is below 50.
    """
    # TODO
    return False
    try:
        with open(file_path, 'rb') as pdf_file:
            parser = PDFParser(pdf_file)
            document = PDFDocument()
            # Connect parser and document; no password is supplied.
            parser.set_document(document)
            document.set_parser(parser)
            document.initialize()

            # Encrypted input: parse the decrypted copy instead.
            if document.encryption:
                # pdf = fitz.Document(file_path)
                # pdf.save('fitz_decrypt.pdf')
                with open('fitz_decrypt.pdf', 'rb') as decrypted_file:
                    parser = PDFParser(decrypted_file)
                    document = PDFDocument()
                    parser.set_document(document)
                    document.set_parser(parser)
                    document.initialize()

            # print('is_extractable', document.is_extractable)
            resource_manager = PDFResourceManager()
            aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
            page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
            # Character counts of the first three pages.
            first_three = [0, 0, 0]
            for index, page in enumerate(document.get_pages()):
                if index >= 3:
                    break
                page_interpreter.process_page(page)
                # LTPage object holding the objects parsed from this page.
                for item in aggregator.get_result():
                    if isinstance(item, LTTextBoxHorizontal):
                        print(item.get_text())
                        first_three[index] += len(item.get_text())
            # print(first_three)
            # Fewer than 50 characters on each of the first three pages
            # means the PDF is treated as image-based.
            return not max(first_three) < 50
    except Exception as ex:
        return False
コード例 #52
0
def getpaperPDFtitle(paperpdfpath):
    """Guess the title of a paper from the first page of its PDF.

    The second horizontal text box of the first page is taken as the title.
    A multi-line box is joined into one string with a space after every
    line. Each inspected box and the final title are printed.

    Args:
        paperpdfpath: Path of the PDF file.

    Returns:
        The title string (empty when the first page has fewer than two text
        boxes). A title longer than 255 characters is cut to its first 32.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Number of text boxes seen so far.
    linecount = 0
    strtitle = ''
    # The context manager closes the file afterwards (it used to leak).
    with open(paperpdfpath, 'rb') as fp:
        # Create the parser / document pair and connect them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # No password is supplied.
        doc.initialize()
        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    linecontent = x.get_text()
                    print(linecontent)
                    linecount += 1

                    # The title is usually the second text box.
                    if linecount == 2:
                        title_lines = linecontent.splitlines()
                        if len(title_lines) > 1:
                            strtitle = "".join(
                                part + " " for part in title_lines)
                        elif title_lines:
                            strtitle = title_lines[0]
                        break

            # Only the first page is inspected.
            break
    print(strtitle)
    # NOTE(review): over-long titles are cut to 32 characters, not 255;
    # confirm this is intended.
    if len(strtitle) > 255:
        return strtitle[:32]
    return strtitle
コード例 #53
0
def get_name(urls, name, time, names):
    """Extract the text of a downloaded PDF and store it as a news record.

    The PDF ``<names>.pdf`` is read from the fixed download directory, the
    text of its horizontal text boxes is appended to ``<names>.txt`` in the
    fixed text directory, and, unless ``urls`` is already cached, a record
    with the title, time, content and URL is saved through ``KBoxBrUtils``.

    Args:
        urls: Source URL of the document; also used as the cache key.
        name: Title stored with the record.
        time: Time string; square brackets are stripped before storing.
        names: Base file name (without extension) of the PDF and text file.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Backslashes are escaped explicitly; the values are unchanged, but the
    # old literals relied on the invalid escape sequence "\P".
    path1 = 'D:\\PDF_530\\'
    path = 'D:\\Pdf_text\\'
    content_parts = []
    # The context manager closes the PDF afterwards (it used to leak).
    with open(path1 + names + '.pdf', 'rb') as fp:
        # Create the parser / document pair and connect them.
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # No password is supplied.
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the objects parsed from this page.
            layout = device.get_result()
            page_texts = [
                x.get_text()
                for x in layout
                if isinstance(x, LTTextBoxHorizontal)
            ]
            # Open the text file once per page instead of once per text box.
            if page_texts:
                with open(path + names + '.txt', 'a') as f:
                    for results in page_texts:
                        f.write(results + '\n')
            content_parts.extend(page_texts)
    content = "".join(content_parts)

    print(urls)
    if KBoxBrUtils.getCache(urls) == 0:
        KBoxBrUtils.setCache(urls)
        print(name)
        times = time.replace('[', '').replace(']', '')
        print(times)
        insert = KBoxBrUtils.saveToDB("DT_NEWS_STANDARD_DATA",
                                      [{
                                          "NEWS_TITLE": name,
                                          "NEWS_TIME": times,
                                          "NEWS_CONTENT": content,
                                          "NEWS_FROM": "深交所",
                                          "NEWS_URL": urls,
                                      }])
        print(insert)
コード例 #54
0
ファイル: app_gpxx.py プロジェクト: sspp123/sxywb_tushare
def convert_pdf_to_txt(_path):
    """Download a PDF from a URL and return the text of its text boxes.

    Args:
        _path: URL of the online PDF document.

    Returns:
        The concatenated text of every horizontal text box, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Request the online PDF with the configured User-Agent header.
    request = Request(url=_path,
                      headers={'User-Agent':
                               USER_AGENT})
    fp = urlopen(request)
    try:
        # Create the parser / document pair and connect them.
        parser_pdf = PDFParser(fp)
        doc = PDFDocument()
        parser_pdf.set_document(doc)
        doc.set_parser(parser_pdf)

        # No password is supplied.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout aggregator and page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        parts = []
        # Process one page at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # LTPage object holding the objects parsed from this page; only
            # horizontal text boxes are kept.
            layout = device.get_result()
            parts.extend(
                out.get_text()
                for out in layout
                if isinstance(out, LTTextBoxHorizontal)
            )
        return ''.join(parts)
    finally:
        # The old close() sat after the return statement and never ran.
        fp.close()
コード例 #55
0
def load_form(filename):
    """Read the AcroForm fields of a PDF into a nested list of name/value tuples."""
    with open(filename, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize()
        # The form definition lives under the catalog's 'AcroForm' entry.
        acro_form = resolve1(document.catalog['AcroForm'])
        field_refs = acro_form['Fields']
        return [load_fields(resolve1(ref)) for ref in field_refs]
コード例 #56
0
ファイル: PDFTables1.py プロジェクト: fagan2888/PDFTables
def open_pdf(filepath):
    """Read a PDF file and return an initialised PDFMiner document object.

    The file handle is not closed here; it stays open after the document is
    returned.
    """
    pdf_file = open(filepath, 'rb')
    pdf_parser = PDFParser(pdf_file)
    document = PDFDocument()
    # Connect parser and document, then initialise with an empty password.
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)
    document.initialize('')

    return document
コード例 #57
0
def parse1(pdf_path=r'D:\1\002.pdf', txt_path=r'D:\1\test.txt'):
    """Extract the text of a PDF, print it, and append it to a text file.

    Every layout object that exposes ``get_text`` is printed and appended
    (followed by a newline) to ``txt_path``.

    Compared with the earlier version, the PDF handle is now closed when
    the function exits, and the output file is opened once instead of once
    per text object. As a consequence the output file is created even if
    the PDF yields no text objects.

    Args:
        pdf_path: PDF file to read. Defaults to the previously hard-coded
            path.
        txt_path: Text file opened in append mode with UTF-8 encoding.
            Defaults to the previously hard-coded path.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    # Open the PDF in binary read mode; ``with`` releases the handle even
    # when extraction fails part-way through.
    with open(pdf_path, 'rb') as fn:
        # Link the parser and the document object to each other.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # An empty string is used because the document has no password.
        doc.initialize("")

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregating device and
        # page interpreter.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        with open(txt_path, 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout container holding the objects parsed from this page.
                layout = device.get_result()

                for out in layout:
                    # Only objects that expose get_text() carry text.
                    if hasattr(out, "get_text"):
                        text = out.get_text()
                        print(text)
                        f.write(text + '\n')
コード例 #58
0
    def translate(self):
        """Read the PDF at ``self.fullPath``, translate it, and write the result.

        For every non-empty ``LTTextBoxHorizontal`` on every page, the
        stripped text is passed (with ``"\\r\\n"`` removed) to
        ``translate_func``. The original text and then the translation
        (or a fallback marker when the translation is falsy) are written
        via ``self.write``. A running count of translated boxes is printed
        as progress.

        The PDF is opened with ``with`` so that the handle is closed on
        every exit path, including the early return for non-extractable
        documents and any exception raised while translating.

        Returns:
            None. If the document is not extractable, a message is logged
            and the method returns without translating.
        """
        # Open the local PDF in binary read mode.
        with open(self.fullPath, 'rb') as fp:
            # Link the parser and the document object to each other.
            praser_pdf = PDFParser(fp)
            doc_pdf = PDFDocument()
            praser_pdf.set_document(doc_pdf)
            doc_pdf.set_parser(praser_pdf)
            # No password is supplied; pass one here for protected files.
            doc_pdf.initialize()

            # Without text extraction there is nothing to translate.
            if not doc_pdf.is_extractable:
                Logger().write(self.fileName + '未能提取有效的文本,停止翻译。')
                return

            # Resource manager, layout parameters, aggregating device and
            # page interpreter.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            translated_count = 0
            for page in doc_pdf.get_pages():
                interpreter.process_page(page)
                # Layout container holding the objects parsed from this page.
                layout = device.get_result()

                for out in layout:
                    # Only horizontal text boxes are translated.
                    if not isinstance(out, LTTextBoxHorizontal):
                        continue
                    content = out.get_text().strip()
                    if not content:
                        continue

                    to_trans_content = content.replace("\r\n", "")
                    ret = translate_func(to_trans_content)
                    trans = ret if ret else '翻译失败'

                    self.write(content)
                    self.write(trans)
                    translated_count += 1
                    print(translated_count, end=' ', flush=True)

                # Pause after each page (presumably to throttle calls to
                # the translation backend -- confirm).
                time.sleep(2)

        Logger().write(self.fileName + '翻译完成,新文档:' + self.new_fullPath)
コード例 #59
0
ファイル: pdf2word.py プロジェクト: gaofeifei/suanfalianxi
def parse(rpath=r'C:\Users\hdp\Desktop\transformationPaper_EN_CN',
          doc_name=r'[36]_Sutskever,_Ilya,_Oriol_Vinyals,_and_Quoc_V_Le_Sequence_to_sequence_learning_with_neural_networks_Advances_in_neural_information_processing_systems_2014_.pdf'):
    """Convert the text of a PDF into paragraphs of a .docx document.

    Every layout object that exposes ``get_text`` is added to the
    module-level ``document`` as a paragraph with the 'List Number' style,
    after replacing non-breaking spaces with regular spaces. The document
    is saved next to the PDF with the ``.pdf`` suffix replaced by
    ``.docx``.

    Compared with the earlier version, the PDF handle is closed on exit
    and the document is saved once after all pages have been processed,
    instead of once per layout object. The .docx is therefore also written
    when the PDF yields no layout objects.

    NOTE(review): ``document`` is not defined in this function; it is
    presumably a python-docx ``Document`` created at module level --
    confirm.

    Args:
        rpath: Directory containing the PDF. Defaults to the previously
            hard-coded directory.
        doc_name: File name of the PDF inside ``rpath``. Defaults to the
            previously hard-coded file name.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    p = rpath + '\\' + doc_name
    print(p)
    docx_path = rpath + '\\' + doc_name.replace('.pdf', '.docx')

    # Open the PDF in binary read mode; ``with`` releases the handle even
    # when extraction fails part-way through.
    with open(p, 'rb') as fn:
        # Link the parser and the document object to each other.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        # An empty string is used because the document has no password.
        doc.initialize("")

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregating device and
        # page interpreter.
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            # Layout container holding the objects parsed from this page.
            layout = device.get_result()
            for out in layout:
                # Only objects that expose get_text() carry text.
                if hasattr(out, "get_text"):
                    # '\xa0' is the non-breaking space (&nbsp;).
                    content = out.get_text().replace(u'\xa0', u' ')
                    document.add_paragraph(content, style='List Number')

    # Save once, after every page has been added.
    document.save(docx_path)
コード例 #60
0
def parse(_path, out_path=r'C:\Users\qinxd\Desktop\test.txt'):
    """Extract horizontal text boxes from a PDF, print and append them to a file.

    For every ``LTTextBoxHorizontal`` on every page, the text is printed
    with a ``"results: "`` prefix and appended (followed by a newline) to
    ``out_path``.

    Compared with the earlier version, the PDF handle is closed on exit,
    the output file is opened once instead of once per text box, and the
    redundant ``close()`` inside the ``with`` block is gone. As a
    consequence the output file is created even if the PDF contains no
    horizontal text boxes.

    NOTE(review): the output file is opened with the platform default
    encoding, as before; consider an explicit encoding if the PDF text may
    contain characters the default cannot represent.

    Args:
        _path: Path of the local PDF file to read.
        out_path: Text file opened in append mode. Defaults to the
            previously hard-coded path.

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that it is
            not extractable.
    """
    # Open the local PDF in binary read mode; ``with`` releases the handle
    # even when extraction fails part-way through.
    with open(_path, 'rb') as fp:
        # Link the parser and the document object to each other.
        praser_pdf = PDFParser(fp)
        doc = PDFDocument()
        praser_pdf.set_document(doc)
        doc.set_parser(praser_pdf)

        # No password is supplied; pass one here for protected files.
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout parameters, aggregating device and
        # page interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        with open(out_path, 'a') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # Layout container holding the objects parsed from this page.
                layout = device.get_result()

                for out in layout:
                    # Only horizontal text boxes are exported.
                    if isinstance(out, LTTextBoxHorizontal):
                        results = out.get_text()
                        print("results: " + results)
                        f.write(results + '\n')