Ejemplo n.º 1
0
    def _cached_pages(self, target_page=-1):
        """
        Get a page or all pages from page generator, caching results.
        This is necessary because PDFMiner searches recursively for pages,
        so we won't know how many there are until we parse the whole document,
        which we don't want to do until we need to.
        """
        try:
            # pdfminer < 20131022
            self._pages_iter = self._pages_iter or self.doc.get_pages()
        except AttributeError:
            # pdfminer >= 20131022
            self._pages_iter = self._pages_iter or \
                PDFPage.create_pages(self.doc)

        if target_page >= 0:
            while len(self._pages) <= target_page:
                next_page = next(self._pages_iter)
                if not next_page:
                    return None
                next_page.page_number = 0
                self._pages += [next_page]
            try:
                return self._pages[target_page]
            except IndexError:
                return None
        self._pages += list(self._pages_iter)
        return self._pages
Ejemplo n.º 2
0
def convert_pdf_to_txt(path):
    """
    Extract the text of the PDF at ``path`` and write it next to the PDF.

    The text is produced with pdfminer's ``TextConverter``; runs of two
    newlines are collapsed into one before writing. The output file has the
    same name as the input with its extension replaced by ``.txt`` and is
    written as UTF-8, ignoring unencodable characters.

    :param path: filesystem path of the PDF to convert.
    :return: ``None``. A failure to write the output file is reported by
        printing ``"Writing Failed!"`` rather than raising; errors while
        reading or parsing the PDF propagate to the caller.
    """
    # Local import keeps this block self-contained; os is standard library.
    import os

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The context manager closes the PDF even if parsing a page fails.
        with open(path, "rb") as fp:
            pages = PDFPage.get_pages(
                fp,
                set(),            # empty set: no page-number filter
                maxpages=0,       # 0: no page limit
                password="",
                caching=True,
                check_extractable=True,
            )
            for page in pages:
                interpreter.process_page(page)
        strings = retstr.getvalue().replace("\n\n", "\n")
    finally:
        device.close()
        retstr.close()

    # splitext handles extensions of any length (or none), unlike slicing
    # off the last three characters; for "*.pdf" the result is unchanged.
    out_path = os.path.splitext(path)[0] + ".txt"
    try:
        with open(out_path, "w", encoding="utf-8", errors="ignore") as f:
            f.write(strings)
    except OSError:
        print("Writing Failed!")
Ejemplo n.º 3
0
    def _cached_pages(self, target_page=-1):
        """
        Get a page or all pages from page generator, caching results.
        This is necessary because PDFMiner searches recursively for pages,
        so we won't know how many there are until we parse the whole document,
        which we don't want to do until we need to.
        """
        try:
            # pdfminer < 20131022
            self._pages_iter = self._pages_iter or self.doc.get_pages()
        except AttributeError:
            # pdfminer >= 20131022
            self._pages_iter = self._pages_iter or \
                PDFPage.create_pages(self.doc)

        if target_page >= 0:
            while len(self._pages) <= target_page:
                next_page = next(self._pages_iter)
                if not next_page:
                    return None
                next_page.page_number = 0
                self._pages += [next_page]
            try:
                return self._pages[target_page]
            except IndexError:
                return None
        self._pages += list(self._pages_iter)
        return self._pages
Ejemplo n.º 4
0
def extract_layout_by_page(pdf_path):
    """
    Extracts the layouts of the pages of a PDF document
    specified by pdf_path.

    Uses the PDFminer library. See its documentation for
    details of the objects returned.

    See:
    - https://euske.github.io/pdfminer/programming.html
    - http://denis.papathanasiou.org/posts/2010.08.04.post.html

    :param pdf_path: filesystem path of the PDF to analyse.
    :return: list with one ``device.get_result()`` layout per page, in
        document order.
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    laparams = LAParams()
    layouts = []

    # The context manager guarantees the file is closed on every path,
    # including the "extraction not allowed" error below.
    # NOTE(review): the file is closed before the layouts are returned; if
    # callers rely on lazily-resolved references inside the returned objects
    # (e.g. image streams), confirm they are still readable afterwards.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The aggregator exposes the layout of the page just processed.
            layouts.append(device.get_result())

    return layouts
Ejemplo n.º 5
0
    def __init__(self, file, password='', just_text=1, check_extractable=True, char_margin=1.0, line_margin=0.1, word_margin=0.1):
        """
        Parse ``file`` with pdfminer and append each page's result to self.

        :param file: object handed directly to ``PDFParser``.
        :param password: document password; a ``str`` is encoded to bytes
            when running on Python 3.
        :param just_text: when truthy, ``self._cleanup()`` is called at the
            end.
        :param check_extractable: when true, pages are processed only if
            ``self.doc.is_extractable`` is truthy.
        :param char_margin: forwarded to ``LAParams``.
        :param line_margin: forwarded to ``LAParams``.
        :param word_margin: forwarded to ``LAParams``.
        """
        must_encode = sys.version_info.major > 2 and isinstance(password, str)
        if must_encode:
            password = password.encode()

        self.parser = PDFParser(file)
        self.laparams = LAParams(
            char_margin=char_margin,
            line_margin=line_margin,
            word_margin=word_margin,
        )

        # The two branches use different PDFDocument construction APIs.
        if not PYTHON_3:
            self.doc = PDFDocument(self.parser, password)
        else:
            self.doc = PDFDocument()
            self.parser.set_document(self.doc)
            self.doc.set_parser(self.parser)
            self.doc.initialize(password)

        may_extract = (not check_extractable) or self.doc.is_extractable
        if may_extract:
            self.resmgr = PDFResourceManager()
            self.device = TextConverter(self.resmgr, outfp=BytesIO(), laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.resmgr, self.device)

            page_generator = (
                self.doc.get_pages() if PYTHON_3
                else PDFPage.create_pages(self.doc)
            )
            for pdf_page in page_generator:
                page_result = self.interpreter.process_page(pdf_page)
                # presumably this class is list-like -- verify against the
                # class definition
                self.append(page_result)
            # Metadata is only recorded when pages were processed.
            self.metadata = self.doc.info

        if just_text:
            self._cleanup()
Ejemplo n.º 6
0
Archivo: pdf10.py Proyecto: GodWord/pdf
def convert_pdf_2_text(path):
    """
    Return the text of the PDF at ``path`` as produced by ``TextConverter``.

    :param path: filesystem path of the PDF, opened in binary mode.
    :return: the contents of the converter's output buffer after every page
        has been processed.
    """
    resource_manager = PDFResourceManager()
    text_buffer = StringIO()
    converter = TextConverter(
        resource_manager,
        text_buffer,
        codec='utf-8',
        laparams=LAParams(),
    )
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(path, 'rb') as pdf_file:
        # An empty page-number set is passed, i.e. no page filter.
        for pdf_page in PDFPage.get_pages(pdf_file, set()):
            page_interpreter.process_page(pdf_page)
        # Read the buffer before it is closed below.
        extracted = text_buffer.getvalue()

    converter.close()
    text_buffer.close()
    return extracted
Ejemplo n.º 7
0
 def __init__(self, file, password='', just_text=1, check_extractable=True):
     """
     Parse ``file`` with pdfminer and append each page's result to self.

     :param file: object handed directly to ``PDFParser``.
     :param password: document password passed to ``PDFDocument``.
     :param just_text: when truthy, ``self._cleanup()`` is called at the end.
     :param check_extractable: when true, pages are processed only if
         ``self.doc.is_extractable`` is truthy.
     """
     self.parser = PDFParser(file)
     self.doc = PDFDocument(self.parser, password)

     may_extract = (not check_extractable) or self.doc.is_extractable
     if may_extract:
         self.resmgr = PDFResourceManager()
         self.device = TextConverter(self.resmgr, outfp=StringIO())
         self.interpreter = PDFPageInterpreter(self.resmgr, self.device)
         for pdf_page in PDFPage.create_pages(self.doc):
             page_result = self.interpreter.process_page(pdf_page)
             # presumably this class is list-like -- verify against the
             # class definition
             self.append(page_result)
         # Metadata is only recorded when pages were processed.
         self.metadata = self.doc.info

     if just_text:
         self._cleanup()
Ejemplo n.º 8
0
    def _initialize(self):
        """
        Build and return the per-page lower-cased text cache.

        Resets ``self.cachedText`` to an empty list, then converts every page
        of ``self.document`` to text with a fresh ``TextConverter`` and
        appends the lower-cased result.

        :return: ``self.cachedText``, one string per page in document order.
        """
        self.cachedText = []
        for pdf_page in PDFPage.create_pages(self.document):
            # A new buffer and converter per page keep each page's text
            # separate.
            page_buffer = StringIO()
            converter = TextConverter(
                self.rsrcmgr, page_buffer, codec='ascii', laparams=LAParams())
            PDFPageInterpreter(self.rsrcmgr, converter).process_page(pdf_page)
            page_text = page_buffer.getvalue()
            self.cachedText.append(page_text.lower())

        return self.cachedText
Ejemplo n.º 9
0
def extract_jpegs_from_pdf(fstream):
    """
    Scan a PDF for embedded JPEG images and yield the data of each one found.

    This is a generator: every ``LTImage`` found directly inside an
    ``LTFigure`` of a page layout is inspected, and its data is yielded when
    it starts with the bytes ``FF D8 FF E0``. If nothing matches, the
    generator simply yields nothing.

    There is no 100% guarantee for this code, yet it seems to work fine with
    most scanner-produced images around. More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or
    other image types from PDFs; the original author lacked suitable test
    data for them.

    :param fstream: Readable binary stream of the PDF
    :return: generator of bytes objects, each holding the whole contents of
        one embedded JPEG image.
    """
    parser = PDFParser(fstream)
    # Two PDFDocument construction APIs are tried: the no-argument form
    # first, then the parser-argument form if that raises TypeError.
    try:
        document = PDFDocument()
        parser.set_document(document)
        document.set_parser(parser)
        document.initialize('')
    except TypeError:
        document = PDFDocument(parser)

    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager)
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    try:
        page_iter = document.get_pages()
    except Exception:
        page_iter = PDFPage.create_pages(document)

    for pdf_page in page_iter:
        page_interpreter.process_page(pdf_page)
        for figure in aggregator.result:
            if not isinstance(figure, LTFigure):
                continue
            for image in figure:
                if not isinstance(image, LTImage):
                    continue
                # Found one!
                try:
                    payload = image.stream.get_data()
                except Exception:
                    # Failed to decode (seems to happen nearly always -
                    # there's probably a bug in PDFMiner), oh well...
                    payload = image.stream.get_rawdata()
                if payload is None:
                    continue
                # NOTE(review): only this exact 4-byte prefix is accepted;
                # JPEGs starting with a different fourth byte are skipped.
                if payload.startswith(b'\xff\xd8\xff\xe0'):
                    yield payload
Ejemplo n.º 10
0
def getPdffileBookmark2(filename, bookmark_file_savepath):
    """
    Write the outline (bookmarks) of a PDF to a UTF-16 text file.

    Each outline entry becomes one line: ``level - 1`` tab characters, the
    stripped title, a tab, and the 1-based page number, terminated by
    ``\\r\\n``. When the target page cannot be determined the page number is
    written as ``xx``.

    :param filename: path of the PDF to read.
    :param bookmark_file_savepath: path of the text file to write.
    :return: ``None``.
    """
    lines = []
    # The context manager closes the PDF even if outline parsing fails; all
    # lazy pdfminer iteration happens while the file is still open.
    with open(filename, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        # Map each page's object id to its zero-based position.
        pages = {
            page.pageid: pageno
            for pageno, page in enumerate(PDFPage.create_pages(document))
        }

        def resolve_dest(dest):
            # Named destinations (string or literal) are looked up in the
            # document first; a dictionary result holds the real
            # destination under its 'D' key.
            if isinstance(dest, str):
                dest = resolve1(document.get_dest(dest))
            elif isinstance(dest, PSLiteral):
                dest = resolve1(document.get_dest(dest.name))
            if isinstance(dest, dict):
                dest = dest['D']
            return dest

        for (level, title, dest, a, _se) in document.get_outlines():
            pageno = None
            try:
                if dest:
                    pageno = pages[resolve_dest(dest)[0].objid]
                elif a:
                    # resolve1 accepts both an object reference and an
                    # already-resolved dictionary.
                    action = resolve1(a)
                    if isinstance(action, dict):
                        subtype = action.get('S')
                        # The literal's repr differs between pdfminer
                        # flavours ("/GoTo" vs "/'GoTo'"); accept both.
                        is_goto = repr(subtype) in ('/GoTo', "/'GoTo'")
                        if subtype and is_goto and action.get('D'):
                            pageno = pages[resolve_dest(action['D'])[0].objid]
            except Exception:
                # Best effort: one unresolvable bookmark must not abort the
                # export; it is written with the 'xx' placeholder instead.
                pageno = None

            page_label = 'xx' if pageno is None else str(pageno + 1)
            lines.append(
                '\t' * (level - 1) + title.strip() + '\t' + page_label + '\r\n')

    with codecs.open(bookmark_file_savepath, 'w', encoding='utf-16') as bookmark_file:
        bookmark_file.write(''.join(lines))
Ejemplo n.º 11
0
#caching = False不缓存
            rsrcmgr = PDFResourceManager(caching=False)
            # 创建一个PDF设备对象
            laparams = LAParams()
            # 创建一个PDF页面聚合对象
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            #创建一个PDF解析器对象
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            #处理文档当中的每个页面

            # doc.get_pages() 获取page列表
            #for i, page in enumerate(document.get_pages()):
            #PDFPage.create_pages(document) 获取page列表的另一种方式
            replace = re.compile(r'\s+')
            # 循环遍历列表,每次处理一个page的内容
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                # 接受该页面的LTPage对象
                layout = device.get_result()
                # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象
                # 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等
                for x in layout:
                    #如果x是水平文本对象的话
                    if (isinstance(x, LTTextBoxHorizontal)):
                        text = re.sub(replace, '', x.get_text())
                        if len(text) != 0:
                            w_file.write(text.encode('utf-8'))
                        #print text
                print("success")
        except:
            print("skip null file!")