Beispiel #1
0
def parsePDFtoTXT(pdf_path):
    """Append the text of every page of a PDF to ``write.txt``.

    For each page the string form of the page layout is written first,
    followed by the text of every ``LTTextBoxHorizontal`` found on that page.
    The document object and each page layout are also printed to stdout.

    :param pdf_path: path of the PDF file to read.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager guarantees the PDF handle is released even when
    # parsing or extraction fails half-way through.
    with open(pdf_path, 'rb') as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        print(document)
        parser.set_document(document)

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Open the output once instead of once per page.
        with open('write.txt', 'a', encoding='utf-8') as f:
            # NOTE(review): get_pages() is combined with PDFDocument(parser);
            # confirm the installed pdfminer release provides both.
            for page in document.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                print(layout)
                pieces = [str(layout)]
                pieces.extend(
                    x.get_text() for x in layout
                    if isinstance(x, LTTextBoxHorizontal)
                )
                f.write(''.join(pieces))
Beispiel #2
0
def parse(pdf_path):
    """Print ``pdf_path`` and return ``None``.

    NOTE(review): the unconditional ``return`` directly after the print makes
    the whole extraction routine below unreachable. It looks like a debugging
    stub; confirm with the author before removing it.

    The unreachable part would count images, curves, figures and horizontal
    text boxes on every page, append the text of each horizontal text box to
    ``test.doc`` and finally print the counters.
    """
    print(pdf_path)
    return

    # Open the PDF in binary mode and attach a parser to it.
    pdf_file = open(pdf_path, 'rb')
    pdf_parser = PDFParser(pdf_file)
    # Create the document and link parser and document to each other.
    document = PDFDocument()
    pdf_parser.set_document(document)
    document.set_parser(pdf_parser)

    # Initialise without a password.
    document.initialize()

    # Refuse to continue when the document does not allow text extraction.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager, layout-aware device and interpreter.
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    # Counters for pages, images, curves, figures and horizontal text boxes.
    num_page = 0
    num_image = 0
    num_curve = 0
    num_figure = 0
    num_TextBoxHorizontal = 0

    # Handle one page per iteration.
    for page in document.get_pages():
        num_page += 1
        page_interpreter.process_page(page)
        # The aggregator hands back the layout of the page just processed.
        for item in aggregator.get_result():
            if isinstance(item, LTImage):
                num_image += 1
            if isinstance(item, LTCurve):
                num_curve += 1
            if isinstance(item, LTFigure):
                num_figure += 1
            if not isinstance(item, LTTextBoxHorizontal):
                continue
            num_TextBoxHorizontal += 1
            # Append the text of this box to the output document.
            with open(r'test.doc', 'a', encoding='utf-8') as out_file:
                out_file.write(item.get_text())
                out_file.write('\n')

    print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
          '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_TextBoxHorizontal)
Beispiel #3
0
def pdf_to_csv(filename):
    """Return the characters of a PDF as semicolon-separated text lines.

    Every page is wrapped in ``START PAGE <n>`` / ``END PAGE <n>`` marker
    lines. Inside a page, characters are grouped by the integer part of the
    (negated) upper y coordinate of their bounding box, and within such a
    group they are ordered by the right x coordinate and joined with ``;``.

    NOTE(review): this relies on ``cStringIO`` and on joining encoded
    strings, i.e. it is written for Python 2 -- confirm before porting.

    :param filename: path of the PDF file to read.
    :return: the generated text as a single string.
    """
    from collections import defaultdict
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

    class CsvConverter(TextConverter):
        """TextConverter that emits one ';'-joined line per character row."""

        def end_page(self, i):
            # Row key -> {x coordinate -> encoded character}.
            lines = defaultdict(dict)
            for child in self.cur_item._objs:
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    # Negating y makes ascending key order run top-to-bottom.
                    lines[int(-y)][x] = child._text.encode(self.codec)

            for y in sorted(lines):
                line = lines[y]
                self.outfp.write(";".join(line[x] for x in sorted(line)))
                self.outfp.write("\n")

    # The remainder is a remix of the convert() function in the
    # pdfminer/tools/pdf2text module.
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrc, device)

    try:
        # The context manager closes the PDF even if parsing fails.
        with open(filename, 'rb') as fp:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            for i, page in enumerate(doc.get_pages()):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                outfp.write("END PAGE %d\n" % i)
    finally:
        device.close()

    return outfp.getvalue()
Beispiel #4
0
def extract_first_jpeg_in_pdf(fstream):
    """
    Reads a given PDF file and scans for the first valid embedded JPEG image.
    Returns either None (if none found) or the raw bytes of the image.
    There is no 100% guarantee for this code, yet it seems to work fine with most
    scanner-produced images around.
    More testing might be needed though.

    Note that in principle there is no serious problem extracting PNGs or other image types from PDFs,
    however at the moment I do not have enough test data to try this, and the one I have seems to be unsuitable
    for PDFMiner.

    Only images whose data starts with the bytes ``FF D8 FF E0`` are accepted;
    any other image is skipped and the scan continues.

    :param fstream: Readable binary stream of the PDF
    :return: bytes containing the whole contents of the JPEG image, or None if extraction failed.
    """
    parser = PDFParser(fstream)
    if PY2:
        document = PDFDocument(parser)
    else:
        document = PDFDocument()
        parser.set_document(document)
        document.set_parser(parser)
        document.initialize('')
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.create_pages(document) if PY2 else document.get_pages()
    for page in pages:
        interpreter.process_page(page)
        layout = device.get_result()
        for el in layout:
            if not isinstance(el, LTFigure):
                continue
            for im in el:
                if not isinstance(im, LTImage):
                    continue
                try:
                    imdata = im.stream.get_data()
                except Exception:
                    # Decoding fails for most images (probably a PDFMiner
                    # bug); fall back to the undecoded stream. Only
                    # ``Exception`` is caught so that KeyboardInterrupt and
                    # SystemExit still propagate.
                    imdata = im.stream.get_rawdata()
                if imdata is not None and imdata.startswith(
                        b'\xff\xd8\xff\xe0'):
                    return imdata

    return None
Beispiel #5
0
    def __init__(self, *args, **kwargs):
        """Initialise the object and cache the text extracted from ``self.doc``.

        ``self.parsed_text`` starts out as empty bytes and is replaced by the
        output of pdfminer's ``TextConverter`` once every page has been
        processed. It stays empty when pdfminer cannot be imported (a warning
        is logged) or when building the document raises ``PDFSyntaxError``.
        """
        super(AccountRIB, self).__init__(*args, **kwargs)

        self.parsed_text = b''

        try:
            # Prefer the split pdfdocument/pdfpage modules; otherwise fall
            # back to the PDFDocument exported by pdfminer.pdfparser.
            try:
                from pdfminer.pdfdocument import PDFDocument
                from pdfminer.pdfpage import PDFPage
            except ImportError:
                from pdfminer.pdfparser import PDFDocument
                newapi = False
            else:
                newapi = True
            from pdfminer.pdfparser import PDFParser, PDFSyntaxError
            from pdfminer.converter import TextConverter
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        except ImportError:
            self.logger.warning('Please install python-pdfminer to get IBANs')
            return

        pdf_parser = PDFParser(BytesIO(self.doc))
        try:
            if not newapi:
                document = PDFDocument()
                pdf_parser.set_document(document)
                document.set_parser(pdf_parser)
            else:
                document = PDFDocument(pdf_parser)
        except PDFSyntaxError:
            # Not a parsable PDF: keep the empty default.
            return

        resources = PDFResourceManager()
        sink = BytesIO()
        converter = TextConverter(resources, sink)
        page_interpreter = PDFPageInterpreter(resources, converter)
        if not newapi:
            document.initialize()
            page_iter = document.get_pages()
        else:
            page_iter = PDFPage.create_pages(document)
        for page in page_iter:
            page_interpreter.process_page(page)

        self.parsed_text = sink.getvalue()
Beispiel #6
0
    def __init__(self, *args, **kwargs):
        """Initialise the object and cache the text extracted from ``self.doc``.

        ``self.parsed_text`` is always bytes: it defaults to ``b''`` and is
        replaced by the content of the ``BytesIO`` buffer that pdfminer's
        ``TextConverter`` writes to. It keeps the empty default when pdfminer
        cannot be imported (a warning is logged) or when building the
        document raises ``PDFSyntaxError``.
        """
        super(AccountRIB, self).__init__(*args, **kwargs)

        # Bytes, not str: the success path assigns BytesIO.getvalue(), so the
        # default must have the same type for callers on Python 3.
        self.parsed_text = b''

        try:
            try:
                from pdfminer.pdfdocument import PDFDocument
                from pdfminer.pdfpage import PDFPage
                newapi = True
            except ImportError:
                # Fall back to the PDFDocument exported by pdfminer.pdfparser.
                from pdfminer.pdfparser import PDFDocument
                newapi = False
            from pdfminer.pdfparser import PDFParser, PDFSyntaxError
            from pdfminer.converter import TextConverter
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        except ImportError:
            self.logger.warning('Please install python-pdfminer to get IBANs')
        else:
            parser = PDFParser(BytesIO(self.doc))
            try:
                if newapi:
                    doc = PDFDocument(parser)
                else:
                    doc = PDFDocument()
                    parser.set_document(doc)
                    doc.set_parser(parser)
            except PDFSyntaxError:
                # Not a parsable PDF: keep the empty default.
                return

            rsrcmgr = PDFResourceManager()
            out = BytesIO()
            device = TextConverter(rsrcmgr, out)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            if newapi:
                pages = PDFPage.create_pages(doc)
            else:
                doc.initialize()
                pages = doc.get_pages()
            for page in pages:
                interpreter.process_page(page)

            self.parsed_text = out.getvalue()
Beispiel #7
0
def extract_text(data):
    """Return the text pdfminer's ``TextConverter`` produces for a PDF.

    :param data: content of the PDF, as accepted by ``BytesIO``.
    :return: the converter output (collected in a ``BytesIO`` on Python 2,
        in a ``StringIO`` otherwise), or ``None`` when building the document
        raises ``PDFSyntaxError``.
    :raises ImportError: when pdfminer cannot be imported.
    """
    try:
        # Prefer the split pdfdocument/pdfpage modules; otherwise fall back
        # to the PDFDocument exported by pdfminer.pdfparser.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            newapi = False
        else:
            newapi = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python3-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if not newapi:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
        else:
            document = PDFDocument(pdf_parser)
    except PDFSyntaxError:
        return

    resources = PDFResourceManager()
    # The sink type depends on the interpreter's major version.
    sink = BytesIO() if sys.version_info.major == 2 else StringIO()
    converter = TextConverter(resources, sink)
    page_interpreter = PDFPageInterpreter(resources, converter)
    if not newapi:
        document.initialize()
        page_iter = document.get_pages()
    else:
        page_iter = PDFPage.create_pages(document)
    for page in page_iter:
        page_interpreter.process_page(page)

    return sink.getvalue()
Beispiel #8
0
def parse(pdf_path='G:/机器学习1/gg.pdf', output_path='test.txt'):
    """Print the text of a PDF and append it to a text file.

    Every layout object that offers ``get_text()`` is printed to stdout and
    appended, followed by a newline, to ``output_path``.

    :param pdf_path: PDF file to read; defaults to the path that used to be
        hard-coded in this function.
    :param output_path: text file to append to (written as UTF-8).
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Binary mode is required by the parser; the context manager makes sure
    # the handle is closed even when extraction fails.
    with open(pdf_path, 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        # Link the parser and the document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)

        # No password: initialise with an empty string.
        doc.initialize("")
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        laparams = LAParams()
        # The aggregator collects the layout objects of each page.
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        # Open the output once, with an explicit encoding so that text which
        # the platform default codec cannot represent does not abort the run.
        with open(output_path, 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The layout holds every object parsed from the page.
                layout = device.get_result()
                for out in layout:
                    # Only objects exposing get_text() carry text.
                    if hasattr(out, "get_text"):
                        text = out.get_text()
                        print(text)
                        f.write(text + '\n')
Beispiel #9
0
def parse(InputPath, OutputPath):
    """Write the text of all horizontal text boxes of a PDF to a file.

    The text of every ``LTTextBoxHorizontal`` is collected, each followed by
    a newline, and written to ``OutputPath`` as encoded bytes.

    :param InputPath: path of the PDF file to read.
    :param OutputPath: path of the file to (over)write.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # The context manager releases the PDF handle even when parsing fails.
    with open(InputPath, 'rb') as fn:
        parser = PDFParser(fn)
        doc = PDFDocument()
        # Link the parser and the document to each other.
        parser.set_document(doc)
        doc.set_parser(parser)
        # "No password" is the empty string; the previous single-space
        # string was passed as a one-character password.
        doc.initialize("")
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        laparams = LAParams()
        # The aggregator collects the layout objects of each page.
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)

        pieces = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            # The layout holds every object parsed from the page.
            layout = device.get_result()
            for out in layout:
                if isinstance(out, LTTextBoxHorizontal):
                    pieces.append(out.get_text() + '\n')

    # Write once after all pages are processed instead of reopening (and
    # never closing) the output file for every page.
    with open(OutputPath, 'wb') as f:
        f.write(''.join(pieces).encode())
0
def extract_text(data):
    """Return the bytes pdfminer's ``TextConverter`` produces for a PDF.

    :param data: content of the PDF, as accepted by ``BytesIO``.
    :return: the content of the ``BytesIO`` buffer the converter wrote to,
        or ``None`` when building the document raises ``PDFSyntaxError``.
    :raises ImportError: when pdfminer cannot be imported.
    """
    try:
        # Prefer the split pdfdocument/pdfpage modules; otherwise fall back
        # to the PDFDocument exported by pdfminer.pdfparser.
        try:
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfpage import PDFPage
        except ImportError:
            from pdfminer.pdfparser import PDFDocument
            newapi = False
        else:
            newapi = True
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
        from pdfminer.converter import TextConverter
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    except ImportError:
        raise ImportError('Please install python-pdfminer to parse PDF')

    pdf_parser = PDFParser(BytesIO(data))
    try:
        if not newapi:
            document = PDFDocument()
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)
        else:
            document = PDFDocument(pdf_parser)
    except PDFSyntaxError:
        return

    resources = PDFResourceManager()
    sink = BytesIO()
    converter = TextConverter(resources, sink)
    page_interpreter = PDFPageInterpreter(resources, converter)
    if not newapi:
        document.initialize()
        page_iter = document.get_pages()
    else:
        page_iter = PDFPage.create_pages(document)
    for page in page_iter:
        page_interpreter.process_page(page)

    return sink.getvalue()
Beispiel #11
0
def pdf_to_string(pdf_file):
    """Print every layout object found on every page of a PDF.

    NOTE(review): despite its name the function returns ``None``; it only
    prints the layout objects.

    :param pdf_file: path of the PDF file to read.
    """
    # Margin configuration for the layout analysis.
    laparams = LAParams()
    laparams.line_margin = 0.3
    laparams.word_margin = 0.3

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even when parsing fails.
    with open(pdf_file, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                print(lt_obj)
Beispiel #12
0
def parse_pdf(path, output_path):
    """Copy the text of a PDF's text boxes and text lines into a UTF-8 file.

    :param path: path of the PDF file to read.
    :param output_path: path of the text file to (over)write.
    """
    chunks = []
    with open(path, 'rb') as pdf_file:
        pdf_parser = PDFParser(pdf_file)
        document = PDFDocument()
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        resources = PDFResourceManager()
        layout_params = LAParams(all_texts=True,
                                 boxes_flow=2.0,
                                 heuristic_word_margin=True)
        aggregator = PDFPageAggregator(resources, laparams=layout_params)
        page_interpreter = PDFPageInterpreter(resources, aggregator)

        for page in document.get_pages():
            page_interpreter.process_page(page)
            # Keep only the top-level objects that carry text.
            for item in aggregator.get_result():
                if isinstance(item, (LTTextBox, LTTextLine)):
                    chunks.append(item.get_text())

    with open(output_path, "w", encoding="utf-8") as out_file:
        out_file.write(''.join(chunks))
Beispiel #13
0
    def get_PDFLayout(self, p):
        """Return the layout of page ``p`` of the PDF at ``self.path``.

        As a side effect the root logger level is set to ``logging.ERROR``.

        :param p: index into the list of pages of the document.
        :return: the layout produced by the page aggregator, or ``-1`` when
            anything goes wrong after the file was opened.
        :raises OSError: (``IOError`` on Python 2) if the file cannot be
            opened; that happens before the error handling starts.
        """
        fp = open(self.path, 'rb')
        try:
            # NOTE(review): this sets an attribute on the ``logging`` module
            # itself, not on a logger -- presumably
            # ``logging.getLogger(...).propagate`` was intended; confirm.
            logging.propagate = False
            logging.getLogger().setLevel(logging.ERROR)
            parser = PDFParser(fp)
            document = PDFDocument()
            parser.set_document(document)

            document.set_parser(parser)
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            pages = list(document.get_pages())
            interpreter.process_page(pages[p])
            return device.get_result()
        except Exception:
            # Callers rely on the -1 sentinel. ``Exception`` (rather than a
            # bare except) lets KeyboardInterrupt/SystemExit propagate.
            return -1
        finally:
            # Close the file on the error path too.
            fp.close()
Beispiel #14
0
def pdf2csv(fp):
    """Draw the grid spanned by the rectangles/lines of each PDF page.

    For every page the x and y coordinates of all ``LTRect``/``LTLine``
    objects are collected, reduced with ``filterclose`` and printed. A 1-bit
    image of the page size is then created, a rectangle is drawn for every
    pair of neighbouring coordinates that are more than 5 units apart, and
    the image is saved as ``out<pageno>.png`` in the current directory.

    NOTE(review): despite its name the function writes PNG files and
    returns ``None``.

    :param fp: binary file object of the PDF to read.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(doc)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()
        # ``hlines`` holds x coordinates, ``vlines`` holds y coordinates
        # flipped so that the origin is the top-left corner of the image.
        hlines = []
        vlines = []
        for item in layout:
            if not isinstance(item, (LTRect, LTLine)):
                continue
            hlines.append(int(item.x0))
            hlines.append(int(item.x1))
            vlines.append(int(layout.height - item.y0))
            vlines.append(int(layout.height - item.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        print(layout.width, layout.height)

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        # Walk over neighbouring coordinate pairs; gaps of 5 or less are
        # too narrow to be a cell.
        for top, bottom in zip(vlines, vlines[1:]):
            if bottom - top <= 5:
                continue
            for left, right in zip(hlines, hlines[1:]):
                if right - left <= 5:
                    continue
                draw.rectangle([(int(left), int(top)),
                                (int(right), int(bottom))],
                               outline=1)
        del draw
        # Separate name for the output handle: the parameter ``fp`` must
        # not be rebound while the parser still reads from it.
        with open("out%s.png" % pageno, 'wb') as out_file:
            im.save(out_file, "PNG")
Beispiel #15
0
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    When ``miner_layout`` is true the page aggregator is created with
    ``LAParams()``; otherwise it is created without layout parameters and the
    texts are sorted by ``(y0, x0)``.

    When ``LOGGER`` is enabled for ``DEBUGFILES``, one PNG per processing
    stage and page is written to a fresh temporary directory.

    Nothing is yielded when building the document raises ``PDFSyntaxError``.
    ``ImportError`` is raised when pdfminer cannot be imported.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        # Fall back to the PDFDocument exported by pdfminer.pdfparser.
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    # Evaluate the debug switch once: the imports and the output directory
    # below are only set up here, so a later change of the logger level must
    # not send a page into the drawing code without them.
    debug_files = LOGGER.isEnabledFor(DEBUGFILES)
    if debug_files:
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

        def new_canvas(page):
            # White image sized from the page's media box, plus its drawer.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            return img, ImageDraw.Draw(img)

        def random_color():
            # Random colour with every channel in 127..255.
            return (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))

        def save_canvas(img, stage, npage):
            # Store the image of one stage of one page and log its path.
            fpath = '%s/%s-%03d.png' % (path, stage, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

    try:
        for npage, page in enumerate(pages):
            LOGGER.debug('processing page %s', npage)
            interpreter.process_page(page)
            page_layout = device.get_result()

            text_types = (LTTextBox, LTTextLine, LTChar)
            texts = [
                text
                for obj in page_layout._objs if isinstance(obj, text_types)
                for text in lttext_to_multilines(obj, page_layout)
            ]
            LOGGER.debug('found %d text objects', len(texts))
            if debug_files:
                img, draw = new_canvas(page)
                for t in texts:
                    color = random_color()
                    draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                    draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
                save_canvas(img, '1text', npage)

            if not miner_layout:
                texts.sort(key=lambda t: (t.y0, t.x0))

            # TODO filter ltcurves that are not lines?
            # TODO convert rects to 4 lines?
            lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
            LOGGER.debug('found %d lines', len(lines))
            if debug_files:
                img, draw = new_canvas(page)
                for l in lines:
                    color = random_color()
                    draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
                save_canvas(img, '2lines', npage)

            lines = list(uniq_lines(lines))
            LOGGER.debug('found %d unique lines', len(lines))

            rows = build_rows(lines)
            LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
            if debug_files:
                img, draw = new_canvas(page)
                for r in rows:
                    for b in r:
                        color = random_color()
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                save_canvas(img, '3rows', npage)

            textrows = arrange_texts_in_rows(rows, texts)
            LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
            if debug_files:
                img, draw = new_canvas(page)
                for row, trow in zip(rows, textrows):
                    for b, tlines in zip(row, trow):
                        color = random_color()
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                        draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
                save_canvas(img, '4cells', npage)

            yield textrows
    finally:
        # Also runs when the consumer stops iterating early or a page
        # raises, so the device is always closed.
        device.close()
Beispiel #16
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Print the text of every text-bearing layout object of the PDF.

# Margin configuration for the layout analysis.
laparams = LAParams()
laparams.line_margin = 0.3
laparams.word_margin = 0.3

rsrcmgr = PDFResourceManager()
# The keyword is ``laparams``; the misspelt ``laparamns`` raised TypeError.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# The context manager closes the file once all pages have been processed.
with open("Lista_samurai_x.pdf", "rb") as fp:
    parser = PDFParser(fp)
    # A document built from a parser takes the password directly and its
    # pages are obtained through PDFPage.create_pages().
    doc = PDFDocument(parser, password="")

    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for ltobject in layout:
            # Lines, rectangles, figures etc. have no get_text().
            if hasattr(ltobject, "get_text"):
                print(ltobject.get_text())
Beispiel #17
0
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    When ``miner_layout`` is true the page aggregator is created with
    ``LAParams()``; otherwise it is created without layout parameters and the
    texts are sorted by ``(y0, x0)``.

    When ``LOGGER`` is enabled for ``DEBUGFILES``, one PNG per processing
    stage and page is written to a fresh temporary directory.

    Nothing is yielded when building the document raises ``PDFSyntaxError``.
    ``ImportError`` is raised when pdfminer cannot be imported.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        # Fall back to the PDFDocument exported by pdfminer.pdfparser.
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    # Evaluate the debug switch once: the imports and the output directory
    # below are only set up here, so a later change of the logger level must
    # not send a page into the drawing code without them.
    debug_files = LOGGER.isEnabledFor(DEBUGFILES)
    if debug_files:
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

        def new_canvas(page):
            # White image sized from the page's media box, plus its drawer.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            return img, ImageDraw.Draw(img)

        def random_color():
            # Random colour with every channel in 127..255.
            return (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))

        def save_canvas(img, stage, npage):
            # Store the image of one stage of one page and log its path.
            fpath = '%s/%s-%03d.png' % (path, stage, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

    try:
        for npage, page in enumerate(pages):
            LOGGER.debug('processing page %s', npage)
            interpreter.process_page(page)
            page_layout = device.get_result()

            text_types = (LTTextBox, LTTextLine, LTChar)
            texts = [
                text
                for obj in page_layout._objs if isinstance(obj, text_types)
                for text in lttext_to_multilines(obj, page_layout)
            ]
            LOGGER.debug('found %d text objects', len(texts))
            if debug_files:
                img, draw = new_canvas(page)
                for t in texts:
                    color = random_color()
                    draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                    draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
                save_canvas(img, '1text', npage)

            if not miner_layout:
                texts.sort(key=lambda t: (t.y0, t.x0))

            # TODO filter ltcurves that are not lines?
            # TODO convert rects to 4 lines?
            lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
            LOGGER.debug('found %d lines', len(lines))
            if debug_files:
                img, draw = new_canvas(page)
                for l in lines:
                    color = random_color()
                    draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
                save_canvas(img, '2lines', npage)

            lines = list(uniq_lines(lines))
            LOGGER.debug('found %d unique lines', len(lines))

            rows = build_rows(lines)
            LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
            if debug_files:
                img, draw = new_canvas(page)
                for r in rows:
                    for b in r:
                        color = random_color()
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                save_canvas(img, '3rows', npage)

            textrows = arrange_texts_in_rows(rows, texts)
            LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
            if debug_files:
                img, draw = new_canvas(page)
                for row, trow in zip(rows, textrows):
                    for b, tlines in zip(row, trow):
                        color = random_color()
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                        draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
                save_canvas(img, '4cells', npage)

            yield textrows
    finally:
        # Also runs when the consumer stops iterating early or a page
        # raises, so the device is always closed.
        device.close()
Beispiel #18
0
#coding=utf-8
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
# LAParams and PDFPageAggregator are classes living in pdfminer submodules;
# they are not importable as top-level modules.
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator

# Print the text of every text-bearing layout object of the PDF, page by page.
# The file is opened in binary mode and closed automatically when the block
# is left.
with open('/home/zzq/learngit/pdf_document/php.pdf', 'rb') as fp:
    parser = PDFParser(fp)  # parser
    # The PDFDocument class from pdfminer.pdfdocument receives its parser in
    # the constructor; the empty string means "no password".
    doc = PDFDocument(parser, password="")
    resource = PDFResourceManager()  # resource manager
    laparams = LAParams()  # layout-analysis parameters
    # The aggregator has to be given the resource manager it shares with the
    # interpreter, plus the layout parameters created above.
    device = PDFPageAggregator(resource, laparams=laparams)
    # Page interpreter
    interpreter = PDFPageInterpreter(resource, device)

    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            # Skip layout objects that do not expose get_text() instead of
            # failing with AttributeError on them.
            if hasattr(out, 'get_text'):
                print(out.get_text())
Beispiel #19
0
# Attach the parser to the document object.
document.set_parser(parser)

# Initialise the document; no password is supplied.
document.initialize()

# Stop right away when the document does not allow text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
print(True)

# Resource manager for the document's resources.
src = PDFResourceManager()
# Device object, built with default layout parameters.
device = PDFPageAggregator(src, laparams=LAParams())
# Interpreter object that feeds pages into the device.
inter = PDFPageInterpreter(src, device)

pages = document.get_pages()
for page in pages:
    inter.process_page(page)
    layout = device.get_result()
    for x in layout:
        # Only horizontal text boxes are printed; everything else is ignored.
        if not isinstance(x, LTTextBoxHorizontal):
            continue
        print(str(x.get_text()))
Beispiel #20
0
def pdf2csv(fp):
    """Draw the rectangle/line grid of each PDF page into ``out<pageno>.png``.

    For every page, the integer x coordinates and the top-down y coordinates
    (``layout.height - y``) of all ``LTRect`` and ``LTLine`` objects are
    collected, sorted, de-duplicated and passed through ``filterclose``.
    Each pair of neighbouring coordinates that is more than 5 units apart
    spans a grid cell, whose outline is drawn into a 1-bit image the size of
    the page.  The two coordinate lists and the page size are printed for
    every page.

    :param fp: file object of the PDF to analyse, handed to ``PDFParser``
        (presumably opened in binary mode -- verify against the caller).
    :returns: ``None``; the result is the set of PNG files written with a
        relative path, i.e. into the current working directory.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    print(doc)
    # Supply the password for initialization (empty string: no password).
    # NOTE(review): this combines the parser-in-constructor style with
    # initialize()/get_pages(); confirm the installed pdfminer release
    # provides all three.
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object and the interpreter feeding it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()

        xs = []
        ys = []
        for item in layout:
            if not isinstance(item, (LTRect, LTLine)):
                continue
            xs.append(int(item.x0))
            xs.append(int(item.x1))
            # Flip the y axis so that coordinates grow downwards, as in the
            # image that is drawn below.
            ys.append(int(layout.height - item.y0))
            ys.append(int(layout.height - item.y1))
        xs = filterclose(sorted(set(xs)))
        ys = filterclose(sorted(set(ys)))
        print(xs)
        print(ys)
        # Printed as a single tuple so the output does not depend on whether
        # print is a statement or a function.
        print((layout.width, layout.height))

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        # Walk neighbouring coordinate pairs; gaps of 5 units or less are
        # skipped.
        for top, bottom in zip(ys, ys[1:]):
            if not bottom - top > 5:
                continue
            for left, right in zip(xs, xs[1:]):
                if not right - left > 5:
                    continue
                draw.rectangle(
                    [(int(left), int(top)), (int(right), int(bottom))],
                    outline=1)
        del draw

        # A separate name keeps the ``fp`` argument from being rebound, and
        # the context manager closes the file even if saving fails.
        with open("out%s.png" % pageno, 'wb') as png_file:
            im.save(png_file, "PNG")
Beispiel #21
0
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as a byte string and yields table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    :param data: content of the PDF file; it is wrapped in ``BytesIO``.
    :param miner_layout: when true, the page aggregator is created with
        ``LAParams()``; when false it is created without layout parameters
        and the extracted texts are sorted by ``(y0, x0)`` instead.
    :returns: a generator of per-page row lists. Nothing is yielded when
        building the document raises ``PDFSyntaxError``.
    :raises ImportError: if PDFMiner is not installed.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # Two PDFMiner layouts are supported: the one providing the
    # pdfdocument/pdfpage modules, and the one where PDFDocument lives in
    # pdfparser.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar
    from itertools import chain

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    # The finally clause closes the device even when the consumer stops
    # iterating early or a page fails to process.
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        if newapi:
            pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
        else:
            doc.initialize()
            pages = doc.get_pages()

        for npage, page in enumerate(pages):
            interpreter.process_page(page)
            page_layout = device.get_result()

            # Flatten the per-object text lines in one linear pass.
            texts = list(chain.from_iterable(
                lttext_to_multilines(obj, page_layout)
                for obj in page_layout._objs
                if isinstance(obj, (LTTextBox, LTTextLine, LTChar))))
            if not miner_layout:
                texts.sort(key=lambda t: (t.y0, t.x0))

            lines = list(
                uniq_lines(
                    lt_to_coords(obj, page_layout) for obj in page_layout._objs
                    if isinstance(obj, (LTRect, LTLine))))

            boxes = build_rows(lines)
            textrows = arrange_texts_in_rows(boxes, texts)

            yield textrows
    finally:
        device.close()