Example #1
def main(argv):

    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = False
    laparams = LAParams()
    using_optparse = False

    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)

    if using_optparse:
        DEBUG(3, 'using optparse')
        parser.add_argument = parser.add_option
        parser.parse_known_args = parser.parse_args
        parser.disable_interspersed_args()

    parser.add_argument('-d',
                        dest='debuglevel',
                        action='count',
                        default=0,
                        help='Debug (repeat for more verbose debugging)')

    parser.add_argument(
        '-p',
        '--pages',
        dest='pagenos',
        action='store',
        type=str,
        default='',
        help=
        'Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.'
    )

    parser.add_argument('-c',
                        '--codec',
                        dest='codec',
                        action='store',
                        type=str,
                        default='utf-8',
                        help='Specifies the output codec.')

    parser.add_argument(
        '-t',
        '--type',
        dest='outtype',
        action='store',
        type=str,
        default='shape',
        choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag'
    )

    parser.add_argument(
        '-m',
        dest='maxpages',
        action='store',
        type=int,
        default=0,
        help=
        'Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.'
    )

    parser.add_argument(
        '-P',
        '--password',
        dest='password',
        action='store',
        type=str,
        default='',
        help='Provides the user password to access PDF contents.')

    parser.add_argument(
        '-o',
        '--output',
        dest='outfile',
        action='store',
        type=str,
        default=None,
        help=
        'Specifies the output file name. By default, it prints the extracted contents to stdout in text format.'
    )

    parser.add_argument(
        '-C',
        '--no-caching',
        dest='caching',
        action='store_false',
        default=True,
        help=
        'Suppress object caching. This will reduce the memory consumption but also slows down the process.'
    )

    parser.add_argument('-n',
                        '--no-layout',
                        dest='layout',
                        action='store_false',
                        default=True,
                        help='Suppress layout analysis.')

    parser.add_argument('--show-pageno',
                        dest='show_pageno',
                        action='store_true',
                        default=False,
                        help='Show page numbers.')

    parser.add_argument(
        '-A',
        '--analyze-all',
        dest='all_texts',
        action='store_true',
        default=False,
        help=
        'Forces to perform layout analysis for all the text strings, including text contained in figures.'
    )

    parser.add_argument('-V',
                        '--detect-vertical',
                        dest='detect_vertical',
                        action='store_true',
                        default=False,
                        help='Allows vertical writing detection.')

    parser.add_argument(
        '-M',
        dest='char_margin',
        action='store',
        type=float,
        default=2.0,
        help=
        'Two text chunks whose distance is closer than the char_margin (shown as M) are considered continuous and get grouped into one.'
    )

    parser.add_argument(
        '-L',
        dest='line_margin',
        action='store',
        type=float,
        default=0.5,
        help=
        'Two lines whose distance is closer than the line_margin (L) are grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.'
    )

    parser.add_argument(
        '-W',
        dest='word_margin',
        action='store',
        type=float,
        default=0.1,
        help=
        'Blank characters (spaces) may be inserted when the distance between two words is greater than the word_margin (W), since a blank between words might not be represented as a space, but only indicated by the positioning of each word.'
    )

    parser.add_argument(
        '-F',
        dest='boxes_flow',
        action='store',
        type=float,
        default=0.5,
        help=
        'Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).'
    )

    parser.add_argument(
        '-Y',
        '--layout-mode',
        dest='layoutmode',
        action='store',
        type=str,
        default='normal',
        choices=['exact', 'normal', 'loose'],
        help=
        'Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.'
    )

    parser.add_argument('-O',
                        '--image-writer',
                        dest='imagewriter',
                        action='store',
                        type=str,
                        default=None,
                        help='Directory to write extracted images to.')

    parser.add_argument('-R',
                        '--rotation',
                        dest='rotation',
                        action='store',
                        type=int,
                        default=0,
                        help='Rotate pages by this many degrees.')

    parser.add_argument('-S',
                        '--strip-control',
                        dest='stripcontrol',
                        action='store_true',
                        default=False,
                        help='Strip control characters (XML output only).')

    parser.add_argument(
        '-s',
        dest='scale',
        action='store',
        type=float,
        default=1,
        help='Specifies the output scale. Can be used in HTML format only.')

    parser.add_argument(
        '--draw-lines',
        dest='draw_lines',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--draw-boxes',
        dest='draw_boxes',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--draw-blocks',
        dest='draw_blocks',
        action='store_true',
        help=
        "Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output."
    )

    parser.add_argument(
        '--shear-limit',
        dest='shear_limit',
        action='store',
        default=0.1,
        type=float,
        help=
        "If the text is sheared above this limit, reject it. Valid only for the `shape' output."
    )

    parser.add_argument(
        '--rotation-limit',
        dest='rotation_limit',
        action='store',
        default=2,
        type=float,
        help=
        "If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output."
    )

    parser.add_argument(
        '--line-height-diff',
        dest='line_height_diff',
        action='store',
        type=float,
        default=0.1,
        help=
        'Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).'
    )

    parser.add_argument('--heading-before',
                        dest='heading_before',
                        action='store',
                        type=str,
                        default='',
                        help='String to put before each heading, e.g. <h1>')

    parser.add_argument('--heading-after',
                        dest='heading_after',
                        action='store',
                        type=str,
                        default='',
                        help='String to put after each heading, e.g. </h1>')

    parser.add_argument(
        '--box-separator',
        dest='box_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--block-separator',
        dest='block_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-separator',
        dest='indent_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-string',
        dest='indent_string',
        action='store',
        type=str,
        default=r'\t',
        help=
        r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--indent-limit',
        dest='indent_limit',
        action='store',
        type=float,
        default=3,
        help=
        'If a line is indented by more than this (measured approximately in characters), it will be separated from the previous one by --indent-separator.'
    )

    parser.add_argument(
        '--page-separator',
        dest='page_separator',
        action='store',
        type=str,
        default=r'\n\n',
        help=
        r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.'
    )

    parser.add_argument(
        '--norm-whitespace',
        dest='norm_whitespace',
        action='store_true',
        default=False,
        help=
        'Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).'
    )

    parser.add_argument(
        '--print-stats',
        dest='print_stats',
        action='store_true',
        default=False,
        help=
        'Instead of the text, output some simple statistics about the file.')

    parser.add_argument(
        '--max-blocks',
        dest='max_blocks',
        action='store',
        default=0,
        type=int,
        help=
        'If there are more than this many blocks per page, do not return any text. Use this to discriminate abnormal files (run --print-stats first to find out the number of blocks per "normal" file). 0 means no limit. 50 is maybe a good value.'
    )

    parser.add_argument(
        '--max-textlines',
        dest='max_textlines',
        action='store',
        default=0,
        type=int,
        help=
        'If there are more than this many text lines in any block, do not return any text. Use this to discriminate abnormal files (run --print-stats first to find out the number of text lines per "normal" page). 0 means no limit. 18 is maybe a good value.'
    )

    parser.add_argument(
        '--line-height-method',
        dest='line_height_method',
        action='store',
        type=str,
        default='bbox',
        choices=['bbox', 'mean', 'median'],
        help=
        'Method used to calculate the height of a line (relevant if characters have uneven heights). bbox takes the bounding box (the rectangle encompassing the line), mean takes the arithmetic mean of the heights of all the characters, and median takes their median. Use mean or median if there are outlier characters, e.g. one big character at the beginning of a line.'
    )

    parser.add_argument(dest='pdffile',
                        help='List of PDF files to go through',
                        default=None,
                        nargs='+')

    args, rest = parser.parse_known_args()

    global debuglevel
    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))

    DEBUG(3, 'optparse:', using_optparse)

    if args.pagenos:
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno
    if not args.layout:
        laparams = None
    if laparams and args.all_texts:
        laparams.all_texts = True
    if laparams and args.detect_vertical:
        laparams.detect_vertical = True
    if laparams:
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode

    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)

    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)

    args.page_separator = unescape_string(args.page_separator)

    global options
    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    if outtype == 'shape':
        device = ShapeTextConverter(rsrcmgr,
                                    outfp,
                                    codec=codec,
                                    laparams=laparams,
                                    showpageno=showpageno,
                                    imagewriter=imagewriter)
    elif outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in options.pdffile:
        DEBUG(2, 'processing', fname)
        fp = open(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()

    outfp.close()
    DEBUG(2, 'finished.')

    return
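
The unescape_string() helper used above for the separator options is not shown in this example; a minimal sketch, assuming it expands only the literal \n and \t sequences, exactly as the option help texts promise:

def unescape_string(s):
    # expand only \n and \t; other escape sequences are not recognized,
    # matching the behaviour documented in the --*-separator help texts
    return s.replace(r'\n', '\n').replace(r'\t', '\t')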
Example #2
# pip install pdfminer
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter, HTMLConverter, XMLConverter
from pdfminer.layout import LAParams
import io

pdf_path = r'C:\Users\somepath\filename.pdf'  # path to your pdf file

pdf = open(pdf_path, 'rb')
mem = io.StringIO()

lp = LAParams()
rm = PDFResourceManager()
cnv = TextConverter(rm, mem, laparams=lp)
ip = PDFPageInterpreter(rm, cnv)

for page in PDFPage.get_pages(pdf):
    ip.process_page(page)
# collect the accumulated text once all pages are processed
text = mem.getvalue()

out = open("F:\\AIB\\convertedtext.txt", 'wb')  # path to your destination file
out.write(text.encode('utf-8'))
out.close()
pdf.close()

print("DONE")
Example #3
    def parse(self, path, filename):
        print('----------------------------------------------------------')
        print('Searching document: ' + filename)

        writepath = self.copy_excel('s' + filename.rsplit('.', 1)[0])
        wb = copy(xlrd.open_workbook(writepath, formatting_info=True))
        ws = wb.get_sheet(0)

        fp = open(path, 'rb')  # open in binary read mode
        # create a PDF parser from the file object
        parser = PDFParser(fp)
        # create a PDF document
        doc = PDFDocument()
        # connect the parser and the document object
        parser.set_document(doc)
        doc.set_parser(parser)
        # provide the initial password
        # if there is no password, use an empty string
        doc.initialize()

        # check whether the document allows text extraction; if not, abort
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        else:
            # create a PDF resource manager to manage shared resources
            rsrcmgr = PDFResourceManager()
            # create a PDF device object
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # create a PDF interpreter object
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # set parameters
            # loop over the page list, processing one page at a time
            flag = False
            count = 0
            _page_finance = 0
            _page_agency = 0
            _page_overview = 0
            _page_money = 0
            _count__read_page = 0
            layouts = []
            _mat_shiyi = False
            for page in doc.get_pages():  # doc.get_pages() returns the page list
                if _count__read_page == 3:
                    ws = wb.get_sheet(7)
                    self.findPartyConcernedMsg(layouts, writepath, wb, ws)
                    break
                interpreter.process_page(page)
                layout = device.get_result()
                count += 1
                if _count__read_page != 0:
                    _count__read_page += 1
                    layouts.append(layout)
                    continue
                # _read_row = 0

                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        # _read_row += 1
                        results = x.get_text().replace(" ",
                                                       "").replace('\n',
                                                                   '').strip()
                        if self.locationYW(results):
                            print(results)
                        # if re.match(r'[\w\W]*第一节[\W]*释义', results):
                        #     _mat_shiyi = True
                        # if not _mat_shiyi and _read_row > 5:
                        #     break
                        # if filename == '晶丰明源.pdf':
                        #     print(results)
                        if re.match('[\w\W]*目录+[\w\W]*', results):
                            break
                        if self.locationOverview(results):
                            print('Found the overview on page %d' % count)
                            _page_overview = count
                            self.findOverviewMsf(layout, writepath, wb, ws)
                            break
                        if self.findPartyConcerned(results):
                            print('Found the intermediary agencies on page %d' % count)
                            _count__read_page += 1
                            layouts.append(layout)
                            _page_agency = count
                            self.split_pdf(_page_finance, _page_money,
                                           _page_agency, _page_overview, path,
                                           self.create_spdf_dir(filename))
                            break
                        mat = re.search(r'[二|三|四|五]+\、[\w\W]*主要财务数据[\w]*',
                                        results)
                        _mat = re.search(r'[\w\W]+\、募集资金[\w\W]*(用途|运用)[\w\W]*',
                                         results)
                        if _mat is not None:
                            _page_money = count
                            print("Found the raised funds on page %d" % count)
                        # use of raised funds: comes after the financial data in section three.

                        if mat is not None:
                            if self.findCurrentLiabilities(layout):
                                # locationSuccess = True
                                _page_finance = count
                                # split_pdf(count, path, create_spdf_dir(filename))
                                print('Found the financial information on page %d' % count)
                                break
                            else:
                                flag = True
                                break  # done with the current page
                        else:
                            if flag:
                                if self.findCurrentLiabilities(layout):
                                    # locationSuccess = True
                                    _page_finance = count
                                    # split_pdf(count, path, create_spdf_dir(filename))
                                    print('Found the financial information on page %d' % count)
                                    break
                                else:
                                    flag = False
Example #4
def get_report_startpage(pdf):
    """Get the starting page of the financial statements within the file.
    Arguments:
        pdf {str} -- path to the PDF file
    Returns:
        start_page {int} -- starting page of the financial statements
    """
    getp = pdfplumber.open(pdf)
    total = len(getp.pages)
    # used to check whether the current page is within the first 10 pages
    count = 0
    # stores the starting page of the statements
    start_page = 1
    # flag: is this an annual-report-style file
    flag = False
    # create a PDF resource manager object to store shared resources
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    codec = 'utf-8'
    outfp = StringIO()
    # create the device object
    device = TextConverter(rsrcmgr=rsrcmgr, outfp=outfp, codec=codec, laparams=laparams)
    if total > 30:
        print('Total pages:', total)
        with open(pdf, 'rb') as fp:
            # process the page contents
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            # iterate over every page in the pdf
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
                count += 1
                teststr = ''
                interpreter.process_page(page)
                teststr = outfp.getvalue()

                # does the first page mention an annual/quarterly report?
                # if not, there is no need to look for a starting page
                rs = re.search('(年\s*|季\s*)度?\s*报\s*告?', teststr)
                # print(teststr)
                if rs is not None and count == 1:
                    # found report-related text on the first page; look for the table of contents on the next page
                    flag = True
                    continue
                elif rs is None and count == 1:
                    # no annual/quarterly report text on the first page; check the second page
                    # (some reports carry a seal on the first page, so text extraction is incomplete)
                    print('No annual/quarterly report text detected on page 1, checking page 2')
                    continue
                elif rs is not None and count == 2:
                    # found report-related text on the second page; look for the table of contents on the third page
                    flag = True
                    continue
                elif rs is None and count == 2:
                    # if neither page 1 nor page 2 mentions an annual/quarterly report,
                    # assume this is not such a file
                    if not flag:
                        device.close()
                        outfp.close()
                        print('Starting page of the financial statements in this file:', start_page)
                        return start_page
                # if page 1 or 2 mentions an annual or quarterly report,
                # look for the table-of-contents page within the first 10 pages
                if flag:
                    # 1. process the first 10 pages
                    if count < 11:
                        # look for the table-of-contents page
                        if re.search('目\s*录', teststr, flags=0):

                            # check whether the page containing the table of contents
                            # has an entry related to the financial statements

                            # reg_stmt = re.compile(r'财务报告\D{10,}(\d{1,3})')

                            ret = re.search('财务报告\s*(.)*\d', teststr)
                            if ret is not None:
                                ret = ret.group()
                                # strip whitespace
                                tstr = [y.strip() for y in re.split(r'[…¨ .]', ret) if len(y) != 0]
                                # the first value is the entry name, the second is the page number
                                start_page = int(tstr[1])
                                device.close()
                                outfp.close()
                                print('Starting page of the financial statements in this file:', start_page)
                                return start_page
                            else:
                                # the table-of-contents page has no financial-statements entry; process the next page
                                count += 1
                                continue
                        else:
                            # no table-of-contents text on this page; check the next one
                            print('Page', count, 'has no table-of-contents text, checking the next page')
                            continue
                    else:
                        print('No table-of-contents text found within 10 pages')
                        # no table-of-contents page within 10 pages; exit the loop
                        break

    else:
        # files of 30 pages or fewer are not processed
        print('Starting page of the financial statements in this file:', start_page)
        return start_page

    device.close()
    outfp.close()
    print('Starting page of the financial statements in this file:', start_page)
    return start_page
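
A usage sketch for the function above (the file name is hypothetical):

if __name__ == '__main__':
    start = get_report_startpage('annual_report.pdf')
    print('Financial statements start on page', start)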
Example #5
def createDeviceInterpreter():
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return device, interpreter
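
The device/interpreter pair returned above is driven page by page, as in the other examples here; a minimal usage sketch (the file name is hypothetical, and it assumes PDFPage from pdfminer.pdfpage is in scope):

device, interpreter = createDeviceInterpreter()
with open('some.pdf', 'rb') as fp:
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        layout = device.get_result()
        for obj in layout:
            # only layout objects with get_text() carry text
            if hasattr(obj, 'get_text'):
                print(obj.get_text())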
Example #6
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from subprocess import call
from pdfminer.layout import LAParams
import os
import urllib2

url = 'http://www.ird.gov.hk/chi/pdf/c_s88list.pdf'
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
pdfdata = opener.open(url).read()
file = open('document.pdf', 'wb')
file.write(pdfdata)
file.close()
call('qpdf --password= --decrypt {0}/document.pdf {0}/decrypted.pdf'.format(
    os.getcwd()).split())

outfp = open('modifiedla.txt', 'w')
parser = PDFParser(open('decrypted.pdf', 'rb'))
document = PDFDocument(parser)
rsrcmgr = PDFResourceManager()
laparams = LAParams(char_margin=10)
device = TextConverter(rsrcmgr, outfp, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)

outfp.close()
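
The download step above relies on Python 2's urllib2; a rough Python 3 equivalent of just that step, using urllib.request with the same User-agent header:

import urllib.request

opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
pdfdata = opener.open('http://www.ird.gov.hk/chi/pdf/c_s88list.pdf').read()
with open('document.pdf', 'wb') as f:
    f.write(pdfdata)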
Example #7
def main(argv):
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug = True
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    if outtype == 'text':
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               outdir=outdir,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        fp = io.open(fname, 'rb')
        process_pdf(rsrcmgr,
                    device,
                    fp,
                    pagenos,
                    maxpages=maxpages,
                    password=password,
                    caching=caching,
                    check_extractable=True)
        fp.close()
    device.close()
    if close_outfp:
        outfp.close()
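
A typical entry point for a main() like this one (a sketch; the original snippet does not include it):

if __name__ == '__main__':
    sys.exit(main(sys.argv))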
Example #8
def request_pdf(url, case_id, court_name):
    try:
        response = requests.request("GET", url, verify=False, proxies=proxy_dict)
        if response.status_code == 200:
            res = response.text

            if res is None:
                logging.error("No data for: " + str(case_id))
                return "NULL"

            file_path = module_directory + "/../Data_Files/PDF_Files/" + court_name + "_" + slugify(case_id) + ".pdf"
            fw = open(file_path, "wb")
            fw.write(response.content)

            text_data = ""

            pdf_manager = PDFResourceManager()
            string_io = StringIO()
            pdf_to_text = TextConverter(pdf_manager, string_io, codec='utf-8', laparams=LAParams())
            interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
            with open(file_path, 'rb') as pdf_fp:
                for page in PDFPage.get_pages(pdf_fp):
                    interpreter.process_page(page)
                text_data = string_io.getvalue()

            file_path = module_directory + "/../Data_Files/Text_Files/" + court_name + "_" + slugify(case_id) + ".txt"
            fw = open(file_path, "w")
            fw.write(str(text_data))

            return str(text_data)
        else:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"

    except Exception as e:
        logging.error("Failed to get pdf file for: " + str(case_id) + ". Error: %s", e)
        return "NULL"
Example #9
    def __init__(self, pdf_stream, password='', pagenos=[], maxpages=0):
        ReaderBackend.__init__(self)
        self.pdf_stream = pdf_stream

        # Extract Metadata
        parser = PDFParser(pdf_stream)
        doc = PDFDocument(parser, password=password, caching=True)
        if doc.info:
            for k in doc.info[0]:
                v = doc.info[0][k]
                # print(repr(v), type(v))
                if isinstance(v, (bytes, str, unicode)):
                    self.metadata[k] = make_compat_str(v)
                elif isinstance(v, (psparser.PSLiteral, psparser.PSKeyword)):
                    self.metadata[k] = make_compat_str(v.name)

        # Secret Metadata
        if 'Metadata' in doc.catalog:
            metadata = resolve1(doc.catalog['Metadata']).get_data()
            # print(metadata)  # The raw XMP metadata
            # print(xmp_to_dict(metadata))
            self.metadata.update(xmp_to_dict(metadata))
            # print("---")

        # Extract Content
        text_io = BytesIO()
        rsrcmgr = PDFResourceManager(caching=True)
        converter = TextConverter(rsrcmgr,
                                  text_io,
                                  codec="utf-8",
                                  laparams=LAParams(),
                                  imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, converter)

        self.metadata["Pages"] = 0
        self.curpage = 0
        for page in PDFPage.get_pages(self.pdf_stream,
                                      pagenos=pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=True,
                                      check_extractable=False):
            # Read page contents
            interpreter.process_page(page)
            self.metadata["Pages"] += 1
            self.curpage += 1

            # Collect URL annotations
            # try:
            if page.annots:
                refs = self.resolve_PDFObjRef(page.annots)
                if refs:
                    if isinstance(refs, list):
                        for ref in refs:
                            if ref:
                                self.references.add(ref)
                    elif isinstance(refs, Reference):
                        self.references.add(refs)

            # except Exception as e:
            # logger.warning(str(e))

        # Remove empty metadata entries
        self.metadata_cleanup()

        # Get text from stream
        self.text = text_io.getvalue().decode("utf-8")
        text_io.close()
        converter.close()
        # print(self.text)

        # Extract URL references from text
        for url in extractor.extract_urls(self.text):
            self.references.add(Reference(url, self.curpage))

        for ref in extractor.extract_arxiv(self.text):
            self.references.add(Reference(ref, self.curpage))

        for ref in extractor.extract_doi(self.text):
            self.references.add(Reference(ref, self.curpage))
Example #10
    def __init__(
        self,
        file,
        merge_tags=('LTChar', 'LTAnno'),
        round_floats=True,
        round_digits=3,
        input_text_formatter=None,
        normalize_spaces=True,
        resort=True,
        parse_tree_cacher=None,
        laparams={
            'all_texts': True,
            'detect_vertical': True
        },
    ):
        # store input
        self.merge_tags = merge_tags
        self.round_floats = round_floats
        self.round_digits = round_digits
        self.resort = resort

        # set up input text formatting function, if any
        if input_text_formatter:
            self.input_text_formatter = input_text_formatter
        elif normalize_spaces:
            r = re.compile(r'\s+')
            self.input_text_formatter = lambda s: re.sub(r, ' ', s)
        else:
            self.input_text_formatter = None

        # open doc
        if not hasattr(file, 'read'):
            try:
                file = open(file, 'rb')
            except TypeError:
                raise TypeError("File must be file object or filepath string.")

        parser = PDFParser(file)
        if hasattr(QPDFDocument, 'set_parser'):
            # pdfminer < 20131022
            doc = QPDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            # pdfminer >= 20131022
            doc = QPDFDocument(parser)
            parser.set_document(doc)
        if hasattr(doc, 'initialize'):
            # as of pdfminer==20140328, "PDFDocument.initialize() method is
            # removed and no longer needed."
            doc.initialize()
        self.doc = doc
        self.parser = parser
        self.tree = None
        self.pq = None
        self.file = file

        if parse_tree_cacher:
            self._parse_tree_cacher = parse_tree_cacher
            self._parse_tree_cacher.set_hash_key(self.file)
        else:
            self._parse_tree_cacher = DummyCache()

        # set up layout parsing
        rsrcmgr = PDFResourceManager()
        if isinstance(laparams, dict):
            laparams = LAParams(**laparams)
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

        # caches
        self._pages = []
        self._pages_iter = None
        self._elements = []
Example #11
def pdf_to_csv(filename):
    from cStringIO import StringIO
    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)

        def end_page(self, i):
            from collections import defaultdict

            lines = defaultdict(lambda: {})
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    line = lines[int(-y)]
                    line[x] = child._text.encode(self.codec)  #<-- changed

            for y in sorted(lines.keys()):
                line = lines[y]
                self.outfp.write(";".join(line[x]
                                          for x in sorted(line.keys())))
                self.outfp.write("\n")

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()
    outfp = StringIO()
    device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
    # because my test documents are utf-8 (note: utf-8 is the default codec)

    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser)
    parser.set_document(doc)
    # doc.set_parser(parser)
    # doc.initialize('')

    interpreter = PDFPageInterpreter(rsrc, device)
    pagenos = set()
    rotation = 0
    i = 1
    for page in PDFPage.get_pages(fp, pagenos):
        page.rotate = (page.rotate + rotation) % 360
        outfp.write("START PAGE %d\n" % i)
        interpreter.process_page(page)
        outfp.write("END PAGE %d\n" % i)
        i += 1

    # for i, page in enumerate(doc.get_pages()):
    # outfp.write("START PAGE %d\n" % i)
    #     if page is not None:
    #         interpreter.process_page(page)
    #     outfp.write("END PAGE %d\n" % i)

    device.close()
    fp.close()
    return outfp.getvalue()
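
A usage sketch (the file name is hypothetical); the returned string holds one semicolon-separated line per row of text, bracketed by the START PAGE/END PAGE markers written in the loop:

print(pdf_to_csv('input.pdf'))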
Example #12
def parse():
    # open in binary read mode
    fb = open(path, 'rb')
    # create a pdf parser
    parser = PDFParser(fb)
    # create a pdf document object
    doc = PDFDocument()

    # connect the parser and the document object
    parser.set_document(doc)
    doc.set_parser(parser)

    # provide the initial password
    # if there is no password, use an empty string
    doc.initialize()
    obj = {}
    amount = 0
    # check whether the document allows text extraction; if not, abort
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # pdf resource manager for shared resources
        resource = PDFResourceManager()
        # layout parameter analyzer
        laparam = LAParams()
        # aggregator
        device = PDFPageAggregator(resource, laparams=laparam)
        # create the PDF interpreter
        interpreter = PDFPageInterpreter(resource, device)

        # loop over the page list returned by doc.get_pages(), one page at a time
        for index, page in enumerate(doc.get_pages()):
            # if index < 3:
            #     continue
            # if index == 4:
            #     break

            # read the page with the interpreter
            interpreter.process_page(page)
            # get the content from the aggregator
            layout = device.get_result()

            for out in layout:
                if hasattr(out, "get_text"):
                    # print(out.get_text())
                    # remove codes such as (cid:12) produced by unrecognizable characters
                    t = re.sub(r'\(cid:[\d]*\)', '', out.get_text())
                    # remove special content such as digits, 's, 'm, 're, n't
                    tx = re.sub(r'(\d+|\'s|\'m|\'re|n\'t)', '', t)
                    # remove punctuation and collapse multiple spaces into one
                    txt = re.sub(
                        r'[\s+\?\.\!\/_,`:;\-$%^*\[\]\{\})(+\"\']+|[+——!,。?、‘’“”~@#¥%……&*():]+',
                        ' ', tx)
                    for word in txt.split():
                        # skip non-English words
                        if not is_english(word):
                            continue
                        # convert the word to lowercase
                        w = word.lower()
                        amount = amount + 1
                        if w in obj:
                            obj[w] = obj[w] + 1
                        else:
                            obj[w] = 1

    db = connect()
    # get a cursor
    cursor = db.cursor()

    # create the table
    cursor.execute('CREATE TABLE IF NOT EXISTS ' + tablename +
                   '(word varchar(255) NOT NULL, ' +
                   'count int NOT NULL, probability float NOT NULL, ' +
                   'PRIMARY KEY (word))')

    # empty the table so the previous run's results do not interfere
    cursor.execute('truncate table ' + tablename)
    for key in obj:
        # build a sql statement
        sql = 'REPLACE INTO ' + tablename + ' (word, count, probability) VALUES(%s, %s, %s)'
        # execute it
        cursor.execute(sql,
                       (key, obj[key], round(obj[key] / amount * 10000, 2)))
        # commit
        db.commit()

    # close the database connection
    db.close()
    print("Total words: %s" % amount)
Example #13
def main(argv):
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp)
    else:
        return usage()
    for fname in args:
        with open(fname, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    device.close()
    outfp.close()
    return
Example #14
def get_text_from_pdf(path, page_nums=None):
    r = []

    fp = open(path, 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp, pagenos=page_nums)

    def parse_obj(lt_objs):
        # https://stackoverflow.com/questions/31819862/python-pdf-mining-get-position-of-text-on-every-line
        # loop over the object list

        for obj in lt_objs:
            if isinstance(obj, LTTextLine):
                x1, y1, x2, y2 = obj.bbox
                assert x1 < x2
                assert y1 < y2

                y1 = 1400 - y1
                y2 = 1400 - y2
                y1, y2 = y2, y1

                text = obj.get_text()
                width = obj.width
                height = obj.height

                text = text.replace('東久留米武蔵村山', '東久留米 武蔵村山')  # HACK!

                for line_i, line in enumerate(
                        text.split('\n')):  # CHECK WHETHER THIS IS NEEDED!
                    for word_j, word in enumerate(line.split()):
                        each_height = height / text.count('\n')
                        i_y1 = y1 + each_height * line_i
                        i_y2 = y2 + each_height * (line_i + 1)

                        each_width = width / len(line.split())
                        i_x1 = x1 + each_width * word_j
                        i_x2 = x2 + each_width * (word_j + 1)

                        r.append(
                            TextItem(text=word,
                                     x1=i_x1,
                                     y1=i_y1,
                                     x2=i_x2,
                                     y2=i_y2,
                                     width=each_width,
                                     height=each_height))

            # if it's a textbox, also recurse
            if isinstance(obj, LTTextBoxHorizontal):
                parse_obj(obj._objs)

            # if it's a container, recurse
            elif isinstance(obj, LTFigure):
                parse_obj(obj._objs)

    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()

        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                parse_obj(lobj)

    for xx in range(5):
        dists = []

        for x in range(len(r)):
            for y in range(len(r)):
                text_item_1 = r[x]
                text_item_2 = r[y]

                dists.append((abs(text_item_1.y1 - text_item_2.y1), x, y))

        merged = set()
        for dist, x, y in sorted(dists):
            text_item_1 = r[x]
            text_item_2 = r[y]

            text_1_num = all(i.isnumeric() or i in ',()'
                             for i in text_item_1.text.strip())
            text_2_num = all(i.isnumeric() or i in ',()'
                             for i in text_item_2.text.strip())

            if not dist:
                continue
            elif text_1_num != text_2_num:
                continue
            elif y in merged:
                continue
            merged.add(y)

            if dist <= 18:  # NOTE ME: This threshold may need to be tuned!!! =====================================
                r[y] = TextItem(text=text_item_2.text,
                                x1=text_item_2.x1,
                                y1=text_item_1.y1,
                                x2=text_item_2.x2,
                                y2=text_item_1.y1 + text_item_2.height,
                                width=text_item_2.width,
                                height=text_item_2.height)

    r.sort(key=lambda x: (x.y1, x.x1, x.x2, x.y2))
    #for i in r:
    #    print(i)
    return r
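
A usage sketch for the function above (the file name is hypothetical); each returned TextItem carries one word plus its reconstructed bounding box:

for item in get_text_from_pdf('document.pdf'):
    print(item.text, item.x1, item.y1, item.x2, item.y2)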
Example #15
def arc():
    destino = str(formato.get())
    if destino == "Arquivo do Word": destino = "docx"
    if destino == "Arquivo do Power-Point": destino = "ppt"
    if destino == "Arquivo do Excel": destino = "xlsx"
    if destino == "Arquivo de Texto": destino = "txt"
    import win32com.client as win32
    from os import path
    in_file = path.abspath(diretorio)
    out_file = path.abspath(filename)

    if destino == "docx":
        if file_extension in ArqDOCX or file_extension.lower(
        ) == ".pdf" or file_extension.lower() == ".txt":
            word = win32.DispatchEx("Word.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=16)
            doc.Close()
            word.Quit()

    elif destino.lower() == "pdf":
        if file_extension.lower() in ArqPPT:
            word = win32.DispatchEx("PowerPoint.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Presentations.Open(in_file)
            doc.SaveAs(out_file, FileFormat=32)
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqXLSX:
            word = win32.DispatchEx("Excel.Application")
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Workbooks.Open(in_file)
            doc.ExportAsFixedFormat(0, out_file)
            doc.Close()
            word.Quit()
        elif file_extension.lower() in ArqDOCX or file_extension.lower(
        ) == ".txt":
            word = win32.Dispatch('Word.Application')
            word.Visible = 0
            word.DisplayAlerts = 0
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=17)
            doc.Close()
            word.Quit()

    elif destino.lower() == "xlsx":
        if file_extension.lower() == ".pdf":
            import pdftables_api
            c = pdftables_api.Client('to7jluln0hvr')
            c.xlsx(diretorio, filename + '.xlsx')
        elif file_extension.lower() == ".txt" or file_extension.lower(
        ) in ArqDOCX:
            import pandas as pd
            df = pd.read_csv(diretorio, header=None, delim_whitespace=True)
            df.to_excel(filename + '.xlsx', index=False, header=None)

    elif destino.lower() == "txt":
        if file_extension in ArqDOCX:
            import docx2txt
            text = docx2txt.process(diretorio)
            with open(filename + ".txt", "w") as file:
                print(text, file=file)
        elif file_extension.lower() == ".pdf":
            from io import StringIO
            from pdfminer.pdfparser import PDFParser
            from pdfminer.pdfdocument import PDFDocument
            from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
            from pdfminer.converter import TextConverter
            from pdfminer.layout import LAParams
            from pdfminer.pdfpage import PDFPage
            output_string = StringIO()
            with open(diretorio, 'rb') as in_file:
                parser = PDFParser(in_file)
                doc = PDFDocument(parser)
                rsrcmgr = PDFResourceManager()
                device = TextConverter(rsrcmgr,
                                       output_string,
                                       laparams=LAParams())
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
            with open(filename + ".txt", "w") as final:
                final.write(output_string.getvalue())
        elif file_extension.lower() in ArqXLSX:
            import pandas as pd
            read_file = pd.read_excel(diretorio, header=None)
            read_file.to_csv(filename + ".txt", index=None, header=True)

    messagebox.showinfo(
        "Format converted",
        "File format converted successfully.\n\n" +
        file_extension[1:].upper() + " to " + destino.upper() +
        "\n\nSaved at: " + out_file + "." + destino)
    root.destroy()
Example #16
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import TextConverter
from pdfminer.cmapdb import CMapDB
from pdfminer.layout import LAParams

from MoinMoin import log
logging = log.getLogger(__name__)

LAPARAMS = LAParams(
    # value is specified not as an actual length, but as a proportion of the length to the
    # size of each character in question.
    # two text chunks whose distance is closer than the char_margin is considered
    # continuous and get grouped into one.
    char_margin=0.3,
    # it may be required to insert blank characters (spaces) as necessary if the distance
    # between two words is greater than the word_margin, as a blank between words might
    # not be represented as a space, but indicated by the positioning of each word.
    word_margin=0.2,
    # two lines whose distance is closer than the line_margin is grouped as a text box,
    # which is a rectangular area that contains a "cluster" of text portions.
    line_margin=0.3,
)


class UnicodeConverter(TextConverter):
    # as result, we want a unicode object
    # TextConverter only provides encoded output into a file-like object
    def __init__(self, rsrcmgr, pageno=1, laparams=None, showpageno=False):
        TextConverter.__init__(self,
                               rsrcmgr,
                               None,
                               pageno=pageno,
                               laparams=laparams,
                               showpageno=showpageno)
Example #17
# imports assumed by this example (old pdfminer API, where PDFDocument
# still lives in pdfminer.pdfparser):
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import (PDFResourceManager, PDFPageInterpreter,
                                PDFTextExtractionNotAllowed)
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams


def parse():
    # open the local PDF file in binary read mode ('rb')
    fn = open('半监督模糊聚类及其应用_杨昔阳.pdf', 'rb')
    # create a PDF parser for it
    parser = PDFParser(fn)
    # create a PDF document object
    doc = PDFDocument()
    # connect the parser and the document object to each other
    parser.set_document(doc)
    doc.set_parser(parser)
    # supply the password, e.g. doc.initialize("lianxipython");
    # if there is no password, pass an empty string
    doc.initialize("")
    # check whether the document allows text extraction; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # create a PDF resource manager
    resource = PDFResourceManager()
    # create the layout analysis parameters
    laparams = LAParams()
    # create an aggregator that collects the document's layout objects
    device = PDFPageAggregator(resource, laparams=laparams)
    # create an interpreter that decodes the document into objects
    # Python can work with
    interpreter = PDFPageInterpreter(resource, device)
    # iterate over the pages (doc.get_pages() returns them),
    # handling one page per pass
    for page in doc.get_pages():
        # parse a single page with the interpreter's process_page() method
        interpreter.process_page(page)
        # fetch this page's content with the aggregator's get_result() method
        layout = device.get_result()
        # layout is an LTPage object holding the objects parsed from the page
        for out in layout:
            # only objects with a get_text() method carry the text we want
            if hasattr(out, "get_text"):
                print(out.get_text())
                with open('test.txt', 'a') as f:
                    f.write(out.get_text() + '\n')
    fn.close()


if __name__ == '__main__':
    parse()
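For comparison, a minimal sketch of the same extraction on the current pdfminer.six API, where all the parser/document/aggregator plumbing above collapses into extract_pages:

from pdfminer.high_level import extract_pages

for page_layout in extract_pages('半监督模糊聚类及其应用_杨昔阳.pdf'):
    for element in page_layout:
        # only layout objects with get_text() carry text
        if hasattr(element, 'get_text'):
            print(element.get_text())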
Example #18
0
def Converting_Function(Path_To_TXTs, new_file):
    """
    :param Path_To_TXTs: path to the folder of PDF and/or XML files
    :param new_file: the folder path in which to save the TXT output
    """
    files_short = np.array([
        f for f in os.listdir(Path_To_TXTs)
        if os.path.isfile(os.path.join(Path_To_TXTs, f))
    ])
    files = np.array([Path_To_TXTs + '/' + f for f in files_short])
    for file in files:
        if file.endswith('.pdf'):
            Not_Good = False
            Prob = False
            try:
                fp = open(file, 'rb')
                parser_pdf = PDFParser(fp)
                doc = PDFDocument(parser_pdf)
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(doc):
                    interpreter.process_page(page)
                    device.get_result()
                rows = device.rows
                lines = [item[5] for item in rows]
                if average_len(lines) >= 20:
                    try:
                        text_all = convert_pdf_to_txt(file, pages=[0])
                        rows_pages = [item for item in rows if item[0] != 0]
                        words = [item[1] for item in rows_pages]
                        words_1 = [item for item in words if item <= 200]
                        words_2 = [item for item in words if item > 200]
                        first = most_common(words_1)
                        second = most_common(words_2)
                        pages = [item[0] for item in rows_pages]
                        pages = list(set(pages))
                        pages.sort()
                        for page in pages:
                            page_lines = [
                                line for line in rows_pages if line[0] == page
                            ]
                            text1 = ''
                            text2 = ''
                            text_middle = ''
                            for item in page_lines:
                                if item[1] <= (first + 20) and not (
                                        item[5].isdigit()
                                        and not item[5].endswith('.')):
                                    text1 = text1 + '\n' + item[5]
                                elif item[1] >= (
                                        second -
                                        20) and item[1] <= 500 and not (
                                            item[5].isdigit()
                                            and not item[5].endswith('.')):
                                    text2 = text2 + '\n' + item[5]
                                else:
                                    if not (item[5].isdigit()
                                            and not item[5].endswith('.')):
                                        text_middle = text_middle + '\n' + item[
                                            5]
                            if len(text1 + text2) > len(text_middle):
                                text_all = text_all + text1 + text_middle + text2
                            else:
                                Not_Good = True
                        if len(text_all) >= 1500 and not Not_Good:
                            text_all = text_all.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                            #print('Article ', name, ' is successfully converted')
                        elif len(text_all) >= 1500 and Not_Good:
                            rawText = parser.from_file(file)
                            text = rawText['content']
                            text = os.linesep.join(
                                [s for s in text.splitlines() if s])
                            text_all = text.replace(' ac.', '~').replace(
                                ' a.c.', '~').replace(' a.c', '~')
                            text_all = " ".join(text_all.split())
                            name = file.split('/')[-1][:-4]
                            path = new_file + '/' + name + '.txt'
                            with open(path, 'w', encoding='utf8') as f:
                                f.write(text_all)
                            #print('Article ', name, ' is successfully converted')
                        else:
                            raw = parser.from_file(file)
                            text_all = raw['content']
                            text_all = "\n".join([
                                ll.rstrip() for ll in text_all.splitlines()
                                if ll.strip()
                            ])
                            if len(text_all) >= 1500:
                                text_all = text_all.replace(
                                    ' ac.',
                                    '~').replace(' a.c.',
                                                 '~').replace(' a.c', '~')
                                name = file.split('/')[-1][:-4]
                                path = new_file + '/' + name + '.txt'
                                with open(path, 'w', encoding='utf8') as f:
                                    f.write(text_all)
                                #print('Article ', name, ' is successfully converted')
                            else:
                                pass
                                #print('The PDF "' + file + '" contain less than 1500 characters !!!')
                    except Exception:
                        Prob = True
                elif average_len(lines) < 20:
                    raw = parser.from_file(file)
                    text_all = raw['content']
                    text_all = "\n".join([
                        ll.rstrip() for ll in text_all.splitlines()
                        if ll.strip()
                    ])
                    if len(text_all) >= 1500:
                        text_all = text_all.replace(' ac.', '~').replace(
                            ' a.c.', '~').replace(' a.c', '~')
                        name = file.split('/')[-1][:-4]
                        path = new_file + '/' + name + '.txt'
                        with open(path, 'w', encoding='utf8') as f:
                            f.write(text_all)
                        #print('Article ', name, ' is successfully converted')
                    else:
                        pass
                        #print('The PDF "' + file + '" contain less than 1500 characters !!!')
            except Exception:
                Prob = True
            if Prob:
                raw = parser.from_file(file)
                text_all = raw['content']
                text_all = "\n".join([
                    ll.rstrip() for ll in text_all.splitlines() if ll.strip()
                ])
                if len(text_all) >= 1500:
                    text_all = text_all.replace(' ac.', '~').replace(
                        ' a.c.', '~').replace(' a.c', '~')
                    name = file.split('/')[-1][:-4]
                    path = new_file + '/' + name + '.txt'
                    with open(path, 'w', encoding='utf8') as f:
                        f.write(text_all)
                    #print('Article ', name, ' is successfully converted')
                else:
                    pass
                    #print('The PDF "' + file + '" contain less than 1500 characters !!!')
        elif file.endswith('.xml'):
            text_all = get_text_from_XML_without_saving(file)
            text_all = text_all.split('competing financial interest')[0]
            text_all = text_all.replace(' ac.',
                                        '~').replace(' a.c.',
                                                     '~').replace(' a.c', '~')
            name = file.split('/')[-1][:-4]
            path = new_file + '/' + name + '.txt'
            with open(path, 'w', encoding='utf8') as f:
                f.write(text_all)
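The control flow above boils down to: extract with pdfminer, and fall back to Apache Tika whenever the result looks wrong. A condensed sketch of that pattern (assumes the tika package; convert_pdf_to_txt is the pdfminer-based helper referenced above, not defined here):

from tika import parser as tika_parser

def extract_with_fallback(path, min_chars=1500):
    try:
        # pdfminer-based extraction (helper assumed from the example above)
        text = convert_pdf_to_txt(path)
    except Exception:
        text = ''
    if len(text) < min_chars:
        # fall back to Apache Tika when the pdfminer output looks too short
        raw = tika_parser.from_file(path)
        text = raw.get('content') or ''
    return text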
Example #19
0
File: pdf.py Project: guix77/weboob
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    if LOGGER.isEnabledFor(DEBUGFILES):
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        texts = sum([
            list(lttext_to_multilines(obj, page_layout))
            for obj in page_layout._objs
            if isinstance(obj, (LTTextBox, LTTextLine, LTChar))
        ], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255),
                         random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [
            lt_to_coords(obj, page_layout) for obj in page_layout._objs
            if isinstance(obj, (LTRect, LTLine, LTCurve))
        ]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255),
                         random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows),
                     sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127,
                                            255), random.randint(127, 255),
                             random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1),
                                   outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings',
                     sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB',
                            (int(page.mediabox[2]), int(page.mediabox[3])),
                            (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127,
                                            255), random.randint(127, 255),
                             random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1),
                                   outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1),
                              '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    device.close()
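A hypothetical usage sketch (the file name is an assumption); each yielded page is a list of rows, each row a list of cells, each cell a list of strings:

with open('statement.pdf', 'rb') as f:
    data = f.read()
for page_rows in get_pdf_rows(data):
    for row in page_rows:
        print(' | '.join(' '.join(cell) for cell in row))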
Example #20
0
def parse_webpages(webpages):
    for page in webpages:
        # obtain the robots.txt url
        r = Robots.robots_url(page)
        robots = Robots.fetch(r)
        if robots.allowed(page, '*'):
            # sitemaps is a list of all the sitemaps for a website
            sitemaps = robots.sitemaps
            sitemaps_list = list(sitemaps)
            html = requests.get(page)  # html of the webpage
            soup = bs4.BeautifulSoup(html.text, "html.parser")
            outlinks = soup.find_all("a")  # all the outlinks
            links = [str(i.get('href')) for i in outlinks]
            outlinks = [str(i) for i in outlinks]
            docs = []  # the documents on the page

            for file in links:
                directory = page.rsplit("/", 1)[0]
                link = directory + '/' + file

                # can be expanded to other file types with a comma
                if file.endswith(('txt', 'md')):
                    if file.startswith(('http://', 'www.')):
                        text = bs4.BeautifulSoup(
                            requests.get(file).text, "html.parser")
                        ext = file.rsplit(".", 1)[-1]
                        text = [file, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                    else:
                        text = bs4.BeautifulSoup(
                            requests.get(link).text, "html.parser")
                        ext = link.rsplit(".", 1)[-1]
                        text = [link, ext, text]
                        # text = {'link': link, 'ext': ext, 'text': text}
                        docs.append(text)
                elif file.endswith(('pdf')):  # special case if PDF
                    x = file
                    try:
                        if file.startswith(('http://', 'www.')):
                            pdf = file.rsplit("/", 1)[-1]
                            response = urlopen(file)
                        else:
                            pdf = file.rsplit("/", 1)[-1]
                            # must first check if pdf is found
                            response = urlopen(link)

                    except urllib.error.HTTPError as e:
                        # if 404 error, put 404 as text
                        text = [link, "pdf", "404"]
                        # text = {'link': link, 'ext': 'pdf', 'text': "404"}
                        docs.append(text)

                    else:
                        # otherwise we must save the PDF locally to run pdfminer on it
                        with open(pdf, 'wb') as pdf_out:
                            pdf_out.write(response.read())
                        if x.startswith('http://'):
                            link = x
                        txt = ""
                        pdf_in = open(pdf, 'rb')
                        parser = PDFParser(pdf_in)
                        document = PDFDocument(parser)
                        rsrcmgr = PDFResourceManager()
                        laparams = LAParams()
                        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for p in PDFPage.create_pages(document):
                            # the interpreter processes each page stored in the PDFDocument
                            interpreter.process_page(p)
                            # the device renders the layout from the interpreter
                            layout = device.get_result()
                            # of the many LT objects in the layout, we only want
                            # LTTextBox and LTTextLine
                            for lt_obj in layout:
                                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                                    txt += lt_obj.get_text()

                        # close the pdf file
                        pdf_in.close()
                        name = [link, "pdf", txt]
                        # name = {'link': link, 'ext': 'pdf', 'text': txt}
                        os.remove(pdf)  # remove the saved file when done
                        docs.append(name)

            docs = [[str(i) for i in lis] for lis in docs]
            timestamp = datetime.datetime.now().isoformat()
            output = {
                'url': page,
                'timestamp': timestamp,
                'outlinks': outlinks,
                'html': html.text,
                'docs': docs,
                'sitemaps': sitemaps_list
            }

            with Crawling_L_REST.app.app_context():
                Crawling_L_REST.add_webpage(output)

            # note: this returns after the first allowed page has been processed
            return output
Example #21
0
            # keys: '文件名' = file name, '证券简称' = security short name,
            # '证券代码' = security code
            dic = {'文件名': file, '证券简称': name, '证券代码': code}
            # open the document's file object
            fp = open('pdf/' + file, 'rb')
            # create a parser tied to the document
            parser = PDFParser(fp)
            # the PDF document object
            doc = PDFDocument()
            # link the parser and the document object
            parser.set_document(doc)
            doc.set_parser(parser)
            # initialize the document (empty password)
            doc.initialize('')
            # create a PDF resource manager
            resource = PDFResourceManager()
            # layout analysis parameters
            las = LAParams()
            # create an aggregator
            device = PDFPageAggregator(resource, laparams=las)
            # create a PDF page interpreter
            interpreter = PDFPageInterpreter(resource, device)
            # use the document object to get the collection of pages

            # accumulated PDF text
            pdf_text = ''
            for page in doc.get_pages():
                # this page's text
                page_text = ''
                # read the page with the page interpreter
                interpreter.process_page(page)
                # get the content from the aggregator
                layout = device.get_result()
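The listing is cut off here; a minimal sketch of how such a loop typically finishes, accumulating each page's text (an assumption based on the surrounding code, not part of the original):

                for out in layout:
                    if hasattr(out, 'get_text'):
                        page_text += out.get_text()
                pdf_text += page_text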
Example #22
0
def get_text_box(pdf_path):
    """
    :return: a list of boxes per page, in the shape below, where "region"
    is the list of coordinates of the text blocks and "media_box" is the
    size of each page, with origin x0=0, y0=0:
    [
      { #page 1
        "region": [
            {
                "cordinate": [x0, y0, x1, y1],
                "text": "this is text"
            },
            {
                "cordinate": [x0, y0, x1, y1],
                "text": "this is text"
            }
        ],
        "media_box": [x1, y1]
      },
      { #page 2
        "region": [
            {
                "cordinate": [x0, y0, x1, y1],
                "text": "this is text"
            }
        ],
        "media_box": [x1, y1]
      }
    ]
    """
    fp = open(pdf_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    list_all_box = []
    # loop over all pages in the document
    for page in PDFPage.create_pages(document):
        list_item = {}
        interpreter.process_page(page)
        layout = device.get_result()
        media_texbox = (int(page.mediabox[2]), int(page.mediabox[3]))
        MEDIA_Y1 = int(page.mediabox[3])
        sub_box = parse_obj(layout._objs, MEDIA_Y1)
        list_item['region'] = sub_box
        list_item['media_box'] = media_texbox
        list_all_box.append(list_item)

    return list_all_box
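parse_obj is not shown in this listing; a minimal sketch of what it plausibly does, collecting text boxes and flipping pdfminer's bottom-left origin to a top-left one via MEDIA_Y1 (the name and exact behavior are assumptions):

from pdfminer.layout import LTTextBox

def parse_obj(objs, media_y1):
    region = []
    for obj in objs:
        if isinstance(obj, LTTextBox):
            x0, y0, x1, y1 = obj.bbox
            region.append({
                # flip the y axis: pdfminer's origin is the bottom-left corner
                'cordinate': [x0, media_y1 - y1, x1, media_y1 - y0],
                'text': obj.get_text(),
            })
    return region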
Example #23
0
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.layout import LAParams
from AIDataConverter import AIDataConverter
from LegalDocMLconverter import LegalDocMLconverter
from XMLconverter import XMLConverter
from PDFMinerconverter import PDFMinerConverter

input_path = "../sample/NZBC-G4#3.4_13.pdf"
output_path = "../output/output.xml"
output_path2 = "../output/sample.xml"

img_output_path = "../output/pdfminer_page1.jpg"

outfp = open(output_path, "wb")
with open(input_path, 'rb') as in_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    #device = AIDataConverter(rsrcmgr, outfp, laparams=LAParams())
    device = LegalDocMLconverter(rsrcmgr, outfp, laparams=LAParams())
    #device = XMLConverter(rsrcmgr, outfp, laparams=LAParams())
    #device = PDFMinerConverter(rsrcmgr, outfp, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
    #device.draw_layout(input_path, img_output_path)
    device.close()
outfp.close()
Example #24
0
# imports assumed by this example:
from io import StringIO
import re
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


def convert_to_text(fname, pages=None):
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    for page in PDFPage.get_pages(infile, pagenums):
        interpreter.process_page(page)
    infile.close()
    converter.close()
    text = output.getvalue()
    output.close()

    text_list = text.split('\n')
    txt = text_list[:3]
    text = ' '.join(text_list[3:])

    print("###################")
    print(txt)

    ## splitting the string into words

    word_list = text.split(' ')
    string_input = ""
    flag = 0
    for word in word_list:
        # print("*********")
        # print(word)

        if (word.lower() == 'tran'):
            break
        else:
            if (word.lower() == 'customer' or word.lower() == 'scheme'
                    or word.lower() == 'currency' or word.lower() == 'for'):
                word = '\n' + word

            elif (word.lower() == 'statement'):
                word = '\n' + word
                flag = 1
            elif (word.lower() == 'account' and flag == 1):
                word = '\n' + word

        string_input += word + " "
    print("::::::::::::::::::::::")
    # print(string_input)

    file_name = fname.split('/')[-1]
    file_name = file_name.split('.')[0]
    # print(file_name)

    # write Content to .txt
    text_file = open(
        "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/txt/output_"
        + file_name + ".txt", "w")
    text = re.sub("\s\s+", " ", text)

    text_file.write("%s" % text)
    text_file.close()
    file_name_main = "output_" + file_name + ".csv"
    csv_file = open(
        "/home/dell/Documents/Aha-Loans/Production/PdfAnalyser/output/csv/" +
        file_name_main, "w")
    text = re.sub("\s\s+", " ", string_input)
    csv_file.write("%s" % string_input)
    csv_file.close()
    length_lines = len(string_input.split('\n'))
    # print("-----------",length_lines)
    convert_to_table(fname, string_input, txt)
Example #25
0
def parse_pdf(pdfFile, debug, stats, current_task=None):
    stats[0] = 0
    stats[1] = 0
    reader = PdfReader(pdfFile)

    if debug:
        debugFile = os.path.splitext(pdfFile)[0] + "-debug.pdf"
        pdf_canvas = canvas.Canvas(debugFile)
        form = pdf_canvas.acroForm
        pdf_canvas.setStrokeColor(black)
        pdf_canvas.setLineWidth(0.1)
        colors = [black, red, green, blue]

    pages = []
    for page_layout in extract_pages(pdf_file=pdfFile,
                                     laparams=LAParams(line_margin=0,
                                                       char_margin=0.5)):
        pageNum = page_layout.pageid
        pageWidth = page_layout.bbox[2]
        pageHeight = page_layout.bbox[3]

        if current_task is not None:
            current_task.update_state(state='PROGRESS',
                                      meta="Extracting page " + str(pageNum) +
                                      " of " + str(len(reader.pages)))

        # extract texts and lines
        textLines = []
        lines = []
        for element in page_layout:
            extract_elements(textLines, lines, element)

        # process lines
        merge_lines(lines)
        split_lines(lines)
        extract_line_features(lines, textLines, pageHeight)

        # match fields
        if pageNum > len(reader.pages):
            return pages
        pdf_page = reader.pages[pageNum - 1]
        if pdf_page.Annots and len(pdf_page.Annots) > 0:
            match_fields(pdf_page, lines, stats)
        pages.append(lines)

        # dump debugging info
        if debug:
            pdf_canvas.setPageSize((pageWidth, pageHeight))

            # dump text lines
            for textLine in textLines:
                left = textLine.Position.Left
                right = textLine.Position.Right
                top = textLine.Position.Top
                bottom = textLine.Position.Bottom
                form.textfield(value=textLine.Text,
                               x=left,
                               y=bottom,
                               width=right - left,
                               height=top - bottom,
                               borderWidth=0,
                               fontSize=7)

            # dump lines
            ci = 0
            i = 1
            for line in lines:
                pdf_canvas.setStrokeColor(colors[ci])
                ci = (ci + 1) % len(colors)
                pdf_canvas.setLineWidth(line.LineWidth)
                pdf_canvas.rect(line.Position.Left, line.Position.Bottom,
                                line.Position.Right - line.Position.Left,
                                line.Position.Top - line.Position.Bottom)
                # dump fields
                if line.IsHorizontal:
                    value = "line_" + str(i)
                    if line.IsMarkupField:
                        value = line.FieldCode
                    form.textfield(value=value,
                                   x=line.FieldPosition.Left,
                                   y=line.FieldPosition.Bottom,
                                   width=line.FieldPosition.Right -
                                   line.FieldPosition.Left,
                                   height=line.FieldPosition.Top -
                                   line.FieldPosition.Bottom,
                                   borderWidth=0,
                                   fontSize=7)
                    i += 1
            pdf_canvas.showPage()

    if debug:
        pdf_canvas.save()

    return pages
Example #26
0
def main(argv):
	for arg in argv[1:]:
		fd = open(arg, 'rb')
		parser = PDFParser(fd)
		document = PDFDocument(parser)
		if not document.is_extractable:
			print("Document not extractable.")
			return 1
	
		params = LAParams(char_margin=1)
		resMan = PDFResourceManager(caching=True)
		device = PDFPageAggregator(resMan, laparams=params)
		interpreter = PDFPageInterpreter(resMan, device)
		parser = x86ManParser("html", params)
	
		i = 1
		for page in PDFPage.get_pages(fd, set(), caching=True, check_extractable=True):
			print("Processing page %i" % i)
			interpreter.process_page(page)
			page = device.get_result()
			parser.process_page(page)
			i += 1
		parser.flush()
		fd.close()

		# RG: We get the parse in just one file: html/AAA.html
		# Looks like the original repo does not create all the separate pages for all the
		# instructions nor the index.html.
		# So it is just one big parse in a single file, named after the first instruction.
		# Later he pulled the individual pages back out of it by searching for <h3>
		# and adding the closing tags, and likewise built an index.html and added
		# his own style.css.
		# NOTE_: we are getting 3 sorts of Description: <p>, <table> and <svg>.
		# On his website it is only <p>, so he clearly did further post-processing,
		# such as adding <pre>, <h2>, etc.
		# So this is only a rough parse to get the text and tables out.
		# Given the issues (e.g. problems with footnotes in MOV), it is probably better
		# to work with the html that pdf2txt produces. Not as clean, but it causes
		# fewer problems.
		# Looking at the results of all the pdf2html conversions, it seems this is not
		# so easy to do programmatically after all. zneak/felix did his best and
		# generates clean html, but it still contains many errors (see the issues).
		# pdf2txt produces a neat layout, but tables often go wrong and need a lot of
		# post-processing. It is comparable to pdf2music: sometimes it works, but
		# usually it does not look good and you are better off doing everything by
		# hand. A lot of work, but it gives the best result.
		# pdf2txt uses spans for tables, by the way. Ugly.
		# DONE: checked out pdftohtml from Xpdf. This produces the best looking pages. But also no real
		# tables. It uses a png file as background for the tables and then lays everything out with
		# absolutely positioned divs. For exact positioning that seems the way to go. But also slight
		# mistakes in the table layout. Faster (C++, Qt) and better than pdfminer.six.
		# But no real tables is faking it...
		# NOTE_: at autoclose we are getting a mismatch (th strong) when parsing the full vol2a.pdf.
		# Something goes wrong.
		# Figures are extracted as svg but often look warped (e.g. Figure 3-18 and 3-19 at HADDPS).
		# PDF parsing is like unscrambling scrambled eggs...
		# DONE: checked out pdf2htmlEX. It creates perfectly looking html 5 pages. It can be done!
		# It is fast and puts everything in one html page.
		# TODO_: check out https://github.com/fmalina/unilex-transcript which promises to create
		# clean (semantic) html from pdf2htmlEX output.

		# NOTE_: "Conversion result" always prints 0/0 because we never reach the code
		# where success and fail are incremented, so they are dead variables.
		print("Conversion result: %i/%i" % (parser.success, parser.success + parser.fail))
Example #27
0
def get_text_from_pdf(pdfname, limit=1000):
    if pdfname == '':
        return ''
    else:
        # open the PDF file to process; return '' if it cannot be opened
        try:
            fp = open(pdfname, 'rb')
        except OSError:
            return ''

    # extract the text from the PDF
    rsrcmgr = PDFResourceManager()
    out_fp = StringIO()
    la_params = LAParams()
    la_params.detect_vertical = True
    device = TextConverter(rsrcmgr, out_fp, codec='utf-8', laparams=la_params)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp,
                                  pagenos=None,
                                  maxpages=0,
                                  password=None,
                                  caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)
    text = out_fp.getvalue()
    fp.close()
    device.close()
    out_fp.close()

    # split on newlines
    #lines = text.splitlines()
    lines = []
    lines.append(text)

    outputs = []
    output = ""

    # UTF-8 bytes to strip out
    replace_strs = [b'\x00']

    is_blank_line = False

    # loop over the split lines
    for line in lines:

        # convert to a byte string
        line_utf8 = line.encode('utf-8')

        # remove the unwanted bytes
        for replace_str in replace_strs:
            line_utf8 = line_utf8.replace(replace_str, b'')

        # convert back to str
        line = line_utf8.decode()

        # collapse runs of spaces into a single space
        line = re.sub("[ ]+", " ", line)

        # strip leading and trailing whitespace
        line = line.strip()
        #print("aft:[" + line + "]")

        # ignore blank lines
        if len(line) == 0:
            is_blank_line = True
            continue

        # ignore lines that are only a number
        if is_float(line):
            continue

        # ignore single-word lines that do not end with a period
        if len(line.split(" ")) == 1 and not line.endswith("."):
            continue

        # at a sentence break
        if is_blank_line or output.endswith("."):
            # once the output exceeds limit characters, cut it off here
            if len(output) > limit:
                outputs.append(output)
                output = ""
            else:
                output += "\r\n"
        # if this line continues from the previous (hyphenated) line
        elif not is_blank_line and output.endswith("-"):
            output = output[:-1]
        # otherwise insert a space as a word break
        else:
            output += " "

        #print("[" + str(line) + "]")
        output += str(line)
        is_blank_line = False

    outputs.append(output)
    outputs.append('\n')
    return outputs
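is_float is referenced above but not shown; a minimal sketch of the assumed helper:

def is_float(s):
    try:
        float(s)
        return True
    except ValueError:
        return False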
Example #28
0
def cas_pdf_to_text(filename: Union[str, io.IOBase],
                    password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech)
    :param password: CAS pdf password
    :return: array of lines from the CAS.
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif hasattr(filename, "read") and hasattr(filename,
                                               "close"):  # file-like object
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect:
            raise IncorrectPasswordError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        line_margin = {
            FileType.KFINTECH: 0.1,
            FileType.CAMS: 0.2
        }.get(detect_pdf_source(document), 0.2)

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []

        investor_info = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                for el in filter(lambda x: isinstance(x, LTTextBoxVertical),
                                 layout):
                    if re.search("CAMSCASWS", el.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", el.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)
Example #29
0
    'This program replaces the citation links inside a PDF, which just jump '
    'to a page, with links to the ADS abstract')
parser.add_argument('input', help='The input pdf file')
parser.add_argument('output', help='The processed output pdf file')

args = parser.parse_args()

inputPDFDocName = args.input
outputPDFDocName = args.output

# Standard recipe
document = open(inputPDFDocName, 'rb')
#Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

parser = PDFParser(document)
doc = PDFDocument(parser)

# Get links and their positions, and put that info into custom objects
curPage = 0
documentParsed = {}
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
    # get the pageid order in the document
    pageObjIds.append(page.pageid)
    curPage = getPageNumWithPageObj(page)
Example #30
0
def get_pdf_file_content(path_to_pdf):
    '''
    path_to_pdf: the parameter that gives access to the PDF file
    whose content we want to extract.
    '''
    # PDFResourceManager stores shared resources, such as fonts or images,
    # that we might encounter in the files.
    resource_manager = PDFResourceManager(caching=True)

    # Create a string buffer that will hold the final text representation
    # of the PDF.
    out_text = StringIO()

    # UTF-8 is one of the most commonly used encodings, and Python often
    # defaults to it; we specify it here to avoid some encoding errors.
    codec = 'utf-8'

    # LAParams holds the layout-analysis parameters, with sensible defaults.
    laParams = LAParams()

    # Create a TextConverter object, taking the resource manager, the
    # output buffer and the layout parameters.
    text_converter = TextConverter(resource_manager,
                                   out_text,
                                   laparams=laParams)
    fp = open(path_to_pdf, 'rb')

    # Create a PDF interpreter object from the resource manager and the
    # text converter.
    interpreter = PDFPageInterpreter(resource_manager, text_converter)

    # Process the content of each page of the original PDF file.
    for page in PDFPage.get_pages(fp,
                                  pagenos=set(),
                                  maxpages=0,
                                  password="",
                                  caching=True,
                                  check_extractable=True):
        interpreter.process_page(page)

    # The entire contents of the "file" can be retrieved at any time
    # before the StringIO object's close() method is called.
    text = out_text.getvalue()

    # Close all the resources we previously opened.
    fp.close()
    text_converter.close()
    out_text.close()

    # Return the full text of the PDF.
    return text
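Hypothetical usage (the path is an assumption):

if __name__ == '__main__':
    content = get_pdf_file_content('sample.pdf')
    print(content[:500])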