def get_kondate_from_pdf(dir, year, month):
    """Parse the monthly PDF and return the kondate entries of all its pages.

    The PDF is read from ``<dir>/PDFData/<year>/<MM>.pdf`` where ``MM`` is
    the month padded to two digits.

    :param dir: base directory that contains the ``PDFData`` folder
    :param year: year used in the file path; also forwarded to
        ``get_kondate_from_parsed_data``
    :param month: month used for the file name
    :return: list holding the entries of every page, in page order
    :raises FileNotFoundError: if no PDF exists for the given year and month
    """
    file_path = (dir + "/PDFData/" + str(year) + "/"
                 + str(month).zfill(2) + ".pdf")
    # Check for the file before creating any pdfminer object so that the
    # error path leaves nothing behind that would need closing.
    if not os.path.exists(file_path):
        raise FileNotFoundError("指定された年月のPDFファイルが存在しません.")

    # Objects required to analyse the PDF layout.
    resource_manager = PDFResourceManager()
    layout_params = LAParams()
    layout_params.detect_vertical = True
    device = PDFPageAggregator(resource_manager, laparams=layout_params)

    kondate_data_all = []
    try:
        # Open the PDF and process it page by page.
        with open(file_path, 'rb') as fp:
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.get_pages(fp,
                                          maxpages=0,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
                text_boxes = find_textbox_recursively(device.get_result())
                # Descending y1 first, then ascending x0.
                text_boxes.sort(key=lambda b: (-b.y1, b.x0))

                parsed_data = parse_textboxes(text_boxes)
                kondate_data_all.extend(
                    get_kondate_from_parsed_data(year, parsed_data))
    finally:
        # Close the device even when a page fails to parse.
        device.close()

    return kondate_data_all
Example #2
0
def outputText(inputPDFFile, outputTXTFile):
    """Extract the text of a PDF and write it to a UTF-8 text file.

    Every top-level layout object of every page is passed to
    ``checkLtFigure`` together with the 1-based page number and a shared
    buffer list. Two newlines are appended to the buffer after each page,
    and the joined buffer is written to the output file once all pages
    have been processed.

    :param inputPDFFile: path of the PDF to read
    :param outputTXTFile: path of the text file to write
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Let the layout analysis detect vertical writing as well.
    laparams.detect_vertical = True
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    charBuf = []
    try:
        with open(inputPDFFile, 'rb') as fp:
            document = PDFDocument(PDFParser(fp), "")
            for pageNum, page in enumerate(PDFPage.create_pages(document), 1):
                interpreter.process_page(page)
                for item in device.get_result():
                    checkLtFigure(item, pageNum, charBuf)
                # Page separator.
                charBuf.append("\n\n")
    finally:
        # Runs on errors too, so neither the PDF nor the device stays open.
        device.close()

    with open(outputTXTFile, 'wt', encoding='UTF-8') as wfp:
        wfp.write(''.join(charBuf))
def get_layout(url, pages=None):
    """
    Return the pdfminer layout of the requested pages of a pdf file.

    The layout is an object of pdfminer corresponding to the tree structure
    of a pdf. More information about the layout here:
    http://www.unixuser.org/~euske/python/pdfminer/programming.html

    :param url: path (str) of the pdf file to be analysed
    :param pages: list (int) of pages of which you want the layout. Beware
        that the first page of the pdf corresponds to number 0, even if its
        id is 1. When omitted or empty, an empty set is handed to pdfminer.
    :return layouts: List of layouts (One layout per page).
    """
    pagenums = set(pages) if pages else set()

    # Set parameters for analysis.
    manager = PDFResourceManager()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(manager, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)
    layouts = []
    try:
        with open(url, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenos=pagenums):
                interpreter.process_page(page)
                layouts.append(device.get_result())
    finally:
        # Close the device even if a page could not be processed.
        device.close()

    return layouts
Example #4
0
File: runbot.py Project: er1k1/NLP
def read_pdf_by_line(pdfpath):
    """Run ``parse_layout`` over every page of a PDF and return its results.

    ``parse_layout`` receives the page layout, the ``listout`` value
    returned for the previous page (initially an empty list) and one list
    object that is shared by all pages.

    :param pdfpath: path of the PDF file
    :return: tuple ``(listout, lines)`` as returned by the last
        ``parse_layout`` call; for a PDF without pages, two empty lists
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    listout = []
    line_list = []
    # Bound up front so that a PDF with zero pages returns a value instead
    # of failing with UnboundLocalError at the return statement.
    lines = line_list

    try:
        # The with-block closes the file, which was previously left open.
        with open(pdfpath, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
                listout, lines = parse_layout(device.get_result(), listout,
                                              line_list)
    finally:
        device.close()

    return listout, lines
def pdf_to_text(pdfname):
    """Return the text of all top-level text objects of a PDF.

    The text of every ``LTTextBox``, ``LTTextLine`` or ``LTText`` object is
    prefixed with a newline and concatenated; finally each double newline
    is replaced by a single one.

    :param pdfname: path of the PDF file
    :return: the extracted text as one string
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Collect the pieces and join once instead of growing a string in a loop.
    chunks = []
    try:
        with open(pdfname, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            # Process each page contained in the document.
            for page in doc.get_pages():
                interpreter.process_page(page)
                for lt_obj in device.get_result():
                    if isinstance(lt_obj, (LTTextBox, LTTextLine, LTText)):
                        chunks.append('\n' + lt_obj.get_text())
    finally:
        # Reached on errors as well, so file and device are always released.
        device.close()

    return ''.join(chunks).replace('\n\n', '\n')
Example #6
0
def get_layout(path):
    '''Return the pdfminer layout of every page of the PDF at *path*.

    :param path: path of the PDF file
    :return: list with one ``device.get_result()`` object per page, in page
        order
    '''
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    layout = []
    try:
        # open() replaces the file() builtin, which only exists on Python 2.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
                layout.append(device.get_result())
    finally:
        # Close the device even if a page could not be processed.
        device.close()

    return layout
def get_layout(url, pages=None):
    """
    Collect the pdfminer layouts of a pdf file, one per processed page.

    The layout is an object of pdfminer corresponding to the tree structure
    of a pdf. More information about the layout here:
    http://www.unixuser.org/~euske/python/pdfminer/programming.html

    :param url: path (str) of the pdf file to be analysed
    :param pages: list (int) of pages of which you want the layout. Beware
        that the first page of the pdf corresponds to number 0, even if its
        id is 1. ``None`` or an empty list results in an empty page filter.
    :return layouts: List of layouts (One layout per page).
    """
    if pages:
        pagenums = set(pages)
    else:
        pagenums = set()

    # Set parameters for analysis.
    laparams = LAParams()
    manager = PDFResourceManager()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(manager, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, device)
    layouts = []
    try:
        with open(url, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenos=pagenums):
                interpreter.process_page(page)
                layouts.append(device.get_result())
    finally:
        # Guarantee the device is closed when processing a page raises.
        device.close()

    return layouts
Example #8
0
def extract_pdf(path, languages=None):
    """ Extract content from a PDF file using pdfminer.

    The last entry of the document info is copied into the result
    (lower-cased, stripped keys; values passed through ``safe_text``), and
    each page layout is converted by ``_convert_page(layout, languages)``.

    :param path: path of the PDF file
    :param languages: forwarded unchanged to ``_convert_page``
    :return: dict with a ``'pages'`` list plus the metadata entries; for a
        document that is not extractable the ``'pages'`` list stays empty
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            doc = PDFDocument(PDFParser(fh), '')
            result = {'pages': []}
            if doc.info:
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    # Never let metadata overwrite the page list.
                    if k != 'pages':
                        result[k] = safe_text(v)

            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                text = _convert_page(device.get_result(), languages)
                result['pages'].append(text)
            return result
        finally:
            # Also reached on the early return above and on exceptions.
            device.close()
Example #9
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    Metadata from the last document info entry is copied into the result
    and every page is converted by ``_convert_page``. If pdfminer hits an
    unexpected end of file (``PSEOF``), whatever has been collected up to
    that point is returned.

    :param path: path of the PDF file
    :param languages: forwarded unchanged to ``_convert_page``
    :return: dict with a ``"pages"`` list plus the metadata entries
    """
    result = {"pages": []}
    with open(path, "rb") as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            doc = PDFDocument(PDFParser(fh), "")

            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    v = string_value(v)
                    # Skip missing values and unresolved object references.
                    if k != "pages" and v is not None \
                            and "<PDFObjRef:" not in v:
                        # NOTE(review): the value is converted a second
                        # time, as before; presumably string_value is
                        # idempotent - confirm before simplifying.
                        result[k] = string_value(v)

            for number, page in enumerate(PDFPage.create_pages(doc), 1):
                result["pages"].append(
                    _convert_page(interpreter, page, device, number, path,
                                  languages))
        except PSEOF as eof:
            log.info("Unexpected EOF: %r", eof)
        finally:
            # Close the device on success, on PSEOF and on any other error.
            device.close()
    return result
Example #10
0
def convert(path, ignore=(), multipage=()):
    """Convert the pages of a PDF into slides and write them to slides.js.

    Output goes next to the PDF: images into ``images``, slide files into
    ``slides`` (created if missing), and the slide list into ``slides.js``
    as a ``slides=JSON.parse(...)`` statement.

    :param path: path of the PDF file
    :param ignore: 0-based page numbers to leave out
    :param multipage: 0-based page numbers to convert with
        ``convert_multipage``; this takes precedence over ``ignore``
    """
    base_dir = os.path.dirname(path)

    def slides_dir(fname):
        # Path of ``fname`` inside the slides output directory.
        return os.path.join(base_dir, 'slides', fname)

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    imagewriter = ImageWriter(os.path.join(base_dir, 'images'))
    slides = []
    try:
        with open(path, 'rb') as pdf_file:
            if not os.path.exists(slides_dir("")):
                os.makedirs(slides_dir(""))
            for page_number, page in enumerate(PDFPage.get_pages(pdf_file)):
                interpreter.process_page(page)
                layout = device.get_result()
                if page_number in multipage:
                    slides += convert_multipage(page, layout, imagewriter,
                                                str(page_number), slides_dir)
                elif page_number not in ignore:
                    slides.append(
                        convert_page(page, layout, imagewriter,
                                     str(page_number), slides_dir))
    finally:
        # Release the device (and, via the with-block, the PDF) on errors.
        device.close()

    with open(os.path.join(base_dir, "slides.js"), "w") as js_file:
        # Double dumps: the JSON text is embedded as a JS string literal.
        js_file.write("slides=JSON.parse(" + json.dumps(json.dumps(slides))
                      + ")")
Example #11
0
def parsePDF(pdfFile):
    """Return the text of every horizontal text box of a PDF.

    :param pdfFile: path of the PDF file
    :return: list of strings that starts with a ``'.'`` entry followed by
        the text of each ``LTTextBoxHorizontal``; ``None`` if the document
        does not allow text extraction
    """
    # Text list; it starts with a '.' entry.
    textlist = ['.']

    # Open in binary read mode. The with-block closes the file on every exit
    # path, including the early return below.
    with open(pdfFile, 'rb') as fp:
        # Create a PDF parser from the file object.
        praser = PDFParser(fp)
        # Create a PDF document and connect it with the parser.
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)

        # Initialise the document; no password is supplied.
        doc.initialize()

        # Give up if the document does not allow text extraction.
        if not doc.is_extractable:
            return None

        # Resource manager for shared resources, aggregating device and
        # page interpreter.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # doc.get_pages() yields the pages; handle one page at a time.
            for page in doc.get_pages():
                interpreter.process_page(page)
                # The result is the layout of this page; only its
                # horizontal text boxes are of interest here.
                for x in device.get_result():
                    if isinstance(x, LTTextBoxHorizontal):
                        textlist.append(x.get_text())
        finally:
            device.close()

    return textlist
Example #12
0
def parse_pdf(path, pages=134):
    """Read manifests from a PDF, page by page, and append them to Manifests.

    For each page the BOL is read with ``get_BOL_from_elements``. A page
    whose BOL differs from the module-level ``LastBOL`` starts a new
    ``Manifest``; a page with the same BOL only updates TEUs and weight of
    the current manifest. A manifest is finalised, printed and appended to
    the module-level ``Manifests`` list once its TEUs are greater than 0.

    Any exception is caught and its traceback printed; nothing is returned.

    :param path: path of the PDF file
    :param pages: maximum number of pages to read (pdfminer ``maxpages``)
    """
    global LastBOL

    try:
        # Create PDF parser objects.
        rsrcmanager = PDFResourceManager()
        PDFPageAgg = PDFPageAggregator(rsrcmanager, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmanager, PDFPageAgg)
        try:
            with open(path, 'rb') as pdf_file:
                page_iter = PDFPage.get_pages(pdf_file,
                                              maxpages=pages,
                                              password="",
                                              caching=True)
                for n, page in enumerate(page_iter, start=1):
                    print('###### page ' + str(n))
                    interpreter.process_page(page)
                    # Receive the LTPage object for the page.
                    LTPage_layout = PDFPageAgg.get_result()
                    myBOL = get_BOL_from_elements(LTPage_layout)
                    if not myBOL:
                        print("Empty Page " + "\n")
                        continue

                    if LastBOL == myBOL:
                        myTEUs, myWeight = get_TEUs_from_elements(
                            LTPage_layout)
                        if myTEUs == 0:
                            print("Continue same BOL: " + myBOL + "\n")
                            continue
                        # NOTE(review): relies on myManifest from an earlier
                        # page of this call; if the first page already
                        # matches LastBOL this raises and ends up in the
                        # except branch below.
                        myManifest.TEUs = myTEUs
                        myManifest.WeightORG_Tonne = myWeight
                    else:
                        myManifest = Manifest()
                        myManifest.BOL = myBOL
                        get_text_from_elements(LTPage_layout, myManifest)

                    if myManifest.TEUs > 0:
                        myManifest.SetCalculatedValues()
                        myManifest.myPrint()
                        Manifests.append(myManifest.get_list())
                        print("-------------------------------------")
                    else:
                        print("Continue same BOL: " + myBOL + "\n")
                    LastBOL = myBOL
        finally:
            # The exception is swallowed below, so without this the file and
            # the aggregator would stay open after a failure.
            PDFPageAgg.close()
    except Exception:
        traceback.print_exc()
Example #13
0
def parse(file_path):
    """Extract the horizontal text boxes of a PDF and hand them to file2txt.

    The destination path is derived from ``file_path`` by swapping
    ``folder_path`` for ``txt_folder_path`` and replacing "pdf" with "txt".
    Nothing is done when the destination already exists and is larger than
    1024 bytes. Errors during extraction are printed, and whatever text has
    been collected so far is still written.

    :param file_path: path of the source file
    """
    # NOTE(review): the second replace() touches every "pdf" substring of
    # the path, not only the extension - confirm this is intended.
    dest_path = file_path.replace(folder_path, txt_folder_path)
    dest_path = dest_path.replace("pdf", "txt")
    already_done = (os.path.exists(dest_path)
                    and os.path.getsize(dest_path) > 1024)
    if already_done:
        print("文件[{}]已存在\n".format(dest_path))
        return

    collected = []
    extension = file_path.split(".")[-1]
    # Only pdf input is read; for anything else an empty list is written.
    if extension == 'pdf':
        # Open in binary read mode.
        with open(file_path, 'rb') as fp:
            # Parser and document, connected in one direction for now.
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            # Resource manager for shared resources and aggregating device.
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            try:
                doc.set_parser(parser)
                # Initialise the document; no password is supplied.
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # Refuse documents that do not allow text extraction.
                if not doc.is_extractable:
                    raise PDFTextExtractionNotAllowed

                # Handle one page at a time.
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # The result is the layout of the page; only its
                    # horizontal text boxes are used.
                    for element in device.get_result():
                        if isinstance(element, LTTextBoxHorizontal):
                            text = element.get_text()
                            collected.append(text)
                            print(text)
            except Exception as exc:
                print("文件{},{}\n".format(file_path, exc))
            finally:
                device.close()

    file2txt(dest_path, collected)
Example #14
0
 def get_layouts_to_analyse(self): #<class 'pdfminer.layout.LTPage'>.
     """Return the pdfminer layout of every page of the PDF at ``self.path``.

     :return: list with one ``device.get_result()`` object per page
     """
     layouts = []
     rsrcmgr = PDFResourceManager()
     # An aggregating device is used, so layout objects are collected.
     device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     try:
         # open() replaces the file() builtin, which only exists on Python 2.
         with open(self.path, 'rb') as fp:
             for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                 interpreter.process_page(page)
                 layouts.append(device.get_result())
     finally:
         # Close the device even if a page could not be processed.
         device.close()
     return layouts
Example #15
0
    def convert_pdf_2_text(self, path):
        """Group the text of each page by ``y0`` and return the groups.

        On every page, layout objects that provide text are grouped by
        their ``y0`` coordinate (formatted as a string). Within a group the
        texts are ordered by ``x0``; the groups of a page are ordered by
        descending ``y0``. The groups of all pages are concatenated in page
        order.

        :param path: path of the PDF file
        :return: list of ``(y0_as_str, [text, ...])`` tuples
        """
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layout_list = []
        try:
            # The with-block closes the file, which was previously left open.
            with open(path, 'rb') as fp:
                document = PDFDocument(PDFParser(fp))
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    rows = defaultdict(list)
                    for item in device.get_result():
                        try:
                            text = item.get_text().strip()
                            axis = "%s" % (item.__dict__['y0'])
                            left = item.__dict__['x0']
                        except Exception:
                            # Best effort, as before: objects that cannot
                            # supply text and coordinates are skipped.
                            continue
                        rows[axis].append((left, text))

                    page_rows = []
                    for axis, cells in rows.items():
                        # Order by x0 only; ties keep their layout order.
                        cells.sort(key=lambda cell: cell[0])
                        page_rows.append((axis, [text for _, text in cells]))
                    page_rows.sort(key=lambda row: -float(row[0]))
                    layout_list.extend(page_rows)
        finally:
            device.close()

        return layout_list
Example #16
0
 def Process_task(self, _file_dir, _csv_file, debug=False):
     """Write one CSV row (name, distance, loss) per PDF found under a folder.

     Every file below ``_file_dir`` whose name ends with ``.pdf`` is parsed.
     The text boxes of its last processed page are searched for two marker
     entries; the second-to-last line of the entry following each marker is
     taken as distance and loss respectively.

     :param _file_dir: directory that is walked recursively
     :param _csv_file: path of the CSV file to write
     :param debug: when true, each written row is also printed
     :raises ValueError: if a marker entry is missing from a PDF
     """
     with open(_csv_file, 'w', newline='') as csvfile:
         skywriter = csv.writer(csvfile, dialect='excel')
         for root, _dirs, files in os.walk(_file_dir):
             for name in files:
                 if not name.endswith('.pdf'):
                     continue
                 logger.info('Deal with file : ' + name + '.')
                 files_path = os.path.join(root, name)
                 # Bound up front so a PDF without pages fails with the
                 # documented ValueError instead of a NameError.
                 all_text = []
                 # Open the pdf in binary read mode.
                 with open(files_path, 'rb') as fp:
                     # Create parser and document and connect them.
                     praser = PDFParser(fp)
                     doc = PDFDocument()
                     praser.set_document(doc)
                     doc.set_parser(praser)
                     # Resource manager, device and interpreter.
                     rsrcmgr = PDFResourceManager()
                     device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
                     interpreter = PDFPageInterpreter(rsrcmgr, device)
                     try:
                         for page in doc.get_pages():
                             # Reset per page, as before: only the text of
                             # the last processed page is kept.
                             all_text = []
                             interpreter.process_page(page)
                             for x in device.get_result():
                                 if isinstance(x, LTTextBoxHorizontal):
                                     try:
                                         all_text.append(x.get_text())
                                     except AttributeError:
                                         continue
                     finally:
                         # Runs on errors too, so file and device never
                         # stay open.
                         device.close()
                 distance_index = all_text.index('距离距离\n')
                 loss_index = all_text.index('激光器 nm\n1310\n')
                 distance = (all_text[distance_index + 1].split('\n'))[-2]
                 loss = (all_text[loss_index + 1].split('\n'))[-2]
                 fname = name.replace('Fiber', '').replace('_1310OE.sor.pdf', '')
                 skywriter.writerow([fname, distance, loss])
                 if debug:
                     print(fname, distance, loss)
def convert_article(path):
    """Collect title, author, abstract and year of a PDF article.

    Title and author come from the PDF metadata (reduced to ASCII
    characters), the abstract from ``get_abstract`` on the first page, and
    the date from the first value that ``get_date`` finds on any page.

    NOTE(review): the metadata handling calls ``.decode`` on the filtered
    string, i.e. it is written for Python 2 byte strings.

    :param path: path of the PDF file
    :return: dict with the keys ``Title``, ``Author``, ``Abstract`` and
        ``Date``; ``Date`` is the ``year`` of the first detected date, or
        ``''`` when no date was found
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    metadata = {'Title': '', 'Author': '', 'Abstract': '', 'Date': ''}
    dates = []
    try:
        # open() replaces the file() builtin, which only exists on Python 2.
        with open(path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument(parser)
            parser.set_document(doc)

            # Get title and author from pdf metadata.
            for meta in doc.info:
                try:
                    # Only ASCII characters (code points 1..127).
                    title = ''.join(
                        c for c in meta['Title'] if 0 < ord(c) < 128)
                    author = ''.join(
                        c for c in meta['Author'] if 0 < ord(c) < 128)
                    title = title.decode(default_encoding)
                    author = author.decode(default_encoding)
                    metadata = {"Title": title, "Author": author,
                                "Abstract": '', "Date": ''}
                except Exception:
                    # Best effort: an info entry that lacks a field or
                    # cannot be decoded is skipped.
                    continue

            # Parse pdf file.
            for index, page in enumerate(
                    PDFPage.get_pages(fp, set(), maxpages=0, caching=True,
                                      check_extractable=True)):
                interpreter.process_page(page)
                pdfdata = device.get_result()
                dates.extend(get_date(pdfdata))

                # Abstract should be published on first page.
                if index == 0:
                    metadata["Abstract"] = get_abstract(pdfdata)
    finally:
        device.close()

    if dates:
        # First detected date.
        metadata["Date"] = dates[0].year
    return metadata
Example #18
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. If a page yields no text, or fewer than three characters,
    the page is sent through ``_extract_image_page`` instead.

    :param path: path of the PDF file
    :param languages: forwarded to ``_extract_image_page``
    :return: dict with a ``"pages"`` list plus metadata entries, or
        ``None`` when the file has no ``/Root`` object
    """
    with open(path, "rb") as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            try:
                doc = PDFDocument(PDFParser(fh), "")
            except PDFSyntaxError as pse:
                # str() instead of ``.message``: exceptions have no
                # ``message`` attribute on Python 3.
                if "No /Root object!" in str(pse):
                    log.info("Invalid PDF file: %r", path)
                    return None
                raise

            result = {"pages": []}
            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    # Never let metadata overwrite the page list.
                    if k != "pages":
                        result[k] = safe_text(v)

            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result

            for number, page in enumerate(PDFPage.create_pages(doc), 1):
                text = None
                try:
                    interpreter.process_page(page)
                    text = _convert_page(device.get_result(), path)
                except Exception as ex:
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.debug("Defaulting to OCR: %r, pg. %s", path, number)
                    text = _extract_image_page(path, number, languages)
                result["pages"].append(text)
            return result
        finally:
            # Reached on every return above and on exceptions.
            device.close()
Example #19
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    This will attempt to use pdfminer to extract textual content from
    each page. A page whose text is missing or shorter than three
    characters is handed to ``_extract_image_page`` instead.

    :param path: path of the PDF file
    :param languages: forwarded to ``_extract_image_page``
    :return: dict with a ``'pages'`` list plus metadata entries, or
        ``None`` when the file has no ``/Root`` object
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            parser = PDFParser(fh)
            try:
                doc = PDFDocument(parser, '')
            except PDFSyntaxError as pse:
                # ``pse.message`` is unavailable on Python 3; the string
                # form of the exception carries the same text.
                if 'No /Root object!' in str(pse):
                    log.info("Invalid PDF file: %r", path)
                    return None
                raise

            result = {'pages': []}
            if len(doc.info):
                for k, v in doc.info[-1].items():
                    k = k.lower().strip()
                    # The 'pages' key is reserved for the page list.
                    if k != 'pages':
                        result[k] = safe_text(v)

            if not doc.is_extractable:
                log.warning("PDF not extractable: %s", path)
                return result

            for i, page in enumerate(PDFPage.create_pages(doc)):
                text = None
                try:
                    interpreter.process_page(page)
                    layout = device.get_result()
                    text = _convert_page(layout, path)
                except Exception as ex:
                    log.warning("Failed to parse PDF page: %r", ex)

                if text is None or len(text) < 3:
                    log.debug("Defaulting to OCR: %r, pg. %s", path, i + 1)
                    text = _extract_image_page(path, i + 1, languages)
                result['pages'].append(text)
            return result
        finally:
            # Close the device on all exit paths, not only the last one.
            device.close()
Example #20
0
    def _pdf_to_text(self, pdf_path, text_path):
        """
        This method does the actual text extraction. It uses PdfMiner Python
        library to do the extraction.

        The text of all ``LTTextBox`` / ``LTTextLine`` objects is
        concatenated and written, stripped, to ``text_path``.

        :param pdf_path: path to the input PDF
        :param text_path: path to the output text
        :return: True if text was extracted and stored, False if the
            extraction failed or produced no text
        """
        text = ''
        num_pages = 0
        doc = PDFDocument()
        res_mgr = PDFResourceManager()

        device = PDFPageAggregator(res_mgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(res_mgr, device)

        try:
            with open(pdf_path, 'rb') as fp:
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize('')
                # Collect the pieces and join once instead of growing a
                # string inside the loop.
                chunks = []
                for page in doc.get_pages():
                    self._logger.debug(
                        'Processing page {}'.format(num_pages + 1))
                    interpreter.process_page(page)
                    for lt_obj in device.get_result():
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            chunks.append(lt_obj.get_text())
                    num_pages += 1
                text = ''.join(chunks)
                self._logger.info('Done, extracted {} pages'.format(num_pages))
                self._logger.debug('Storing result in {}'.format(text_path))
                with open(text_path, 'w') as text_fp:
                    text_fp.write(text.strip())
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit must not
            # be reported as a failed extraction.
            self._logger.warning(
                'Extracting text from {} failed'.format(pdf_path))
            return False
        finally:
            # close resources before exiting
            device.close()
        # A real bool, as documented, rather than the text length.
        return bool(text)
def parseAllPagesWithImages(pdfPath, outputImageFolder):
    """Extract images and text of every page of a PDF into a folder.

    Each page is processed twice: once with an aggregating device whose
    layout is passed to ``parseLayout`` (together with a running image
    counter), and once with a ``TextConverter``. After each page the text
    buffer is written to
    ``<outputImageFolder>/Paper<name>Page<n>.txt`` where ``<name>`` is
    taken from ``pdfPath`` by a regular expression.

    :param pdfPath: path of the PDF; it has to match ``/.+/(.+)\\.pdf``
    :param outputImageFolder: folder that receives the output files
    """
    resourceMgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    textDevice = TextConverter(resourceMgr,
                               retstr,
                               codec='utf-8',
                               laparams=laparams)
    device = PDFPageAggregator(resourceMgr, laparams=laparams)
    generalInterp = PDFPageInterpreter(resourceMgr, device)
    textInterp = PDFPageInterpreter(resourceMgr, textDevice)
    imageCount = 1

    try:
        # Raw string: "\." in a plain literal is an invalid escape sequence.
        # NOTE(review): a path that does not match still fails with
        # AttributeError on the next line, as before.
        match = re.search(r'/.+/(.+)\.pdf', pdfPath)
        paperName = match.group(1)

        with open(pdfPath, 'rb') as fp:
            pages = PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True)
            for i, page in enumerate(pages):
                generalInterp.process_page(page)
                textInterp.process_page(page)
                imageCount = parseLayout(device.get_result(),
                                         outputImageFolder, imageCount)
                # NOTE(review): the buffer is never reset, so the file of
                # page n holds the text of pages 1..n - confirm whether
                # per-page text was intended.
                text = retstr.getvalue()
                txtPath = ('' + outputImageFolder + '/Paper' + paperName +
                           'Page' + str(i + 1) + '.txt')
                with open(txtPath, 'w') as txtFile:
                    txtFile.write(text)
    finally:
        # Release everything even when a page fails.
        device.close()
        textDevice.close()
        retstr.close()
Example #22
0
def ReadPDF(pdf_path, password=''):
    """Extract the text content of a PDF.

    :param pdf_path: path of the PDF file
    :param password: password used to initialise the document (empty
        string if there is none)
    :return: list with the text of every horizontal text box; if an
        ``IOError`` occurs, the texts collected up to that point (an empty
        list when the file cannot be opened)
    :raises PDFTextExtractionNotAllowed: if the document does not allow
        text extraction
    """
    # Bound before the try so the IOError path below has something to
    # return instead of failing with UnboundLocalError.
    text_content = []
    try:
        with open(pdf_path, 'rb') as fp:
            parser = PDFParser(fp)
            document = PDFDocument()

            # Connect the parser and document objects
            parser.set_document(document)
            document.set_parser(parser)

            # Supply the password for initialization (or empty string)
            document.initialize(password)

            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # process each page contained in the document
                for page in document.get_pages():
                    interpreter.process_page(page)
                    for layo in device.get_result():
                        if isinstance(layo, LTTextBoxHorizontal):
                            text_content.append(layo.get_text())
            finally:
                device.close()
    except IOError:
        # Best effort, as before: I/O problems are ignored and whatever was
        # collected is returned.
        pass
    return text_content
Example #23
0
def extract_pdf(fp):
    """
    Extract text from PDF file.

    :param fp: file object of the PDF, handed to ``PDFParser``
    :return: the texts of all top-level ``LTTextBox`` / ``LTTextLine``
        objects of all pages, joined by blank lines
    """
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []  # one string per text object, across all pages
    try:
        for page in PDFPage.create_pages(PDFDocument(PDFParser(fp))):
            interpreter.process_page(page)
            # The layout is iterated for its child objects.
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    # Fetch the text once; decode it if it is not a str.
                    text = lt_obj.get_text()
                    if not isinstance(text, str):
                        text = text.decode()
                    text_content.append(text)
    finally:
        # Close the device even if a page could not be processed.
        device.close()
    return '\n\n'.join(text_content)
Example #24
0
def extract_pdf(fp):
    """
    Extract text from PDF file.

    :param fp: file object of the PDF, handed to ``PDFParser``
    :return: one string in which the text of every top-level
        ``LTTextBox`` / ``LTTextLine`` object is separated by a blank line
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []  # a list of strings, one per extracted text object
    try:
        document = PDFDocument(PDFParser(fp))
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:  # extract text from text objects
                if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    continue
                # One get_text() call per object instead of up to three.
                text = lt_obj.get_text()
                text_content.append(
                    text if isinstance(text, str) else text.decode())
    finally:
        # The device is released on errors as well.
        device.close()
    return '\n\n'.join(text_content)
Example #25
0
def get_higaijokyo_content():
    """Download the PDF at ``generate_url()`` and print its text row by row.

    The PDF is saved to a file named ``data`` in the current working
    directory and then parsed with pdfminer.  Consecutive horizontal text
    boxes that share the same top edge (``y1``) are joined with commas and
    printed as one line.

    Returns:
        None.  If the download raises ``HTTPError`` a message is printed and
        the function returns early.
    """
    url = generate_url()
    rsrcmgr = PDFResourceManager()

    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        response = urllib.request.urlopen(req)
    except HTTPError:
        print('PDFがありません')
        return

    # Close the HTTP response and the local copy deterministically.
    with response, open('data', 'wb') as output:
        output.write(response.read())

    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        with open('data', 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          pagenos=None,
                                          maxpages=0,
                                          password=None,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
                layout = device.get_result()
                prev_node = None
                str_line = ''
                for node in layout:
                    if not isinstance(node, LTTextBoxHorizontal):
                        continue
                    if prev_node and prev_node.y1 == node.y1:
                        # Same top edge as the previous box: same output row.
                        str_line += ',' + node.get_text().strip()
                    else:
                        # A new row starts: emit the one collected so far
                        # (an empty string before the first row of a page).
                        print(str_line)
                        str_line = node.get_text().strip()
                    prev_node = node
                # Flush the final row of the page; without this the last
                # collected row was silently dropped.
                if prev_node is not None:
                    print(str_line)
    finally:
        # Release the device even when parsing fails part-way through.
        device.close()
Example #26
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    Text is pulled from each page with pdfminer via ``_convert_page``.  When
    a page fails to parse, or yields fewer than three characters, the page is
    handed to ``_extract_image_page`` (OCR) instead.

    Returns a dict with a ``'pages'`` list (one entry per page) plus any
    usable entries of the document's last info dictionary, keyed by the
    lower-cased, stripped field name.
    """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        doc = PDFDocument(PDFParser(fh), '')

        result = {'pages': []}
        if len(doc.info):
            for key, raw in doc.info[-1].items():
                key = key.lower().strip()
                value = string_value(raw)
                # Skip the reserved key, empty values and unresolved refs.
                if key == 'pages' or value is None or '<PDFObjRef:' in value:
                    continue
                result[key] = string_value(value)

        for page_no, page in enumerate(PDFPage.create_pages(doc), start=1):
            text = None
            try:
                interpreter.process_page(page)
                text = _convert_page(device.get_result(), path)
            except Exception as ex:
                # Best effort: a broken page falls through to OCR below.
                log.warning("Failed to parse PDF page: %r", ex)

            if text is None or len(text) < 3:
                log.info("OCR: %r, pg. %s", path, page_no)
                text = _extract_image_page(path, page_no, languages)
            result['pages'].append(text)
        device.close()
        return result
Example #27
0
def extract_pdf(path, languages=None):
    """
    Extract content from a PDF file.

    Each page is first run through pdfminer; if that raises or produces
    fewer than three characters of text, the page is sent through OCR via
    ``_extract_image_page``.  The result dict carries the per-page text under
    ``'pages'`` and selected document-info fields under their lower-cased
    names.
    """
    with open(path, 'rb') as fh:
        resources = PDFResourceManager()
        params = LAParams()
        aggregator = PDFPageAggregator(resources, laparams=params)
        interp = PDFPageInterpreter(resources, aggregator)
        document = PDFDocument(PDFParser(fh), '')

        result = {'pages': []}
        info_dicts = document.info
        if len(info_dicts) > 0:
            # Only the last info dictionary is consulted.
            for k, v in info_dicts[-1].items():
                k = k.lower().strip()
                v = string_value(v)
                keep = k != 'pages' and v is not None and '<PDFObjRef:' not in v
                if keep:
                    result[k] = string_value(v)

        page_number = 0
        for page in PDFPage.create_pages(document):
            page_number += 1
            try:
                interp.process_page(page)
                layout = aggregator.get_result()
                text = _convert_page(layout, path)
            except Exception as ex:
                log.warning("Failed to parse PDF page: %r", ex)
                text = None

            # Too little text: treat the page as an image and OCR it.
            needs_ocr = text is None or len(text) < 3
            if needs_ocr:
                log.info("OCR: %r, pg. %s", path, page_number)
                text = _extract_image_page(path, page_number, languages)
            result['pages'].append(text)
        aggregator.close()
        return result
Example #28
0
def to_text(path):
    """Wrapper around pdfminer.

    Every page is interpreted twice: once by a ``TextConverter`` that
    accumulates plain text, and once by a ``PDFPageAggregator`` whose layout
    is passed to ``content_from_layout``.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A ``(text, objects)`` tuple: the whole text as the first value and,
        as the second, a list of ``(content_from_layout(layout), page_index)``
        pairs where ``page_index`` is zero-based.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = False
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=laparams)
        deviceLayout = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        interpreterLayout = PDFPageInterpreter(rsrcmgr, deviceLayout)
        objects = []
        try:
            # The context manager closes the file even if a page fails.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True)
                for page_n, page in enumerate(pages):
                    interpreter.process_page(page)
                    interpreterLayout.process_page(page)
                    layout = deviceLayout.get_result()
                    objects.append((content_from_layout(layout), page_n))
        finally:
            # Devices are closed before the buffer is read, as before.
            device.close()
            deviceLayout.close()
        return retstr.getvalue(), objects
    finally:
        retstr.close()
Example #29
0
    # NOTE(review): this is the body of a per-page loop whose header is not
    # part of this snippet; `page`, `layout`, `page_count`, `pdf`, `font_size`
    # are defined there.  Python 2 syntax (print statements).
    print str(page.doc._cached_objs)
    # Accumulate the text lines and images reported for every layout object.
    data2 = []
    images = []
    for obj in layout:
        sub_data, sub_images = parse_lt_objs(obj, page_count)
        data2 += sub_data
        images += sub_images

    print "Page " + str(page_count) + ", images: " + str(len(images))
    print " got return: \n" + "\n".join(data2)

    # Start a new output page headed by the source page counter.
    pdf.add_page()
    pdf.write(14, "PAGE: " + str(page_count))
    pdf.ln(2)

    # Filtering is currently disabled; the lines are written unmodified.
    new_page = data2  # filter_lines(data2)

    # print "Page " + str(page_count)
    # print "Result: " + "\n".join(new_page)
    for line in new_page:
        pdf.write(font_size, line)
        pdf.ln(font_size / 2)
    page_count += 1
    # Stop the enclosing loop once the counter reaches 3.
    if page_count == 3:
        break

device.close()

# Write the assembled document to the requested output file, then release
# the input file handle.
pdf.output(options.out_file, "F")
fp.close()
Example #30
0
def text_from_pdf(pdf_path, authors):
    """Extract text and author affiliations from the first page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.
        authors: Iterable of author records; element ``[1]`` of each record
            is passed to ``get_last_name``.

    Returns:
        ``(extracted_text, affiliations)`` where ``extracted_text`` holds the
        cleaned text of each text object (one per line) and ``affiliations``
        maps a last name to the affiliation text found for it.  ``None`` is
        returned when the document has no pages.

    Side effects:
        Prints diagnostics, removes ``working/temp`` if it exists and writes
        the extracted text to the externally defined ``temp_path``.
    """
    author_lastnames = set()
    alternatives = []
    for author in authors:
        lastname = get_last_name(author[1])
        print("Lastname:", lastname)
        author_lastnames.add(lastname)
        # Each name may be followed by optional whitespace and a comma.
        alternatives.append(lastname + r"\s*,?\s*")
    author_lastname_pattern = "(" + "|".join(alternatives) + ")"
    print("Author lastname pattern:", author_lastname_pattern)

    text_parts = []
    affiliations = dict()
    with open(pdf_path, "rb") as infp:
        document = PDFDocument(PDFParser(infp))
        rsrcmgr = PDFResourceManager(caching=True)
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # NOTE(review): the previous code returned from inside the page
            # loop, so only the first page was ever read.  That behaviour is
            # kept (now explicit); confirm whether all pages were intended.
            first_page = next(iter(PDFPage.create_pages(document)), None)
            if first_page is None:
                return None
            interpreter.process_page(first_page)
            for lt_obj in device.get_result():
                if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    continue
                # Keep only letters, digits, whitespace, '.' and '@'.
                tmp_text = re.sub(r"[^A-Za-z0-9 \s\.@]", "",
                                  lt_obj.get_text())
                matched = [name for name in author_lastnames
                           if re.search(name, tmp_text)]
                author_count = len(matched)
                if author_count > 0:
                    print("Author text block: ", tmp_text)
                    # The names matched author_count times, then the rest.
                    tmp_pattern = (author_lastname_pattern + "{" +
                                   str(author_count) + "}(.*)$")
                    affiliation_block = re.search(tmp_pattern, tmp_text,
                                                  re.DOTALL)
                    if affiliation_block:
                        print("Groups: ", affiliation_block.groups())
                        for lastname in matched:
                            if lastname in affiliations:
                                # First affiliation found for a name wins.
                                continue
                            if len(affiliation_block.groups()) > author_count:
                                affiliations[lastname] = re.sub(
                                    r"\s+", " ",
                                    affiliation_block.group(author_count + 1))
                            else:
                                affiliations[lastname] = re.sub(
                                    r"\s+", " ", tmp_text)
                text_parts.append(tmp_text + "\n")
        finally:
            # Release the device even if parsing raised.
            device.close()

    extracted_text = "".join(text_parts)
    if os.path.exists("working/temp"):
        os.remove("working/temp")
    with open(temp_path, "w", encoding="utf-8") as outfp:
        outfp.write(extracted_text)
    return (extracted_text, affiliations)
Example #31
0
def get_pdf_rows(data, miner_layout=True):
    """
    Takes raw PDF file content (bytes) and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    If the document raises ``PDFSyntaxError`` while being opened, nothing is
    yielded.  When ``LOGGER`` is enabled for the ``DEBUGFILES`` level, PNG
    renderings of each intermediate step are written to a fresh temporary
    directory.

    :param data: PDF file content, wrapped in ``BytesIO`` for parsing
    :param miner_layout: when true, pdfminer's layout analysis (``LAParams``)
        is used; otherwise texts are sorted by ``(y0, x0)`` here
    :raises ImportError: if pdfminer is not installed

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # Newer pdfminer releases moved PDFDocument/PDFPage to their own modules.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    # Evaluated once so the debug helpers below are guaranteed to exist
    # whenever the debug branches run.
    debug_files = LOGGER.isEnabledFor(DEBUGFILES)
    if debug_files:
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

        def random_color():
            # Light random colour so outlines stay readable on white.
            return (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))

        def new_canvas(page):
            # Blank white image sized from the page's media box.
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            return img, ImageDraw.Draw(img)

        def save_canvas(img, template, npage):
            # Save one debug rendering and log where it went.
            fpath = template % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

    try:
        for npage, page in enumerate(pages):
            LOGGER.debug('processing page %s', npage)
            interpreter.process_page(page)
            page_layout = device.get_result()

            # Flatten in one pass instead of the quadratic sum(list, []).
            texts = [
                text
                for obj in page_layout._objs
                if isinstance(obj, (LTTextBox, LTTextLine, LTChar))
                for text in lttext_to_multilines(obj, page_layout)
            ]
            LOGGER.debug('found %d text objects', len(texts))
            if debug_files:
                img, draw = new_canvas(page)
                for t in texts:
                    color = random_color()
                    draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                    draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
                save_canvas(img, '%s/1text-%03d.png', npage)

            if not miner_layout:
                texts.sort(key=lambda t: (t.y0, t.x0))

            # TODO filter ltcurves that are not lines?
            # TODO convert rects to 4 lines?
            lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
            LOGGER.debug('found %d lines', len(lines))
            if debug_files:
                img, draw = new_canvas(page)
                for l in lines:
                    color = random_color()
                    draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
                save_canvas(img, '%s/2lines-%03d.png', npage)

            lines = list(uniq_lines(lines))
            LOGGER.debug('found %d unique lines', len(lines))

            rows = build_rows(lines)
            LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
            if debug_files:
                img, draw = new_canvas(page)
                for r in rows:
                    for b in r:
                        color = random_color()
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                save_canvas(img, '%s/3rows-%03d.png', npage)

            textrows = arrange_texts_in_rows(rows, texts)
            LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
            if debug_files:
                img, draw = new_canvas(page)
                for row, trow in zip(rows, textrows):
                    for b, tlines in zip(row, trow):
                        color = random_color()
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                        draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
                save_canvas(img, '%s/4cells-%03d.png', npage)

            yield textrows
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early by a caller that stops iterating.
        device.close()
Example #32
0
def get_pdf_rows(data, miner_layout=True):
    """
    Takes raw PDF file content (bytes) and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    If the document raises ``PDFSyntaxError`` while being opened, nothing is
    yielded.

    :param data: PDF file content, wrapped in ``BytesIO`` for parsing
    :param miner_layout: when true, pdfminer's layout analysis (``LAParams``)
        is used; otherwise texts are sorted by ``(y0, x0)`` here
    :raises ImportError: if pdfminer is not installed

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # Newer pdfminer releases moved PDFDocument/PDFPage to their own modules.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    try:
        for npage, page in enumerate(pages):
            interpreter.process_page(page)
            page_layout = device.get_result()

            # Flatten in one pass instead of the quadratic sum(list, []).
            texts = [
                text
                for obj in page_layout._objs
                if isinstance(obj, (LTTextBox, LTTextLine, LTChar))
                for text in lttext_to_multilines(obj, page_layout)
            ]
            if not miner_layout:
                texts.sort(key=lambda t: (t.y0, t.x0))

            lines = list(
                uniq_lines(
                    lt_to_coords(obj, page_layout) for obj in page_layout._objs
                    if isinstance(obj, (LTRect, LTLine))))

            boxes = build_rows(lines)
            textrows = arrange_texts_in_rows(boxes, texts)

            yield textrows
    finally:
        # Runs on exhaustion, on error, and when the generator is closed
        # early by a caller that stops iterating.
        device.close()
Example #33
0
def convert_pdf_to_txt_layout(path, allowed_areas):
    """
    Converts PDF to text using the pdfminer library.

    For every page, regions covered by rectangles, figures and the
    horizontal/vertical segments of curves are collected as "ignored" areas
    (presumably tables and images).  Horizontal text boxes that lie inside
    one of ``allowed_areas`` and do not intersect an ignored region are kept.

    :param path: path of the PDF file to read
    :param allowed_areas: iterable of boxes passed to ``box_outside``; a text
        box is emitted once per area it is not outside of
    :return: the kept text boxes, one per line, with control characters
        removed and inner newlines replaced by spaces
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(line_margin=0.2)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Control characters 1-30 except newline (10) are stripped from the text.
    non_printable_re = re.compile(
        u"(?:%s)" % u"|".join(chr(i) for i in range(1, 31) if i != 10))
    txt = []

    try:
        # open() instead of the Python 2-only file() builtin; the context
        # manager closes the handle even when a page fails to parse.
        with open(path, "rb") as file_handle:
            for page in PDFPage.get_pages(
                file_handle, set(), maxpages=0, password="", caching=True,
                check_extractable=True
            ):
                interpreter.process_page(page)
                layout = device.get_result()
                ignore = []

                # Identify areas to ignore: Tables and Images
                for obj in layout:
                    # Objects reaching outside the page are not considered.
                    if (obj.x0 < 0 or obj.y0 < 0
                            or obj.x1 > page.mediabox[2]
                            or obj.y1 > page.mediabox[3]):
                        continue
                    # Exact type checks are deliberate: isinstance() would
                    # also match subclasses of these layout classes.
                    if type(obj) is LTRect or type(obj) is LTFigure:
                        ignore = get_bound(ignore, obj.bbox)
                    if type(obj) is LTCurve:
                        # Consider only horizontal or vertical lines
                        x0, y0 = obj._pts[0]
                        for x, y in obj._pts[1:]:
                            if x == x0 or y == y0:
                                ignore = get_bound(ignore, (x0, y0, x, y))
                            x0, y0 = x, y

                # Gather text from non ignored regions
                for area in allowed_areas:
                    for obj in layout:
                        # Print only Horizontal Text Boxes
                        if type(obj) is not LTTextBoxHorizontal:
                            continue

                        # Checks if object is within allowed areas
                        if box_outside(area, obj.bbox):
                            continue

                        # Checks if object is within ignored areas
                        if any(box_intercepts(obj.bbox, region)
                               for region in ignore):
                            continue

                        obj_txt = non_printable_re.sub("", obj.get_text())
                        obj_txt = obj_txt.strip()

                        txt.append(u" ".join(obj_txt.split("\n")))
    finally:
        device.close()

    return u"\n".join(txt)
Example #34
0
    # Cover and index pages ignored.
    # NOTE(review): this is the body of a per-page loop whose header is not
    # part of this snippet; `pN`, `page`, `col_count`, `threshold_rate` and
    # `pages` are defined there.
    if pN > 1:
        interpreter.process_page(page)
        layout = device.get_result()
        # print(pN)
        header_threshold = 0  # round(height * threshold_rate)
        # Page size taken from the media box, rounded to whole units.
        width = round(page.mediabox[2])
        height = round(page.mediabox[3])
        col_width_divider = width / col_count
        footer_threshold = round(height * (1 - threshold_rate))
        footer_threshold_2 = round(height * (1 - threshold_rate * 2))
        text_boxes = process_layout(layout, pN)
        pages.append(text_boxes)

fp.close()
device.close()
del pagenos

################################################################################

print("started to process: ", pdf_file_name)

bottom_items = []
sizes = []

# `height` here is whatever the last processed page left behind; it is
# unbound if no page satisfied pN > 1.
top = height * 0.1
bottom = height - 100  # height * 0.85

# LAYOUT ANALYSIS
for page in pages:
    for tb in page:
Example #35
0
def extract_text_from_pdf(pdf_path):
    """Scan the text elements of a PDF and group them into per-block dicts.

    Only pages whose zero-based index is between 32 and 922 (inclusive) are
    read.  Text elements are consumed in layout order by a small state
    machine: header elements ("NOTE", "PART NUMBER", "PART NAME", "QTY",
    "SEE") select which column list subsequent elements are appended to, and
    an element starting with "<END>" closes the current block.

    While a block is open it lives under the key ``"Metadata_<counter>"``.
    When it is closed, the collected "Part Numbers" / "QTYs" / "PART NAMEs"
    columns are zipped into a "Parts/Components" mapping (if their lengths
    agree) and removed; if a "Super" entry starting with "S/N" exists, the
    block is re-keyed under the "Super" entry that precedes it.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The dictionary of blocks.  It always contains at least the current
        (possibly empty) ``"Metadata_<counter>"`` entry.

    NOTE(review): closing a block indexes the three column keys and "Super"
    directly, so a block that never saw one of those headers raises KeyError;
    ``split()[0]`` raises IndexError for whitespace-only text.
    """
    new_dict = {}  #To store extracted data as key, value pairs
    lines = []  #To store data alternatively as list
    counter = 1  #increments when <END> of block is reached
    a = []  #dummy array to append elements of any section

    #Reset switches for data
    # `table` is true while any column header is active; the remaining flags
    # say which column the shared list `a` currently belongs to.
    table = False
    partNo = False
    notes = False
    qty = False
    partname = False
    see = False
    ending = True
    new_dict["Metadata_%d" % counter] = {}

    #PDF Miner Objects
    resource_manager = PDFResourceManager()
    # fake_file_handle and codec are not used by the aggregator below; the
    # handle is only closed at the end.
    fake_file_handle = io.StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    converter = PDFPageAggregator(resource_manager, laparams=laparams)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    # Counts processed text elements; never read afterwards.
    w = 0

    with open(pdf_path, 'rb') as fh:
        #text_from_pdf = open('text2FromPdf.txt','w')
        for pageNumber, page in enumerate(
                PDFPage.get_pages(fh, caching=True, check_extractable=True)):
            # Hard-coded page window (zero-based indices 32..922).
            if pageNumber > 31 and pageNumber < 923:
                page_interpreter.process_page(page)
                layout = converter.get_result()
                for element in layout:
                    if isinstance(element, LTTextBox) or isinstance(
                            element, LTTextLine):
                        # NOTE(review): extend() with a string adds its
                        # characters one by one; append() may have been meant.
                        lines.extend(element.get_text().strip())

                        # --- end-of-block marker: finalise current block ---
                        if (element.get_text().split()[0] == "<END>"):
                            #print(element.get_text().strip())
                            a = []
                            table = False
                            #see=False
                            #notes=False
                            #partNo=False
                            #qty=False
                            #partname=False
                            ending = True
                            # The slices below skip leading entries of each
                            # column (presumably header cells - confirm) and
                            # trim QTYs / PART NAMEs to the part-number count.
                            pno = new_dict["Metadata_%d" %
                                           counter]["Part Numbers"][
                                               1:]  #Delete??,check last iter
                            #new_dict["Metadata_%d" %counter]["Part Numbers"]=pno
                            check = len(pno)
                            q = new_dict["Metadata_%d" %
                                         counter]["QTYs"][1:check + 1]
                            #new_dict["Metadata_%d" %counter]["QTYs"]=q
                            pname = new_dict["Metadata_%d" %
                                             counter]["PART NAMEs"][3:check +
                                                                    3]
                            #new_dict["Metadata_%d" %counter]["PART NAMEs"]=pname
                            # Only build the mapping when the three columns
                            # line up; otherwise the block gets no
                            # "Parts/Components" entry.
                            if (len(pno) == len(q) and len(pno) == len(pname)):
                                new_dict3 = {
                                    i: {
                                        "q": j,
                                        "p/n": k
                                    }
                                    for i, j, k in zip(pname, q, pno)
                                }
                                #print(new_dict3)
                                new_dict[
                                    "Metadata_%d" %
                                    counter]["Parts/Components"] = new_dict3

                            #Delete table columns
                            #del new_dict["Metadata_%d" %counter]["Notes"]
                            del new_dict["Metadata_%d" %
                                         counter]["Part Numbers"]
                            del new_dict["Metadata_%d" % counter]["QTYs"]
                            del new_dict["Metadata_%d" % counter]["PART NAMEs"]

                            #replace metadata_counter with system name

                            super_list = new_dict["Metadata_%d" %
                                                  counter]["Super"]
                            # Positions of "Super" entries starting with
                            # "PART" and with "S/N" respectively.
                            sup_idx = [
                                i for i, item in enumerate(super_list)
                                if re.search('^PART', item)
                            ]
                            sn_idx = [
                                i for i, item in enumerate(super_list)
                                if re.search('^S/N', item)
                            ]

                            if sup_idx:
                                sup_idx = sup_idx[0]
                                #print(sup_idx)
                                #print(new_dict["Metadata_%d" %counter]["Super"][sup_idx])
                                new_dict["Metadata_%d" %
                                         counter]["Top"] = new_dict[
                                             "Metadata_%d" %
                                             counter]["Super"][sup_idx]
                            if sn_idx:
                                sn_idx = sn_idx[0]
                                #print(sn_idx)
                                # The entry just before the S/N line becomes
                                # the block's new key (index -1 wraps to the
                                # last entry when S/N is first).
                                this_idx = sn_idx - 1
                                new_dict["Metadata_%d" %
                                         counter]["Serial_No"] = new_dict[
                                             "Metadata_%d" %
                                             counter]["Super"][sn_idx]
                                newkey = new_dict["Metadata_%d" %
                                                  counter]["Super"][this_idx]
                                new_dict["Metadata_%d" %
                                         counter]["Component"] = newkey
                                # Re-key the block; an existing entry under
                                # the same key is overwritten.
                                new_dict[newkey] = new_dict["Metadata_%d" %
                                                            counter]
                                del new_dict["Metadata_%d" % counter]["Super"]
                                del new_dict["Metadata_%d" % counter]

                            # Open a fresh, empty block for what follows.
                            counter = counter + 1
                            new_dict["Metadata_%d" % counter] = {}

                        # --- column headers: switch the active column and
                        # start a new shared list `a` ---
                        elif (element.get_text().strip()) == "NOTE":

                            a = []
                            table = True
                            notes = True
                            partNo = False
                            qty = False
                            partname = False
                            see = False
                            ending = False

                        elif (element.get_text().split()[0]) == "PART" and len(
                                element.get_text().split()) > 1:
                            # NOTE(review): unlike the other headers, these two
                            # branches do not clear every other column flag
                            # (e.g. partname stays set after "PART NUMBER").
                            if (element.get_text().split()[1]) == "NUMBER":
                                a = []
                                partNo = True
                                table = True
                                notes = False
                                qty = False
                                see = False
                                ending = False
                            elif (element.get_text().split()[1]) == "NAME":
                                a = []
                                partname = True
                                table = True
                                notes = False
                                qty = False
                                see = False
                                ending = False

                        elif (element.get_text().strip().split()[0]) == "QTY":
                            a = []
                            qty = True
                            partNo = False
                            notes = False
                            partname = False
                            see = False
                            table = True
                            ending = False

                        #elif(element.get_text().strip())=="PART NAME":
                        # a=[];
                        #partname=True
                        #partNo=False
                        #notes=False
                        #qty=False
                        #see=False

                        elif (element.get_text().strip().split()[0]) == "SEE":
                            #print(element.get_text().strip().split()[0])
                            a = []
                            see = True
                            partNo = False
                            notes = False
                            qty = False
                            partname = False
                            table = True
                            ending = False

                        # --- store the element in whichever list is active;
                        # the header element itself is stored too ---
                        # Outside a table, text accumulates under "Super".
                        if table == False and element.get_text().split(
                        )[0] != "<END>":
                            a.append(element.get_text().strip())
                            new_dict["Metadata_%d" % counter]["Super"] = a
                        w = w + 1
                        # Notes are collected in `a` but not stored anywhere.
                        if notes and table:
                            a.extend(element.get_text().strip().split('\n'))
                            #new_dict["Metadata_%d" %counter]["Notes"]=a
                        if partNo and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" %
                                     counter]["Part Numbers"] = a
                        if qty and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" % counter]["QTYs"] = a
                        if partname and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" % counter]["PART NAMEs"] = a
                        # Unlike the columns above, this appends the split
                        # list as one nested element and is not stored.
                        if see and table:
                            a.append(element.get_text().strip().split('\n'))
                            #new_dict["Metadata_%d" %counter]["SEE PAGE"]=a

    #close open handles
    converter.close()
    fake_file_handle.close()

    if new_dict:
        return new_dict
Example #36
0
def get_pdf_rows(data, miner_layout=True):
    """
    Take PDF file content and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    This is a generator: nothing is parsed until iteration starts. If the
    document cannot be parsed (PDFSyntaxError), the generator ends without
    yielding anything. The pdfminer device is closed when the generator
    finishes, is closed early, or fails while processing a page.

    When LOGGER is enabled for the DEBUGFILES level, a PNG rendering of each
    intermediate stage (texts, lines, rows, cells) is saved for every page in a
    freshly created temporary directory.

    :param data: content of the PDF file (it is wrapped in a BytesIO)
    :param miner_layout: if true, the page aggregator is created with pdfminer's
        LAParams layout analysis; otherwise no layout parameters are given and
        the extracted texts are sorted by (y0, x0) here
    :raises ImportError: if pdfminer cannot be imported

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # Newer pdfminer releases moved PDFDocument and introduced PDFPage; fall
    # back to the old location when those modules are missing.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    # Everything that uses the device lives inside try/finally so the device
    # is released even when the consumer stops iterating early or a page
    # fails to process.
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        if newapi:
            pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
        else:
            doc.initialize()
            pages = doc.get_pages()

        # Evaluated once, so the temp directory and the drawing helpers are
        # guaranteed to exist whenever a debug image is about to be written.
        dump_debug = LOGGER.isEnabledFor(DEBUGFILES)
        if dump_debug:
            import tempfile
            import PIL.Image as Image
            import PIL.ImageDraw as ImageDraw
            import random

            path = tempfile.mkdtemp(prefix='pdf')

            def random_color():
                # Light random colour so neighbouring outlines can be told apart.
                return (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))

            def new_canvas(page):
                # White image sized from the page media box, plus its drawing handle.
                img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
                return img, ImageDraw.Draw(img)

            def save_canvas(img, stem, npage):
                # Write the image as '<path>/<stem>-<page number>.png' and log it.
                fpath = '%s/%s-%03d.png' % (path, stem, npage)
                img.save(fpath)
                LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        for npage, page in enumerate(pages):
            LOGGER.debug('processing page %s', npage)
            interpreter.process_page(page)
            page_layout = device.get_result()

            # Flatten the text objects of the page into one list of lines.
            texts = [
                text
                for obj in page_layout._objs
                if isinstance(obj, (LTTextBox, LTTextLine, LTChar))
                for text in lttext_to_multilines(obj, page_layout)
            ]
            LOGGER.debug('found %d text objects', len(texts))
            if dump_debug:
                img, draw = new_canvas(page)
                for t in texts:
                    color = random_color()
                    draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                    draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
                save_canvas(img, '1text', npage)

            if not miner_layout:
                texts.sort(key=lambda t: (t.y0, t.x0))

            # TODO filter ltcurves that are not lines?
            # TODO convert rects to 4 lines?
            lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
            LOGGER.debug('found %d lines', len(lines))
            if dump_debug:
                img, draw = new_canvas(page)
                for l in lines:
                    color = random_color()
                    draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
                save_canvas(img, '2lines', npage)

            lines = list(uniq_lines(lines))
            LOGGER.debug('found %d unique lines', len(lines))

            rows = build_rows(lines)
            LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
            if dump_debug:
                img, draw = new_canvas(page)
                for r in rows:
                    for b in r:
                        color = random_color()
                        # Inset by one pixel so adjacent boxes do not overdraw each other.
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                save_canvas(img, '3rows', npage)

            textrows = arrange_texts_in_rows(rows, texts)
            LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
            if dump_debug:
                img, draw = new_canvas(page)
                for row, trow in zip(rows, textrows):
                    for b, tlines in zip(row, trow):
                        color = random_color()
                        draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                        draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
                save_canvas(img, '4cells', npage)

            yield textrows
    finally:
        device.close()
Example #37
0
class StatementReader:
    """
    Reader Class of a pdf statement

    The reader keeps the statement file open together with the pdfminer
    device/interpreter pair used to lay out its pages. Release them with
    close_statement(), or use the reader as a context manager.

    Attributes set by the constructor:
      start_balance    -- None until the first 'BALANCE BROUGHT FORWARD' is parsed
      transaction_list -- list of transaction dicts, filled by
                          get_statement_details() / get_transaction_details()
      f                -- the statement file, opened in binary mode
      device           -- pdfminer PDFPageAggregator
      interpreter      -- pdfminer PDFPageInterpreter bound to that device
    """
    def __init__(self, file: Union[str, Path]):
        self.start_balance = None
        self.transaction_list = []
        self.f = open(file, 'rb')
        try:
            resource_manager = PDFResourceManager()
            params = LAParams()
            self.device = PDFPageAggregator(resource_manager,
                                            laparams=params)
            self.interpreter = PDFPageInterpreter(resource_manager,
                                                  self.device)
        except Exception:
            # Do not leave the file handle dangling if pdfminer setup fails.
            self.f.close()
            raise

    def __enter__(self):
        """
        Enter a ``with`` block
        :return: the reader itself
        """
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """
        Leave a ``with`` block: close the statement, never swallow exceptions
        """
        self.close_statement()

    def read_statement(self):
        """
        Read the pdf statement pages
        :return: statement pages, one iterator of strings per page
        """
        # Process each page contained in the statement.
        return [self.read_page(page) for page in PDFPage.get_pages(self.f)]

    def read_page(self, page: "PDFPage"):
        """
        Read a page from the statement
        :param page: statement page
        :return: iterator over the strings of the page, sorted by (row, col);
                 empty when the page holds no characters
        """
        characters = []
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        for box in layout:
            if isinstance(box, LTTextBoxHorizontal):
                characters.extend(extract_characters(box))
        # Create list of characters
        char_list = [
            Char(char) for char in characters if isinstance(char, LTChar)
        ]
        if not char_list:
            # Nothing to group on this page; indexing char_list[0] below
            # would raise IndexError.
            return iter([])
        # Top of the page first (largest y0).
        char_list.sort(key=lambda char: char.y0, reverse=True)
        # Attribute a row number to each character: a vertical drop of more
        # than half a character height starts a new row.
        char_list[0].row = 0
        for previous, current in zip(char_list, char_list[1:]):
            if (previous.y0 - current.y0) > CHAR_HEIGHT / 2:
                current.row = previous.row + 1
            else:
                current.row = previous.row
        char_list.sort(key=lambda char: (char.row, char.x0))
        # Create list of strings: consecutive characters sharing (row, col)
        # are merged into one String.
        # NOTE(review): the scan starts at index 1 and a String is appended
        # only when the next character opens a new group, so the text of
        # char_list[0] and the last group of the page are never emitted.
        # Kept as-is because the statement parsing below consumes this exact
        # output -- confirm whether that is intended.
        str_list = []
        previous_row = char_list[0].row
        previous_col = char_list[0].col
        i = 1
        while i < len(char_list):
            current_row = char_list[i].row
            current_col = char_list[i].col
            current_col_name = char_list[i].col_name
            string = String(current_row, current_col, current_col_name)
            while i < len(char_list):
                char = char_list[i]
                if char.row == previous_row and char.col == previous_col:
                    # A gap wider than one character is rendered as a space.
                    if (char.x0 - char_list[i - 1].x1) > CHAR_WIDTH:
                        string.text = ' '.join((string.text, char.text))
                    else:
                        string.text = ''.join((string.text, char.text))
                else:
                    # Group boundary: emit the finished string and let the
                    # outer loop start a new one at this same character.
                    previous_row = char.row
                    previous_col = char.col
                    string.clean()
                    str_list.append(string)
                    break
                i = i + 1
        return iter(sorted(str_list, key=lambda x: (x.row, x.col)))

    def get_statement_details(self):
        """
        Map the strings from the all the statement pages to attributes
        """
        for str_list in self.read_statement():
            self.get_transaction_details(str_list)

    def get_transaction_details(self, str_list: "Iterator[String]"):
        """
        Map the strings from the page to attributes

        Strings before 'BALANCE BROUGHT FORWARD' are skipped. The rows between
        it and 'BALANCE CARRIED FORWARD' are turned into transaction dicts
        appended to self.transaction_list; a row without a payment type is
        folded into the previous transaction. Parsing of the page stops at
        'BALANCE CARRIED FORWARD'.

        :param str_list: string list of a page (an iterator, consumed here)
        :raises ValueError: if a string carries an unknown col_name
        :raises StopIteration: if the strings run out after
            'BALANCE BROUGHT FORWARD' and before 'BALANCE CARRIED FORWARD'
        :raises IndexError: if a row needs the previous transaction and
            self.transaction_list is still empty
        """
        # Skip everything up to the opening balance marker.
        string = next(str_list, None)
        while string is not None and string.text != 'BALANCE BROUGHT FORWARD':
            string = next(str_list, None)
        if string is None:
            return

        # First BALANCE BROUGHT FORWARD
        string = next(str_list)
        if self.start_balance is None:
            # Some time, there is a '.' in the first line so we pass it
            if string.text == '.':
                string = next(str_list)
            self.start_balance = to_float(string.text)
        string = next(str_list)
        # Last BALANCE BROUGHT FORWARD
        while string.text != 'BALANCE CARRIED FORWARD':
            current_row = string.row
            new_transaction = False
            date = None
            method_symbol = None
            entity = None
            amount = 0
            while string.row == current_row:
                if string.text == 'BALANCE CARRIED FORWARD':
                    # Closing marker found inside this row: leave without
                    # recording the row (the else clause below is skipped).
                    break
                if string.col_name == 'date':
                    date = to_date_str(to_date(string.text))
                elif string.col_name == 'payment_type':
                    method_symbol = string.text
                    new_transaction = True
                elif string.col_name == 'entity':
                    entity = string.text
                elif string.col_name == 'paid_out':
                    amount = amount - to_float(string.text)
                elif string.col_name == 'paid_in':
                    amount = amount + to_float(string.text)
                elif string.col_name == 'balance':
                    pass
                else:
                    raise ValueError('col name not found')
                string = next(str_list)
            else:
                # The row ended normally: record it.
                if new_transaction:
                    if date is None:
                        # Rows without a date reuse the previous one.
                        prev_transaction = self.transaction_list[-1]
                        date = prev_transaction['date']
                    transaction = dict(date=date,
                                       method=METHOD[method_symbol],
                                       method_symbol=method_symbol,
                                       entity=entity,
                                       amount=amount,
                                       ccy=CCY,
                                       account=ACCOUNT)
                    self.transaction_list.append(transaction)
                else:
                    # Continuation row: update the previous transaction in
                    # place (the dict is mutated, no re-assignment needed).
                    prev_transaction = self.transaction_list[-1]
                    prev_transaction['amount'] = amount
                    prev_transaction['entity'] = ' '.join(
                        (prev_transaction['entity'], entity))

    def close_statement(self):
        """
        Close the pdf statement
        """
        try:
            self.f.close()
        finally:
            # Release the device even if closing the file raised.
            self.device.close()