def getPageLayouts(f1):
    '''Collect the pdfminer layout of every page of a PDF.

    NOTE(review): the file that is actually parsed is opened from ``fpath``
    and initialised with the password ``pss_wd``.  Neither name is defined in
    this function, so both are presumably module-level globals -- confirm.
    The ``f1`` argument is rebound by the ``with`` statement below and is
    never read.

    Returns a list with one layout object per page.  The list is empty when
    the document does not allow text extraction.

    Raises IOError when the file cannot be opened or read.
    '''
    # Bound before the try block so the function always has a list to return,
    # even when the document is not extractable.
    page_layouts = []
    try:
        with open(fpath, 'rb') as f1:
            # The parser and the document reference each other and act as a
            # "pipe" of sorts.
            parser = PDFParser(f1)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(pss_wd)

            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in doc.get_pages():
                    # Presumably pages are parsed lazily from the open file
                    # rather than from data loaded up front, which is why all
                    # of this happens while the file is still open.
                    interpreter.process_page(page)
                    # receive the layout object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        # Call syntax is valid on Python 2 and 3.  There is deliberately no
        # ``return`` in a ``finally`` clause: that would discard this error.
        raise IOError("issue with loading file, please try again")
    return page_layouts
def readPdf(file):
    """Return the pdfminer layouts of the first two pages of a PDF.

    Args:
        file: Path of the PDF file to read.

    Returns:
        A list with the layout object of each processed page, in page order.
        At most two pages are processed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # The context manager closes the handle on every exit path, including the
    # "not extractable" error below.
    with open(file, 'rb') as fp:
        # Parser bound to the file, and the document that stores its
        # structure.
        document = PDFDocument(PDFParser(fp))

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()

        # Layout-analysis parameters.
        laparams = LAParams(line_margin=0.1)

        # Page aggregator that collects the layout of each processed page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = []
        # islice stops after two pages, so the rest is never processed.
        for page in islice(PDFPage.create_pages(document), 2):
            interpreter.process_page(page)
            pages.append(device.get_result())

    return pages
Example #3
0
def convert_pdf_table(pdf_file):
    """Convert the tabulated pages of a PDF into a list of row dictionaries.

    Every page is passed through ``tabulate_page``.  The first row of the
    result is used as the header; each following row becomes a dictionary
    mapping the lower-cased header cell to the cell in the same column.
    Cells equal to the empty string are left out of the dictionary.

    Args:
        pdf_file: Path of the PDF file.

    Returns:
        A list of dictionaries, one per non-header row, in page order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    table = []
    # Close the file on every exit path; the parameter is no longer rebound
    # to the handle.
    with open(pdf_file, 'rb') as handle:
        document = PDFDocument(PDFParser(handle))

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            page_table = tabulate_page(device.get_result())
            if len(page_table) == 0:
                # Nothing was tabulated on this page, so there is no header
                # row to index into.
                continue

            header = page_table[0]
            for row in page_table[1:]:
                table.append({
                    header[column].lower(): detail
                    for column, detail in enumerate(row)
                    if detail != ''
                })

    return table
Example #4
0
def parsepdf(filename):
    """Print the text lines of a PDF grouped by their vertical position.

    Every horizontal text line whose parent text box is not one of the known
    heading strings is stored under its ``y0`` coordinate.  A ``y0`` that
    belongs to a heading box (or to a child that is not a horizontal text
    line) is remembered and ignored from then on.  Groups are printed in
    first-seen order, followed by the number of groups.

    Args:
        filename: Path of the PDF file.

    Returns:
        None; the result is only printed.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    headings = [u'Ledige lejligheder\n',u'afd. adresse\n',u'rum m2\n',u'leje \n',
                u'a\xb4c varme a\xb4c vand\n',u'indskud\n',u'ledig pr.\n',u'bem\xe6rkning\n'
                ]
    # y0 -> list of text fragments found at that height, in insertion order.
    location_map = OrderedDict()
    # y0 values that belong to headings; a set gives constant-time lookups.
    header_ycord = set()

    # The context manager closes the file even if parsing fails.
    with open(filename, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()

            for obj in layout._objs:
                if not isinstance(obj, LTTextBoxHorizontal):
                    continue
                # The heading test looks at the whole box, so evaluate it
                # once per box rather than once per line.
                box_is_heading = obj.get_text() in headings
                for o in obj._objs:
                    y0 = o.y0
                    if y0 in header_ycord:
                        continue
                    if isinstance(o, LTTextLineHorizontal) and not box_is_heading:
                        # Drop newlines, then drop non-ASCII characters.
                        string_val = o.get_text().replace(u'\n', u'')
                        string_val = string_val.encode('ascii', 'ignore')
                        location_map.setdefault(y0, []).append(string_val)
                    else:
                        header_ycord.add(y0)

    # Single-argument print(...) behaves the same on Python 2 and 3.
    for fragments in location_map.values():
        print('**************************')
        print(fragments)
        print('**************************')
    print('Total Rowss = %s' % len(location_map))
Example #5
0
def read_invoice_pdfminer3k(pdfFile):
    """Read an invoice PDF, extract client and invoice number, and record them.

    The text of all text boxes and text lines of the document is
    concatenated.  The client is cut out of that text with
    ``extract_info(text, client_start, client_end)``; the invoice number is
    cut out of the file name with
    ``extract_info(name, invoice_start, invoice_end)``.  Both values are
    printed and passed to ``write_excel``.

    Args:
        pdfFile: File name of the invoice inside ``invoice_path``.

    Returns:
        None.
    """
    # os.path.join uses the platform separator; the original glued the parts
    # together with a literal backslash, which only works on Windows.
    with open(os.path.join(invoice_path, pdfFile), "rb") as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)

        doc.initialize("")
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Collect fragments and join once instead of growing a string.
        fragments = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            for lt_obj in device.get_result():
                if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    fragments.append(lt_obj.get_text())
    invoice_text = "".join(fragments)

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
Example #6
0
def parse_pdf(fname):
    """Return the stripped text of every horizontal text box in a PDF.

    Args:
        fname: Path of the PDF file.

    Returns:
        A list of non-empty strings, one per horizontal text box, in page
        order and then layout order.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # The context manager closes the file on every exit path.
    with open(fname, 'rb') as fp:
        # Create a PDF parser for the file and a document object that stores
        # the document structure.
        document = PDFDocument(PDFParser(fp))
        # Check whether the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Parameters for layout analysis.
        laparams = LAParams()
        # Device that aggregates each page into a layout object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that processes page contents.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        contents = []
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the layout object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    content = x.get_text().strip()
                    # Boxes holding only whitespace are skipped.
                    if content:
                        contents.append(content)
    return contents
Example #7
0
def extract_text_elements_from_pdf(path, j=nulljob):
    """Open a PDF and extract every element that is text based (LTText).

    Args:
        path: Path of the PDF file.
        j: Job object whose ``iter_with_progress`` wraps the page iteration
           so that progress can be reported.

    Returns:
        A ``(pages, all_elements)`` tuple.  ``pages`` holds one ``Page``
        built from the width and height of each page layout.
        ``all_elements`` holds the elements of all pages; each element has
        ``page`` set to its page index and ``order`` set to its position
        within that page.
    """
    pages = []
    all_elements = []
    # The context manager closes the file on every exit path.
    with open(path, 'rb') as fp:
        doc = PDFDocument(caching=True)
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Materialised as a list before being handed to the progress
        # iterator -- presumably so the total page count is known up front.
        enumerated_pages = list(enumerate(doc.get_pages()))
        progress_msg = "Reading page %i of %i"
        for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
            interpreter.process_page(page)
            page_layout = device.get_result()
            pages.append(Page(page_layout.width, page_layout.height))
            textboxes = extract_textboxes(page_layout)
            elements = [create_element(box) for box in textboxes]
            # Called for its effect on ``elements``; the return value is not
            # used.
            merge_oneletter_elems(elements)
            for i, elem in enumerate(elements):
                elem.page = pageno
                elem.order = i
            all_elements += elements
    return pages, all_elements
Example #8
0
def Layout(path='/home/chris/Documents/Literature/Donghun_ACSNano_2014'):
    """Return the layout of the last page of a PDF.

    Every page is processed, but only the layout of the last one is kept.
    The resource manager object is printed as a side effect.

    Args:
        path: Path of the PDF file.  Defaults to the location that used to
              be hard-coded, so existing ``Layout()`` calls keep working.

    Returns:
        The layout object of the last page, or None when the document has no
        pages.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    with open(path, 'rb') as fp:
        # Create a PDF parser object associated with the file object, and a
        # document object that stores the document structure.
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)

        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Pre-bound so a document without pages returns None instead of
        # failing on an unbound local.
        layout = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # receive the layout object for the page.
            layout = device.get_result()

        return layout
Example #9
0
def get_result_from_file(filename):
    """Describe every page of a PDF by its bounding box and text labels.

    Args:
        filename: Path of the PDF file.

    Returns:
        A dict ``{"filename": filename, "pages": [...]}``.  Each page entry
        is a dict with the zero-based ``index``, the ``bounding_box``
        returned by ``get_bounding_box`` and the ``labels`` returned by
        ``get_text_labels`` for that page's layout.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    # The context manager also closes the file when extraction is refused or
    # a page fails to process.
    with open(filename, "rb") as fp:
        document = PDFDocument(PDFParser(fp))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({
                "index": page_index,
                "bounding_box": get_bounding_box(layout),
                "labels": get_text_labels(layout),
            })
    return result
Example #10
0
def pdf2txt(data,save_path):
    """Append the text of every horizontal text box of a PDF to a text file.

    Each text box is written as UTF-8 followed by a newline.  Writing is
    best effort: when a box cannot be written, "failed!" is printed and
    processing continues with the next box.

    Args:
        data: Binary file object containing the PDF.
        save_path: Path of the text file to append to.  The file is only
                   created once there is text to write.

    Returns:
        None.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    import io

    document = PDFDocument(PDFParser(data))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Opened on first use and kept open, instead of reopening per text box.
    out = None
    try:
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for line in device.get_result():
                if not isinstance(line, LTTextBoxHorizontal):
                    continue
                try:
                    if out is None:
                        # io.open encodes text itself, so this is the same on
                        # Python 2 and 3 (no bytes + str concatenation).
                        out = io.open(save_path, 'a', encoding='utf-8')
                    out.write(line.get_text() + u'\n')
                except (IOError, OSError, UnicodeError):
                    # Narrowed from a bare ``except`` so that interrupts and
                    # programming errors are no longer swallowed.
                    print("failed!")
    finally:
        if out is not None:
            out.close()
Example #11
0
def get_num(source_file):
    """Return the value after the first ':' in the second layout object.

    For each page the layout objects are enumerated.  Objects that are not
    text boxes are skipped.  A text box at index 0 is ignored; a text box at
    index 1 yields the result; a text box at any later index ends the scan
    of that page and the next page is tried.

    Args:
        source_file: Path of the PDF file.

    Returns:
        The text between the first and second ':' of the text box at index
        1, with newlines removed, or None when no page has a text box at
        that index.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
        IndexError: If the text box at index 1 contains no ':'.
    """
    # The context manager closes the file on the early return as well.
    with open(source_file, 'rb') as fp:
        # Create a PDF parser for the file and the document object that
        # stores the document structure (no password is supplied).
        document = PDFDocument(PDFParser(fp))
        # Check whether the document allows text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Device that aggregates each page into a layout object.
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        # Interpreter that processes page contents.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page of the document.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for n, l in enumerate(device.get_result()):
                if not isinstance(l, LTTextBox) or n == 0:
                    continue
                if n == 1:
                    return l.get_text().split(":")[1].replace("\n", '')
                # A text box beyond index 1: give up on this page.
                break
    return None
Example #12
0
def extract_layout_by_page(pdf_path):
    """
    Return the pdfminer layout of every page of a PDF, in page order.

    See:
    - https://euske.github.io/pdfminer/programming.html
    - http://denis.papathanasiou.org/posts/2010.08.04.post.html

    :param pdf_path: path of the PDF file
    :return: list with one layout object per page
    :raises PDFTextExtractionNotAllowed: if the document forbids text
        extraction
    """
    laparams = LAParams()

    # The context manager closes the file on every exit path.
    with open(pdf_path, 'rb') as fp:
        document = PDFDocument(PDFParser(fp))

        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        layouts = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layouts.append(device.get_result())

    return layouts
Example #13
0
def pdf2text(path,save_file):
    '''
    Read a PDF document and append the text of its horizontal text boxes to
    a text file, encoded as UTF-8.

    :param path: source PDF, given either as a filesystem path or as an
                 already opened binary file object
    :param save_file: name of the text file to append to; a name without a
                      directory is resolved against the current working
                      directory.  The file is only created once there is
                      text to write.
    :return: None
    :raises PDFTextExtractionNotAllowed: if the document forbids text
        extraction
    '''
    import io

    # PDFParser reads from a file object.  A path is opened here and closed
    # again when done; an object that already has ``read`` is used as is and
    # left open for the caller.
    opened_here = not hasattr(path, 'read')
    source = open(path, 'rb') if opened_here else path
    out = None
    try:
        # Create the parser and the document structure.
        document = PDFDocument(PDFParser(source))
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process every page.
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for line in device.get_result():
                if isinstance(line, LTTextBoxHorizontal):
                    if out is None:
                        # Opened once on first use; io.open does the UTF-8
                        # encoding on both Python 2 and 3.
                        out = io.open(save_file, 'a', encoding='utf-8')
                    out.write(line.get_text())
    finally:
        if out is not None:
            out.close()
        if opened_here:
            source.close()
def setup(path):
    """Print the text of a PDF that lies within the dialogue margin range.

    The first page (the title page) is skipped.  On every other page, text
    boxes and text lines whose left edge ``bbox[0]`` lies strictly between
    ``DIALOGUE_BBOX_MIN`` and ``DIALOGUE_BBOX_MAX`` are normalised (NFKD),
    reduced to ASCII and printed.

    NOTE(review): ``rsrcmgr`` and ``laparams`` are not created here, so they
    are presumably module-level objects -- confirm.

    Args:
        path: Path of the PDF file.

    Returns:
        None.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    # The context manager closes the file on every exit path.
    with open(path, 'rb') as fp:
        # Create a PDF parser object associated with the file object, and a
        # document object that stores the document structure.
        document = PDFDocument(PDFParser(fp))
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF device object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for i, page in enumerate(PDFPage.create_pages(document)):
            # skip the title page
            if i == 0:
                continue
            interpreter.process_page(page)
            for obj in device.get_result():
                # we only want to bother with LTTextBox and LTTextLine
                if not isinstance(obj, (LTTextBox, LTTextLine)):
                    continue
                # only extract text segments within a certain margin range
                if DIALOGUE_BBOX_MIN < obj.bbox[0] < DIALOGUE_BBOX_MAX:
                    # need to convert unicode characters
                    converted = unicodedata.normalize('NFKD', obj.get_text()).encode('ascii', 'ignore')
                    print(converted)
Example #15
0
    def _GetFromPdf(self,pdf):
        '''
        Print the text of every text container found in a PDF file.

        Reference: http://www.unixuser.org/~euske/python/pdfminer/programming.html

        :param pdf: path of the PDF file
        :return: None
        :raises PDFTextExtractionNotAllowed: if the document forbids text
            extraction
        '''
        # The context manager closes the file on every exit path.
        with open(pdf, 'rb') as fp:
            # Create a PDF parser from the file object.
            parser = PDFParser(fp)
            # Create the PDF document.
            doc = PDFDocument(parser)
            # Connect the parser with the document object.
            parser.set_document(doc)
            # Check whether the document allows text extraction.
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager that manages shared resources.
            rsrcmgr = PDFResourceManager()
            # Device that aggregates each page into a layout object.
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                # receive the layout object for the page.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextContainer):
                        # Single-argument print(...) is the same on Python 2
                        # and 3.
                        print(x.get_text())
Example #16
0
    def generateFileContent(self):
        """Build the abbreviation dictionary text from the online PDF.

        The PDF at the hard-coded URL is downloaded to a temporary file.  On
        every page the layout objects other than rectangles and curves are
        grouped into lines.  For each line containing ':', the text before
        the first ':' is stripped, has '..' replaced by '.', and is kept as
        an entry.  The entries are formatted by
        ``formatEntriesForDictionary`` and appended to a fixed header.

        Returns:
            The dictionary file content as one unicode string.
        """
        import tempfile
        import urllib

        abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"

        entries = set()
        # The grouping parameters do not depend on the page.
        params = LAParams()
        # Both context managers release their file even if the download or
        # the parsing fails.
        with tempfile.NamedTemporaryFile() as temporaryFile:
            urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)
            with open(temporaryFile.name, "rb") as fileObject:
                document = PDFDocument(PDFParser(fileObject))
                resourceManager = PDFResourceManager()
                device = PDFPageAggregator(resourceManager)
                interpreter = PDFPageInterpreter(resourceManager, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)
                    layout = device.get_result()
                    # ``item`` rather than ``object``, which shadows the
                    # builtin.
                    objects = [item for item in layout
                               if not isinstance(item, (LTRect, LTCurve))]
                    for line in layout.group_objects(params, objects):
                        text = line.get_text()
                        if u":" in text:
                            entry = text.split(u":")[0].strip()
                            entries.add(entry.replace(u"..", "."))

        header = (u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
                  u"# http://www.realacademiagalega.org/abreviaturas\n"
                  u"\n")
        # Joined once instead of growing the string entry by entry.
        return header + u"".join(formatEntriesForDictionary(entries, u"abreviatura"))
def get_layout(url, pages=None):
    """
    Return the pdfminer layout of the requested pages of a PDF.

    A layout is the pdfminer object that holds the tree structure of a page.
    More information about the layout here:
    http://www.unixuser.org/~euske/python/pdfminer/programming.html
    :param url: path (str) of the pdf file to be analysed
    :param pages: list (int) of the pages whose layout is wanted; when it is
    empty or omitted an empty page set is passed to pdfminer. Beware that
    the first page of the pdf corresponds to number 0, even if its id is 1
    :return layouts: List of layouts (one layout per processed page).
    """
    wanted = set(pages) if pages else set()

    # Analysis parameters, shared resources and the aggregating device.
    analysis_params = LAParams()
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=analysis_params)
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    collected = []
    with open(url, 'rb') as pdf_handle:
        for pdf_page in PDFPage.get_pages(pdf_handle, pagenos=wanted):
            page_interpreter.process_page(pdf_page)
            collected.append(aggregator.get_result())
    aggregator.close()

    return collected
Example #18
0
def parse_pages(pdf_buffer, password):
    """
    Parse every page of a PDF buffer and return the text gathered per page.

    The buffer is wrapped in a parser, the document is opened with the given
    password, and the child objects of each page layout are handed to
    ``parse_lt_objects``.  The results are returned as a list with one entry
    per page, in page order.
    """
    pdf_document = PDFDocument(PDFParser(pdf_buffer), password)

    shared_resources = PDFResourceManager()
    aggregator = PDFPageAggregator(shared_resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(shared_resources, aggregator)

    per_page_text = []
    for pdf_page in PDFPage.create_pages(pdf_document):
        page_interpreter.process_page(pdf_page)
        # The aggregated layout may hold child objects such as LTTextBox,
        # LTFigure or LTImage; they are read through the private ``_objs``.
        page_layout = aggregator.get_result()
        children = page_layout._objs  # pylint: disable=protected-access
        per_page_text.append(parse_lt_objects(children))

    return per_page_text
def getTemPdf(file):
    """Write the text of a PDF to ``result\\tem_pdf`` and return that file.

    The output file is truncated first.  The text of every horizontal text
    box is then written as UTF-8, each followed by a newline.

    Args:
        file: Binary file object containing the PDF.

    Returns:
        The output file reopened in binary read mode.  The caller is
        responsible for closing it.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids text extraction.
    """
    import io

    document = PDFDocument(PDFParser(file))
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Kept exactly as in the original, literal backslash included.
    tem_path = 'result\\' + 'tem_pdf'

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Opening once in 'w' mode truncates the file and keeps it open for all
    # writes, replacing the truncate-then-reopen-per-box sequence.  io.open
    # does the UTF-8 encoding on both Python 2 and 3.
    with io.open(tem_path, 'w', encoding='utf-8') as out:
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            for x in device.get_result():
                if isinstance(x, LTTextBoxHorizontal):
                    out.write(x.get_text() + u'\n')

    return open(tem_path, 'rb')
Example #20
0
def extract_pdf(path, languages=None):
    """
    Extract metadata and per-page content from a PDF file.

    The entries of the last info dictionary of the document are copied into
    the result under their lower-cased, stripped key, except the "pages" key
    and values that are None or contain "<PDFObjRef:".  Every page is then
    converted with ``_convert_page`` using a one-based page number.  If the
    parser hits an unexpected end of file, whatever was gathered so far is
    returned.  The file is always closed.
    """
    handle = open(path, "rb")
    extracted = {"pages": []}
    try:
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        document = PDFDocument(PDFParser(handle), "")

        if len(document.info):
            for raw_key, raw_value in document.info[-1].items():
                key = raw_key.lower().strip()
                value = string_value(raw_value)
                if key == "pages" or value is None or "<PDFObjRef:" in value:
                    continue
                extracted[key] = string_value(value)

        numbered_pages = enumerate(PDFPage.create_pages(document), 1)
        for page_number, pdf_page in numbered_pages:
            converted = _convert_page(
                page_interpreter, pdf_page, aggregator, page_number, path, languages
            )
            extracted["pages"].append(converted)
        aggregator.close()
        return extracted
    except PSEOF as eof:
        log.info("Unexpected EOF: %r", eof)
        return extracted
    finally:
        handle.close()
Example #21
0
def convertWithCoordinatesPara(fname, pages=None):
    """Collect font sizes and text of a PDF through ``parse_obj_para``.

    The file name is printed.  The child objects of every selected page
    layout are then passed to ``parse_obj_para`` together with the
    ``fontSize`` dict and the ``pdfText`` list, which it is expected to
    fill in.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of zero-based page indices to process.
               When empty or omitted, every page is processed.

    Returns:
        A dict ``{'fontSize': fontSize, 'pdfText': pdfText}``.
    """
    fontSize = {}
    pdfText = []

    print(fname)
    pagenums = set(pages) if pages else set()

    # ``open`` replaces the Python-2-only ``file`` builtin; the context
    # manager closes the handle on every exit path.
    with open(fname, 'rb') as infile:
        document = PDFDocument(PDFParser(infile))

        manager = PDFResourceManager()
        device = PDFPageAggregator(manager, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, device)

        for pageno, page in enumerate(PDFPage.create_pages(document)):
            # Apply the requested page filter; the original computed
            # ``pagenums`` but never used it.
            if pagenums and pageno not in pagenums:
                continue
            interpreter.process_page(page)
            layout = device.get_result()

            parse_obj_para(layout._objs, fontSize, pdfText)

    return {'fontSize': fontSize, 'pdfText': pdfText}
Example #22
0
def extract_text(doc, config):
    """Yield, for each processed page, the list of text lines found on it.

    When ``config.page`` is set, only that page is processed; the value is
    one-based, hence the ``- 1`` when slicing.  Otherwise every page of
    ``doc`` is processed.  Only the lines inside top-level text boxes are
    collected; a text line sitting directly at the top of the layout tree
    trips an assertion.
    """
    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    selected = doc.get_pages()
    if config.page is not None:
        # Skip ahead to the requested page and keep just that one.
        remaining = itertools.islice(selected, config.page - 1, None)
        selected = [next(remaining, None)]

    for pdf_page in selected:
        page_interpreter.process_page(pdf_page)

        lines = []
        for element in aggregator.get_result():
            if isinstance(element, LTTextBox):
                lines.extend(element)
            elif isinstance(element, LTTextLine):
                assert False, "Expected no lines at top of tree"

        yield lines
def pdf_to_text(page_object):
    """Return the text of every text box and text line of a PDF.

    ``page_object`` is handed to the PDF parser.  The document is connected
    to the parser and initialised with an empty password.  The result is a
    flat list of strings, in page order and then layout order.
    """
    pdf_parser = PDFParser(page_object)
    document = PDFDocument(pdf_parser)
    pdf_parser.set_document(document)
    document.initialize('')

    resources = PDFResourceManager()
    aggregator = PDFPageAggregator(resources, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resources, aggregator)

    collected = []
    for pdf_page in PDFPage.create_pages(document):
        page_interpreter.process_page(pdf_page)
        # Keep the text of text boxes and text lines only.
        collected.extend(
            item.get_text()
            for item in aggregator.get_result()
            if isinstance(item, (LTTextBox, LTTextLine))
        )
    return collected
Example #24
0
 def __init__(self, rsrcmgr, pageno=1, laparams=None):
     """Initialise the aggregator base class and the per-page state."""
     PDFPageAggregator.__init__(self, rsrcmgr, pageno=pageno, laparams=laparams)
     # Page bookkeeping.
     self.page_number = 0
     self.rows = []
     # Outline handling: the flag starts as "not an outline page".
     self.outline = False
     # Filled only if there is any outline info.
     self.interesting_text = []
     # Possibly helpful info, but maybe mixed.
     self.aux_text = []
Example #25
0
def extract_pdf(path, languages=None):
    """ Extract metadata and page content from a PDF file with pdfminer.

    The entries of the last info dictionary are copied into the result
    (lower-cased, stripped keys, passed through ``safe_text``), except the
    'pages' key. If the document is not extractable a warning is logged and
    only the metadata is returned. Otherwise the layout of every page is
    converted with ``_convert_page`` and appended to ``result['pages']``. """
    with open(path, 'rb') as fh:
        resources = PDFResourceManager()
        aggregator = PDFPageAggregator(resources, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resources, aggregator)
        document = PDFDocument(PDFParser(fh), '')

        extracted = {'pages': []}
        if len(document.info):
            for raw_key, value in document.info[-1].items():
                key = raw_key.lower().strip()
                if key == 'pages':
                    continue
                extracted[key] = safe_text(value)

        if not document.is_extractable:
            log.warning("PDF not extractable: %s", path)
            return extracted

        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)
            page_text = _convert_page(aggregator.get_result(), languages)
            extracted['pages'].append(page_text)
        aggregator.close()
        return extracted
Example #26
0
def get_layout(path):
    '''Return the pdfminer layout of every page of the document.

    Each layout object is what ``device.get_result()`` yields for a page.
    Presumably the characters and their locations are reachable through
    these layout objects -- the function itself does not flatten them.

    :param path: path of the PDF file
    :return: list with one layout object per page, in page order
    '''
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # Arguments forwarded to PDFPage.get_pages, unchanged from the original.
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    layout = []
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # ``open`` replaces the Python-2-only ``file`` builtin; the context
        # manager closes the handle even when a page fails to process.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                          password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
                layout.append(device.get_result())
    finally:
        device.close()

    return layout
Example #27
0
def parsing(pdfPath, pdfFileName):
    """Dump the parsed layout of every page of a PDF into ``.pmlo`` files.

    A folder named after the PDF (text before the first '.', spaces replaced
    by '_') is created below the hard-coded output directory.  For each page
    ``parse_layout`` is called on the page layout.  The items currently in
    the shared ``layoutStream`` list are then written to
    ``<page number>.pmlo`` (one-based) and the list is emptied for the next
    page.

    NOTE(review): ``layoutStream`` is not defined here; presumably it is a
    module-level list that ``parse_layout`` appends to -- confirm.

    Args:
        pdfPath: Directory containing the PDF.
        pdfFileName: File name of the PDF inside ``pdfPath``.

    Returns:
        None.
    """
    pathOut     = r'C:\Projects\PDFparser\pageLayout'
    layoutName  = pdfFileName.split('.', 1)[0].replace(' ','_')
    layoutDir   = os.path.join(pathOut, layoutName)

    # Both the input and each per-page output are closed by their context
    # manager, also when a page fails to process.
    with open(os.path.join(pdfPath, pdfFileName), 'rb') as fp:
        document    = PDFDocument(PDFParser(fp))
        rsrcmgr     = PDFResourceManager()
        laparams    = LAParams()
        device      = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Create a folder for each pdf file layout.  The existence check
        # looks at the directory that is actually created; the original
        # tested the bare folder name, so a second run failed in makedirs.
        if not os.path.exists(layoutDir):
            os.makedirs(layoutDir)

        for pageNum, page in enumerate(PDFPage.create_pages(document), 1):
            interpreter.process_page(page)
            parse_layout(device.get_result())

            # .pmlo stands for PDFminer Layout
            pagePath = os.path.join(layoutDir, str(pageNum) + '.pmlo')
            with open(pagePath, 'w') as fileOut:
                for line in layoutStream:
                    fileOut.write(str(line))

            # Start a new page
            del layoutStream[:]
    def Parse(self):
        """Parse the first page of the PDF, using a pickle cache when fresh.

        A cache file named after the PDF lives in ``parseCacheDir``.  It is
        used when it exists, is non-empty and is newer than the PDF.
        Otherwise the first page is processed, the raw data is read from its
        layout objects, the y axis of every category is reversed using the
        page height, and the result is written to the cache.  The boundary,
        cell assignment and cell processing steps then run on the raw data.

        NOTE(review): ``laparams`` is not defined here; presumably it is a
        module-level object -- confirm.

        Returns:
            A tuple ``(self.effectiveFrom, <result>)``.
        """
        # Check whether a cache exists and whether it is recent enough.
        if not os.path.exists(parseCacheDir):
            os.makedirs(parseCacheDir)
        cacheFile = os.path.join(parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
        foundCache = (os.path.isfile(cacheFile) and
                      os.path.getsize(cacheFile) > 0 and
                      os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
        if foundCache:
            # NOTE(review): pickle.load can execute arbitrary code from the
            # file it reads; this is only safe while the cache directory is
            # not writable by untrusted parties.
            with open(cacheFile, 'rb') as fp:
                self.RawData = pickle.load(fp)
        else:
            # Context managers close both files even if parsing or pickling
            # raises.
            with open(self.pdfFileName, 'rb') as fp:
                for page in PDFPage.get_pages(fp, None, maxpages=1):
                    rsrcmgr = PDFResourceManager()
                    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    layout = device.get_result()
                    self.__readobj(layout._objs)
                    for category in self.RawData.values():
                        # bbox[3] is passed as the reference for flipping the
                        # y axis.
                        self.__reverseYaxis(category, layout.bbox[3])
                    with open(cacheFile, 'wb') as cacheFp:
                        pickle.dump(self.RawData, cacheFp)

        self.__calculateBoundary()
        self.__assignCharsAndLinesToCell()
        self.__processCells()
        return (self.effectiveFrom, self.__getResult())
Example #29
0
def parse_pdf(pdf_url):
    """Download a PDF and return its text, split into lines, per page.

    Args:
        pdf_url: URL of the PDF document.

    Returns:
        A list with one entry per page.  Each entry is a list holding, for
        every text box or text line whose text is not blank, the list of
        lines of that text.
    """
    # The response is closed as soon as its content has been read into
    # memory.
    with urllib.request.urlopen(pdf_url) as response:
        memory_file = io.BytesIO(response.read())

    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # Warning sometimes, error in pdf?
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        page_lines = []
        interpreter.process_page(page)
        for lt_obj in device.get_result():
            if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                # Fetch the text once instead of three times per object.
                text = lt_obj.get_text()
                if text.strip():
                    page_lines.append(text.splitlines())
        ret.append(page_lines)
    return ret
Example #30
0
def pdf_to_txt(in_file):
	"""Append the text of every horizontal text box of a PDF to a TXT file.

	The output path is ``in_file`` with its last three characters replaced
	by ``txt``.  The file is opened in append mode and is only created when
	at least one horizontal text box is found.

	Args:
		in_file: path of the PDF file to read.

	Returns:
		None.

	Raises:
		PDFTextExtractionNotAllowed: if the document forbids extraction.
	"""
	# Local import: io.open takes an explicit encoding on Python 2 and 3,
	# which avoids the bytes + str concatenation of encode() + '\n'.
	import io

	out_file = in_file[:-3] + 'txt'
	# The context manager guarantees the PDF handle is released even when
	# extraction is refused or a page fails to parse.
	with open(in_file, 'rb') as fp:
		# Create a PDF parser object associated with the file object.
		parser = PDFParser(fp)
		# Create a PDF document object that stores the document structure.
		document = PDFDocument(parser)
		# Check if the document allows text extraction. If not, abort.
		if not document.is_extractable:
			raise PDFTextExtractionNotAllowed
		# Set parameters for analysis.
		laparams = LAParams()
		# Create a PDF resource manager object that stores shared resources.
		rsrcmgr = PDFResourceManager()
		# Create a PDF page aggregator object.
		device = PDFPageAggregator(rsrcmgr, laparams=laparams)
		# Create a PDF interpreter object.
		interpreter = PDFPageInterpreter(rsrcmgr, device)

		# Opened lazily (once) instead of once per text box.
		dst_file = None
		try:
			for page in PDFPage.create_pages(document):
				interpreter.process_page(page)
				# Receive the LTPage object for the page.
				layout = device.get_result()
				for klass in layout:
					if isinstance(klass, LTTextBoxHorizontal):
						if dst_file is None:
							dst_file = io.open(out_file, 'a', encoding='utf-8')
						dst_file.write(klass.get_text() + u'\n')
		finally:
			if dst_file is not None:
				dst_file.close()
	return None
Example #31
0
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logic tables in PDF format, so this parses PDF drawing instructions
    and tries to find rectangles and arrange them in rows, then arrange text in
    the rectangles.

    Parameters:
    data -- raw PDF content; it is wrapped in BytesIO before parsing.
    miner_layout -- when true the aggregator is created with LAParams();
        when false it is created without layout parameters and the
        extracted texts are sorted by (y0, x0) instead.

    If the document raises PDFSyntaxError while being opened, the generator
    ends without yielding anything.

    When LOGGER is enabled for DEBUGFILES, four PNG images per page (texts,
    lines, rows, cells) are written to a fresh temporary directory.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    # Newer pdfminer releases moved PDFDocument/PDFPage to their own modules;
    # fall back to the old location (and the old API) when they are missing.
    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar, LTCurve

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        # Unparseable document: end the generator without yielding any page.
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        # NOTE: pages are read from a second BytesIO over the same data,
        # not from the `doc` object built above.
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    # Debug-only dependencies; `path` is defined only when DEBUGFILES is on,
    # and is only read inside the matching DEBUGFILES branches below.
    if LOGGER.isEnabledFor(DEBUGFILES):
        import tempfile
        import PIL.Image as Image
        import PIL.ImageDraw as ImageDraw
        import random

        path = tempfile.mkdtemp(prefix='pdf')

    for npage, page in enumerate(pages):
        LOGGER.debug('processing page %s', npage)
        interpreter.process_page(page)
        page_layout = device.get_result()

        # Step 1: collect text fragments from the top-level layout objects.
        texts = sum([list(lttext_to_multilines(obj, page_layout)) for obj in page_layout._objs if isinstance(obj, (LTTextBox, LTTextLine, LTChar))], [])
        LOGGER.debug('found %d text objects', len(texts))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for t in texts:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((t.x0, t.y0, t.x1, t.y1), outline=color)
                draw.text((t.x0, t.y0), t.text.encode('utf-8'), color)
            fpath = '%s/1text-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        # Without pdfminer's layout analysis, order the texts by (y0, x0).
        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        # Step 2: collect rectangles, lines and curves as candidate cell borders.
        # TODO filter ltcurves that are not lines?
        # TODO convert rects to 4 lines?
        lines = [lt_to_coords(obj, page_layout) for obj in page_layout._objs if isinstance(obj, (LTRect, LTLine, LTCurve))]
        LOGGER.debug('found %d lines', len(lines))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for l in lines:
                color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                draw.rectangle((l.x0, l.y0, l.x1, l.y1), outline=color)
            fpath = '%s/2lines-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        lines = list(uniq_lines(lines))
        LOGGER.debug('found %d unique lines', len(lines))

        # Step 3: turn the de-duplicated lines into rows of boxes.
        rows = build_rows(lines)
        LOGGER.debug('built %d rows (%d boxes)', len(rows), sum(len(row) for row in rows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for r in rows:
                for b in r:
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
            fpath = '%s/3rows-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        # Step 4: assign the text fragments to the boxes of each row.
        textrows = arrange_texts_in_rows(rows, texts)
        LOGGER.debug('assigned %d strings', sum(sum(len(c) for c in r) for r in textrows))
        if LOGGER.isEnabledFor(DEBUGFILES):
            img = Image.new('RGB', (int(page.mediabox[2]), int(page.mediabox[3])), (255, 255, 255))
            draw = ImageDraw.Draw(img)
            for row, trow in zip(rows, textrows):
                for b, tlines in zip(row, trow):
                    color = (random.randint(127, 255), random.randint(127, 255), random.randint(127, 255))
                    draw.rectangle((b.x0 + 1, b.y0 + 1, b.x1 - 1, b.y1 - 1), outline=color)
                    draw.text((b.x0 + 1, b.y0 + 1), '\n'.join(tlines).encode('utf-8'), color)
            fpath = '%s/4cells-%03d.png' % (path, npage)
            img.save(fpath)
            LOGGER.log(DEBUGFILES, 'saved %r', fpath)

        yield textrows
    # Only reached when the caller consumes the generator to the end.
    device.close()
Example #32
0
# Create a parser associated with the (already opened) PDF file object.
parser = PDFParser(fp)
# The PDF document object.
doc = PDFDocument(parser)
# Link the parser and the document object to each other.
# NOTE(review): PDFDocument(parser) is the newer pdfminer API while
# set_parser()/initialize() belong to the older one - confirm which
# pdfminer flavour this script targets.
parser.set_document(doc)
doc.set_parser(parser)
# Initialise the document (empty password).
doc.initialize("")
# Create the PDF resource manager.
resource = PDFResourceManager()
# Layout-analysis parameters.
laparam = LAParams()
# Create the aggregator device.
device = PDFPageAggregator(resource, laparams=laparam)
# Create the PDF page interpreter.
interpreter = PDFPageInterpreter(resource, device)

# Write the extracted text to a new file; the context manager makes sure
# the output is flushed and closed even if a page fails to parse.
with open('grades.txt', 'w') as grades:
    for page in doc.get_pages():
        # Read the page with the page interpreter.
        interpreter.process_page(page)
        # Fetch the resulting layout from the aggregator.
        layout = device.get_result()
        for out in layout:
            # A PDF page holds more than text (images, figures, ...), so
            # only objects exposing get_text() are written.
            if hasattr(out, 'get_text'):
                grades.write(out.get_text())
Example #33
0
# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# Create a PDF device object.
# NOTE(review): this device is never used - `device` is rebound to a
# PDFPageAggregator a few lines below, before the interpreter is created.
device = PDFDevice(rsrcmgr)

# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams()

# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)


def parse_obj(lt_objs):

    # loop over the object list
    for obj in lt_objs:

        # if it's a textbox, print text and location
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            extracted_text = obj.get_text().replace('\n', '')
            coordinates = dict()
            coordinates['x1'] = obj.bbox[0]
Example #34
0
def read_pdf_text(filename):
    """Return the text of every text box / text line in a PDF.

    Pages are processed in document order and, within a page, layout items
    are visited in the order pdfminer reports them.  Only top-level
    ``LTTextBox`` and ``LTTextLine`` items contribute; images, figures,
    lines, rectangles and curves are ignored.

    Args:
        filename: path of the PDF file to read.

    Returns:
        list of str: one ``get_text()`` result per matching layout item.

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    text_list = []
    with open(filename, 'rb') as fp:
        # Parser bound to the file, and the document structure built from it.
        # Pass a password as the second argument of PDFDocument if needed.
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # caching=False: do not cache shared resources between pages.
        rsrcmgr = PDFResourceManager(caching=False)
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # The LTPage for this page, holding every parsed layout object.
            layout = device.get_result()
            for item in layout:
                if isinstance(item, (LTTextBox, LTTextLine)):
                    text_list.append(item.get_text())

    return text_list
        for child in layout_obj:
            boxes.extend(find_textboxes_recursively(child))

        return boxes

    return []  # その他の場合は空リストを返す。


# Set the layout-analysis parameters; enable detection of vertical writing.
laparams = LAParams(detect_vertical=True)

# Create the resource manager that manages shared resources.
resource_manager = PDFResourceManager()

# Create the PageAggregator object that collects page layouts.
device = PDFPageAggregator(resource_manager, laparams=laparams)

# Create the Interpreter object.
interpreter = PDFPageInterpreter(resource_manager, device)

# Text file for output (currently disabled).
# output_txt = open('output.txt', 'w')


def print_and_write(txt):
    """Print ``txt`` to standard output.

    Mirroring the text into the output file is currently disabled; the
    commented-out lines below show the intended write.
    """
    print(txt)
    # output_txt.write(txt)
    # output_txt.write('\n')


with open(sys.argv[1], 'rb') as f:
Example #36
0
    def _extract_qp(self, file_name):
        """Cut every question out of a question-paper PDF and record it.

        For each page the horizontal text boxes are scanned to find the
        "Answer space" marker (which fixes the bottom of the crop), the
        "[n marks]" annotations and the "Answer all questions." header.
        Pages without a usable answer-space marker are treated as blank and
        skipped.  Each remaining page is rasterised, cropped, saved as
        ``q<N>.jpg`` in ``self._folder`` and appended to ``self._questions``
        as a ``Question``.  The total number of marks found is printed.

        Args:
            file_name: path of the question-paper PDF.
        """
        mark_sum = 0
        print("Extracting QP")

        # Load pdf
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        q_num = 0
        # `open` in a `with` block replaces the Python-2-only `file()` call
        # and guarantees the PDF handle is closed.
        with open(file_name, 'rb') as document:
            for i, page in enumerate(PDFPage.get_pages(document)):
                # Get page layout
                interpreter.process_page(page)
                layout = device.get_result()

                # Extract metadata
                textboxes = [
                    r for r in layout._objs
                    if isinstance(r, LTTextBoxHorizontal)
                ]
                work_out_y = 0
                answer_header = 0
                marks = []

                for t in textboxes:
                    text = t.get_text()
                    if "Answer space for question" in text:
                        work_out_y = int(t.y0)
                    elif "marks]" in text:
                        marks.extend(find_between(text, "[", " marks]"))
                    elif "mark]" in text:
                        marks.extend(find_between(text, "[", " mark]"))
                    elif text == "Answer all questions.\n":
                        # Extra vertical offset applied when the page starts
                        # with the "Answer all questions." header.
                        answer_header = 74
                    # Everything else is ignored on purpose: dotted answer
                    # lines, the "QUESTION"/"PART"/"REFERENCE" column
                    # headers and the "Do not write outside the box" /
                    # "Turn over" furniture.
                    # TODO: Find the correct amount of dots

                marks = [int(m) for m in marks]
                mark_sum += sum(marks)

                # Set crop positions
                x = 46 * Paper.QUALITY
                y = (66 + answer_header) * Paper.QUALITY
                width = 489 * Paper.QUALITY
                height = (761 - answer_header - work_out_y) * Paper.QUALITY

                # Check for blank pages before rasterising, so skipped
                # pages are never rendered.
                if height <= Paper.QUALITY or work_out_y <= 0:
                    continue

                # Convert page into image, crop and save it.  The context
                # manager releases the wand image once it is written.
                q_num += 1
                page_path = "{}[{}]".format(file_name, i)
                img_path = os.path.join(self._folder, "q{}.jpg".format(q_num))
                with Image(filename=page_path,
                           resolution=int(72 * Paper.QUALITY)) as img:
                    img.crop(x, y, width=width, height=height)
                    img.save(filename=img_path)

                # Add question to questions
                self._questions.append(Question(img_path, q_num, marks))

        print("Marks: {}".format(mark_sum))
Example #37
0
def parse_pdf(path=None,
              data=None,
              savePath=None,
              y_tolerance=1.5,
              char_tolerance=0.5):
    '''
    function : 处理pdf
    :param:词间最大间距,行间最大间距,输入路径,输出路径
    :return  无
    '''
    # 记录page行数
    pdfRowNumber = 0

    theMaxColSize = []

    wb = Workbook()
    ws = wb.active

    if data == None:
        data = open(path, 'rb')

    parser = PDFParser(data)
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        height = page.mediabox[3] - page.mediabox[1]
        layout = device.get_result()
        pageContainer, theMaxColNum = get_line_word(
            layout,
            height,
            y_tolerance=y_tolerance,
            char_tolerance=char_tolerance)
        # 按照位置信息排序
        for line in pageContainer:
            line.sort(key=itemgetter('x0'))
        pageContainer.sort(key=lambda line: line[0]['top'])

        if len(pageContainer[0]) < theMaxColNum:
            for i in range(len(pageContainer)):
                if len(pageContainer[i]) == theMaxColNum:
                    repairList = align_front_row(pageContainer[0:i],
                                                 theMaxColNum)
                    del pageContainer[0:i]
                    pageContainer.insert(0, repairList)
                    break
        # 对最后一排进行判断
        if len(pageContainer[-1]) < theMaxColNum:
            pageContainer[-1] = align_last_row(pageContainer[-2:],
                                               theMaxColNum)
        # 写入excel
        alignment = Alignment(horizontal='center', vertical='center')
        for idx, line in enumerate(pageContainer):
            for idy, item in enumerate(line):
                cellIndex = ws.cell(row=idx + 1 + pdfRowNumber, column=idy + 1)
                if item['text'] == '':
                    pass
                elif item['text'] == None:
                    ws.merge_cells(start_row=idx + 1 + pdfRowNumber,
                                   start_column=1,
                                   end_row=idx + 1 + pdfRowNumber,
                                   end_column=theMaxColNum)
                    ws.cell(idx + 1 + pdfRowNumber, 1).alignment = alignment
                    break
                else:
                    if idx == 0 and len(line) == 2:
                        pass
                    else:
                        cellIndex.alignment = alignment

                    if item['text'].isdigit():
                        cellIndex.value = int(item['text'])
                        cellIndex.number_format = '0'
                    elif is_float(item['text']):
                        cellIndex.value = float(item['text'])
                    else:
                        cellIndex.value = item['text']

        thePageMaxColSize = [0 for i in range(theMaxColNum)]
        for line in pageContainer:
            if len(line) == 2:
                continue
            for col, item in enumerate(line):
                if len(item['text']) > thePageMaxColSize[col]:
                    thePageMaxColSize[col] = len(item['text'])

        if theMaxColSize == []:
            theMaxColSize = thePageMaxColSize[:]
        else:
            for i in range(theMaxColNum):
                if theMaxColSize[i] < thePageMaxColSize[i]:
                    theMaxColSize[i] = thePageMaxColSize[i]
        # 将该页的行数相加,使excel连续
        pdfRowNumber += len(pageContainer)

    # 保存excel文件至本地
    letter = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    ]
    for col, theSize in enumerate(theMaxColSize):
        rest = (col + 1) % 26
        cut = int((col + 1) / 26)
        colLetter = ''
        if cut == 0:
            colLetter = letter[rest - 1]
        else:
            colLetter = letter[cut] + letter[rest - 1]
        ws.column_dimensions[colLetter].width = theSize * 2

    if savePath != None:
        wb.save(savePath)
    else:
        wb.save(path.replace('.pdf', '.xlsx'))
Example #38
0
def createDeviceInterpreter():
    """Build a page-aggregator device and the page interpreter that feeds it.

    Both objects share one freshly created resource manager; the device
    uses default layout parameters.

    Returns:
        tuple: ``(device, interpreter)``.
    """
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    return aggregator, PDFPageInterpreter(resource_manager, aggregator)
Example #39
0
def parse(_path):
    """Download a PDF and return the text of its first four pages.

    The request is sent with a User-Agent picked at random from the
    module-level ``user_agent`` list.  Only ``LTTextBoxHorizontal`` items
    contribute to the result.

    Args:
        _path: URL of the online PDF document.

    Returns:
        str: the concatenated text of the horizontal text boxes found on
        the first four pages (fewer if the document is shorter).

    Raises:
        PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Local imports keep this block self-contained.
    import contextlib
    import io

    # Pick a random element from the user_agent list for the header.
    request = Request(url=_path,
                      headers={'User-Agent': random.choice(user_agent)})

    # Read the whole body into memory: this closes the connection promptly
    # and gives the parser a seekable file object, which an HTTP response
    # stream is not.
    with contextlib.closing(urlopen(request)) as response:
        fp = io.BytesIO(response.read())

    # Create a PDF parser from the file object.
    praser_pdf = PDFParser(fp)

    # Create the PDF document.
    doc = PDFDocument()

    # Link the parser and the document object.
    praser_pdf.set_document(doc)
    doc.set_parser(praser_pdf)

    # Initialise without a password; pass one (e.g. doc.initialize("123456"))
    # for protected documents.
    doc.initialize()

    # Abort if the document does not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Resource manager for shared resources, layout parameters, the
    # aggregator device and the page interpreter.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    max_pages = 4  # the loop stops after this many pages have been read
    pieces = []
    for page_count, page in enumerate(doc.get_pages(), 1):
        # Interpret the page, then fetch its LTPage layout, which holds the
        # parsed objects (LTTextBox, LTFigure, LTImage, ...).
        interpreter.process_page(page)
        layout = device.get_result()

        for out in layout:
            if isinstance(out, LTTextBoxHorizontal):
                pieces.append(out.get_text())

        if page_count >= max_pages:
            break
    return ''.join(pieces)
Example #40
0
import calendar

# Load the three input tables.  `with` closes each file even if evaluating
# its content fails.
# SECURITY NOTE: eval() executes whatever the file contains.  Only use
# these files when they come from a trusted source; if they hold plain
# Python literals, ast.literal_eval is the safe replacement.
with open("hrefs.txt", "r", encoding="utf-8") as f:
    hrefs = eval(f.read())
with open("DOIs.txt", "r", encoding="utf-8") as f:
    DOIs = eval(f.read())
with open("month_year.txt", "r", encoding="utf-8") as f:
    month_year = eval(f.read())

# Silence pdfminer warnings: only errors are logged.
# NOTE(review): this sets an attribute on the `logging` module itself, not
# on a logger object - confirm whether a logger's `propagate` was meant.
logging.propagate = False
logging.getLogger().setLevel(logging.ERROR)
# NOTE(review): the device and the interpreter each get their own
# PDFResourceManager instance - confirm that not sharing one is intended.
device = PDFPageAggregator(PDFResourceManager(), laparams=LAParams())
interpreter = PDFPageInterpreter(PDFResourceManager(), device)

month_year_no = []
DOIs_no = []
texts = []
for i in range(len(hrefs)):
    for j in range(len(DOIs[i])):
        name = month_year[i].split(" ")[0] + "-" + month_year[i].split(
            " ")[1] + "-" + DOIs[i][j].replace("/", "")
        file_path = "C:/Users/Administrator/Desktop/AER/extract/" + month_year[
            i].split(" ")[0] + "-" + month_year[i].split(
                " ")[1] + "/" + name + ".pdf"
        if os.path.exists(file_path) and isValidPDF_pathfile(file_path):
            try:
                doc = PDFDocument()
Example #41
0
import os
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter

# An alternative is Py2PDF, but that module's encoding handling does not
# work very well.
# TODO: move this whole routine into tools
dirname = os.path.split(__file__)[0]

laparams = LAParams()
pdfrm = PDFResourceManager()
device = PDFPageAggregator(pdfrm, laparams=laparams)
interpreter = PDFPageInterpreter(pdfrm, device)

# Stays None when the document has no pages.
layout = None
# The pages are parsed lazily from the file, so all page processing has to
# happen while it is still open; `with` closes it afterwards.
with open("{}/../static/20180808CAPDJETJRJ_1.pdf".format(dirname), 'rb') as file:
    parser = PDFParser(file)
    document = PDFDocument(parser)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()

# NOTE(review): `layout` is overwritten on every iteration above, so only
# the items of the LAST page are printed - confirm that is intended.
if layout is not None:
    for i in layout:
        print(i)
Example #42
0
def rename_file(filename, path):
    """Validate a race-result PDF downloaded from drf.com and rename it.

    ``filename`` is either a raw download (e.g. ``TUP--04-30-2017 (1).pdf``)
    or an already renamed file (e.g. ``TUP170430_2_lb.pdf`` for Beyer
    figures, ``TUP170430_2_lt.pdf`` for times).  The track, date and race
    number are taken from the name, the PDF is parsed with ``parse_layout``
    to check that its content matches and to collect the horses, and raw
    downloads are copied to ``new_pdf_folder`` under the new name.  Files
    whose name or content does not match are copied to ``incorrectfiles``.

    Args:
        filename: name of the PDF file.
        path: directory containing ``filename``.

    Returns:
        ``(horses, new_name, top, bottom)`` on success, or
        ``(None, None, None, None)`` when the name is not recognised, a CSV
        for the race already exists, or the content does not match.  When
        the module-level ``write_flag`` is set, a matching file has its new
        name written to ``new_pdfs.txt`` and None is returned instead.
    """
    # Opened up front, as before (mode "w" truncates the file on every
    # call); the try/finally now closes it on every exit path.
    fw = open('new_pdfs.txt', "w") if write_flag else None
    try:
        # Already renamed file.
        # groups: 1-track, 2-year (yy), 3-month, 4-day, 5-race number, 6-ending
        progN = re.compile(r'([A-Z]+)(\d{2})(\d{2})(\d{2})_(\d+)_(l[bt])\.pdf')
        # File that has not been renamed yet.
        # groups: 1-track, 2-month, 3-day, 4-year, 5-download counter
        prog = re.compile(r'([A-Z]+)--(\d+)-(\d+)-(\d+) ?\(?(\d*)\)?\.pdf')
        source_path = os.path.join(path, filename)

        m = progN.fullmatch(filename)
        if m:
            num = int(m.group(5))
            # The closing parenthesis of str() used to sit after the year,
            # so an int was added to a str and every renamed file raised
            # TypeError.
            date = (months[int(m.group(3)) - 1] + ' ' +
                    str(int(m.group(4))) + ',  20' + m.group(2))
            track = race_abbrev[m.group(1)]
            new_name = filename
        else:
            m = prog.fullmatch(filename)
            if not m:
                shutil.copy(source_path,
                            os.path.join(incorrectfiles, filename))
                return None, None, None, None
            # "X.pdf" is race 1, "X (1).pdf" is race 2, and so on.
            num = int(m.group(5)) + 1 if m.group(5) else 1
            date = (months[int(m.group(2)) - 1] + ' ' +
                    str(int(m.group(3))) + ',  ' + m.group(4))
            track = race_abbrev[m.group(1)]
            new_name = (m.group(1) + m.group(4)[-2:] + m.group(2) +
                        m.group(3) + "_" + str(num))
            if timefolder in path:
                new_name += time_ending
            else:
                new_name += beyer_ending
            shutil.copy(source_path, os.path.join(new_pdf_folder, new_name))

        # Nothing to do when the race was already exported to CSV.
        if new_name.replace('.pdf', '.csv') in all_csvs_list:
            return None, None, None, None
        race = races[num - 1]

        unmatched = None
        horses = None
        top = None
        bottom = None
        # Open the file as a pdfminer layout to be parsed.
        with open(source_path, 'rb') as fp:
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Parse pages until the name, date and track were checked and
            # the list of horse names was collected.
            for page in doc.get_pages():
                if unmatched or horses:
                    break
                interpreter.process_page(page)
                layout = device.get_result()
                unmatched, horses, top, bottom = parse_layout(
                    layout, None, [], None, None, race, date, track)

        if unmatched == []:
            if write_flag:
                fw.write(new_name + "\n")
                # NOTE(review): in write mode the function returns None
                # rather than the 4-tuple - confirm callers expect that.
                return None
            return horses, new_name, top, bottom

        # Content did not contain the expected key words.
        shutil.copy(source_path, os.path.join(incorrectfiles, filename))
        return None, None, None, None
    finally:
        if fw is not None:
            fw.close()
Example #43
0
    def _extract_ms(self, file_name):
        """Render and crop the mark-scheme page of a PDF (work in progress).

        Only the page with index 3 is processed: it is rasterised onto a
        white background, cropped to the mark-scheme column area and saved
        as ``m3.png`` in ``self._folder``.  The text boxes of the page are
        scanned for the table headers ("Solution", "Mark", "Total",
        "Comment") to locate the top of the mark scheme; every other text
        is printed for inspection.

        Side effects: resets ``self._new_questions``, reverses
        ``self._questions`` in place and pops its last element.

        Args:
            file_name: path of the mark-scheme PDF.
        """
        print("Extracting MS")

        self._new_questions = []
        self._questions.reverse()
        # Popped for the (not yet implemented) per-question matching.
        current_question = self._questions.pop()

        # Load PDF
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # `open` in a `with` block replaces the Python-2-only `file()` call
        # and guarantees the PDF handle is closed.
        with open(file_name, 'rb') as document:
            for i, page in enumerate(PDFPage.get_pages(document)):
                if i != 3:
                    continue

                # Convert pdf page to png, flattened onto white
                img_path = os.path.join(self._folder, "temp.png")
                with Image(filename=file_name + "[{}]".format(i),
                           resolution=int(72 * Paper.QUALITY)) as img:

                    with Image(width=img.width,
                               height=img.height,
                               background=Color("white")) as background:
                        background.composite(img, 0, 0)
                        background.save(filename=img_path)

                # Load png to crop
                with Image(filename=img_path) as img:
                    # Set initial crop
                    x = 28 * Paper.QUALITY
                    y = 0
                    width = 567 * Paper.QUALITY
                    height = img.height

                    # Get page layout
                    interpreter.process_page(page)
                    layout = device.get_result()

                    # Extract metadata
                    textboxes = [
                        r for r in layout._objs
                        if isinstance(r, LTTextBoxHorizontal)
                    ]

                    ms_top = 0

                    for textbox in textboxes:
                        text = textbox.get_text()

                        if text in [
                                "Solution \n", "Mark \n", "Total \n",
                                "Comment \n"
                        ]:
                            # The position lives on the text box; `text` is
                            # a plain string and has no `y0`, so reading it
                            # from there raised AttributeError.
                            if ms_top == textbox.y0 or ms_top == 0:
                                ms_top = textbox.y0
                            # else: a second header row on the same page.
                            # TODO: crop up to this point and reset
                            # (2 questions on page)
                        else:
                            print(repr(text))

                    path = os.path.join(self._folder, "m{}.png".format(i))
                    img.crop(x, y, width, height)
                    img.save(filename=path)

                    # TODO: split the page per question ("Q<n>" text boxes)
                    # and save each part as m<n>; `ms_top` is not used yet.

                os.remove(img_path)
Example #44
0
# 'rb'  --> open in binary read mode
# 'ab+' --> open in binary read/write (append) mode

# To read a PDF from the network instead:
#from urllib.request import urlopen
#fp = urlopen('http://gk.xmu.edu.cn/_upload/article/files/8f/f0/03862022456bb3a0421b8be3223d/90eeb364-6195-4cd3-958b-5bcc0bb71b99.pdf')

# The pages are parsed lazily from the file, so everything happens while it
# is open; `with` makes sure it is closed afterwards.
with open('A Compact and Embedded Balanced.pdf', 'rb') as fp:
    parser = PDFParser(fp)  # create a parser associated with the file

    doc = PDFDocument()  # the PDF document object

    parser.set_document(doc)  # link the parser to the document
    doc.set_parser(parser)    # link the document to the parser

    doc.initialize('')  # initialise; the document has no password

    resource = PDFResourceManager()  # create the PDF resource manager

    laparam = LAParams()  # layout-analysis parameters

    # aggregator = resource manager + layout parameters
    device = PDFPageAggregator(resource, laparams=laparam)

    interpreter = PDFPageInterpreter(resource, device)  # page interpreter

    for page in doc.get_pages():  # iterate over the document's pages
        interpreter.process_page(page)  # read the page with the interpreter
        layout = device.get_result()  # fetch the layout from the aggregator
        for out in layout:
            if hasattr(out, 'get_text'):  # only objects that carry text
                print(out.get_text())
Example #45
0
def _append_log_line(path, line):
    """Append ``line`` followed by CRLF to the text file at ``path``."""
    with open(path, 'a+') as log_file:
        log_file.write(line + "\r\n")


def _read_horizontal_text_boxes(pdf_path):
    """Return the text of each LTTextBoxHorizontal in the PDF at ``pdf_path``.

    The texts are collected page by page in the order the layout yields them.
    The file is closed before returning.
    """
    content = []
    with open(pdf_path, 'rb') as pdf_file:
        parser = PDFParser(pdf_file)
        document = PDFDocument()
        # The parser and the document must reference each other.
        parser.set_document(document)
        document.set_parser(parser)
        document.initialize()
        resource = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(resource, laparams=laparams)
        interpreter = PDFPageInterpreter(resource, device)
        for page in document.get_pages():
            # The interpreter hands the processed page to the device.
            interpreter.process_page(page)
            for raw_data in device.get_result():
                # Only horizontal text boxes are of interest here.
                if isinstance(raw_data, LTTextBoxHorizontal):
                    content.append(raw_data.get_text())
    return content


def scan_PDF():
    """Scan the PDFs under ``exe_path`` and record their check-box answers.

    For every ``.pdf`` / ``.PDF`` file found by walking ``exe_path``:

    * the company code and short name are read from the text box whose first
      space-separated token contains "公司代码";
    * every text box mentioning "重大缺陷", "重要缺陷" or "一致" that is
      followed by a text box containing "√" yields one
      ``[sec_code, sec_name, question, answer]`` row;
    * if any answer counts as abnormal, the code and name are appended to
      ``Final_Result.txt``;
    * if no company-code box was recognised, or processing raised, the PDF
      path is appended to ``Error_Log.txt`` for manual review.

    All collected rows are finally written to ``Log_File.xlsx``. Output files
    are created in ``exe_path``.
    """
    final_result_path = os.path.join(exe_path, 'Final_Result.txt')
    error_log_path = os.path.join(exe_path, 'Error_Log.txt')

    log = []
    # NOTE(review): sec_code / sec_name are not reset per file, so a PDF
    # without a company-code box reuses the values of the previous PDF (or
    # raises NameError for the first one). Kept as in the original.
    for root, _dirs, filenames in os.walk(exe_path):
        for file_name in filenames:
            if os.path.splitext(file_name)[1] not in ('.pdf', '.PDF'):
                continue
            # Join with the directory being walked, not with exe_path, so
            # PDFs in subfolders can actually be opened.
            pdf_path = os.path.join(root, file_name)
            try:
                content = _read_horizontal_text_boxes(pdf_path)

                tag = 0  # number of abnormal answers found in this PDF
                validation = -1  # stays -1 until the company code is found
                for i in range(len(content)):
                    line = content[i]
                    if "公司代码" in line.split(" ")[0]:
                        fields = line.split(":")
                        sec_code = fields[1].split(" ")[0]
                        print("公司代码: " + sec_code)
                        print("公司简称: " + fields[2])
                        sec_name = fields[2].strip()
                        validation = 0

                    mentions_keyword = ("重大缺陷" in line
                                        or "重要缺陷" in line
                                        or "一致" in line)
                    # A keyword in the very last box has no answer box after
                    # it; skip it instead of failing the whole file.
                    if not mentions_keyword or i + 1 >= len(content):
                        continue
                    answer_line = content[i + 1]
                    if "√" not in answer_line:
                        continue

                    if "一致" in line:
                        question = line.split(" ")[1]
                        if "√否" in answer_line:
                            answer = "不一致"
                            tag = tag + 1
                        elif "√是" in answer_line:
                            answer = "一致"
                        else:
                            continue
                    else:
                        heading = line.split(" ")[1]
                        if heading in ("重大缺陷", "重要缺陷"):
                            # The box holds only the heading, so the question
                            # text is taken from the start of the answer box.
                            question = answer_line.split(" ")[0]
                        else:
                            question = heading
                        if "√是" in answer_line:
                            answer = "是"
                            tag = tag + 1
                        elif "√否" in answer_line:
                            answer = "否"
                        else:
                            continue
                    log.append([sec_code, sec_name, question, answer])
                print(tag, validation)

                if tag > 0:  # at least one abnormal answer
                    _append_log_line(final_result_path,
                                     sec_code + " " + sec_name)

                if validation == -1:  # company code never recognised
                    _append_log_line(error_log_path, pdf_path)
            except Exception:
                # Any failure while reading or interpreting this PDF sends it
                # to the error log. KeyboardInterrupt and SystemExit are
                # deliberately not swallowed.
                _append_log_line(error_log_path, pdf_path)

    workbook = xlsxwriter.Workbook(os.path.join(exe_path, "Log_File.xlsx"))
    worksheet = workbook.add_worksheet("Log_File")

    columns = ['Sec_Code', 'Sec_Name', 'Question', 'Answer']
    for col, title in enumerate(columns):
        worksheet.write(0, col, title)

    # Row 0 is the header, so data rows start at 1.
    for row, record in enumerate(log, start=1):
        for col in range(4):
            worksheet.write(row, col, record[col])

    workbook.close()

    print("识别完成")
    os.system('pause')
Example #46
0
class PdfFileParser(object):
    """Holds the pdfminer objects and output settings for one input PDF."""

    def __init__(self,
                 infile,
                 outfile=None,
                 password=None,
                 selectedpages=None,
                 maxSplit=3,
                 W=1440.0,
                 H=1080.0,
                 outputJson=False,
                 trimbox=None,
                 trimboxes=None,
                 exclude=False,
                 debug=0):
        """Open ``infile`` and prepare parsing and output state.

        When ``ENABLE_PICKLE`` is set and ``<infile>.pickle`` exists, the
        saved configuration is loaded. If its stored arguments match the
        current ones, the cached ``pagesCoords`` are reused and no layout
        device or interpreter is created.

        If ``outfile`` is not given it is derived from ``infile`` as
        ``<name>-out<ext>``, with ``(<selectedpages>)`` inserted before the
        extension when pages were selected. If the resulting path already
        exists, the first free ``<name>(<i>)<ext>`` is used instead.
        """
        # Must stay the first statement: locals() is evaluated here, while it
        # still contains only the constructor arguments.
        self.args = {
            a[0]: a[1]
            for a in locals().items() if a[0] not in ['self', 'outputJson']
        }
        self.outputJson = outputJson
        self.DEBUG = debug
        self.picklefile = infile + '.pickle'
        self.selectedpages = selectedpages
        self.pickleLoaded = False
        self.savedconfig = None
        self.coords = []
        self.pagesCoords = []
        self.trimbox = trimbox
        self.trimboxes = trimboxes
        self.exclude = exclude

        self.pageRanges = SelectedPages(selectedpages)

        if ENABLE_PICKLE and os.path.isfile(self.picklefile):
            try:
                with open(self.picklefile, 'rb') as f:
                    # NOTE(security): pickle.load can execute arbitrary code;
                    # the cache file must come from a trusted location.
                    self.savedconfig = pickle.load(f)
                    savedargs = self.savedconfig['args']
                    equal = True
                    for k, v in self.args.items():
                        if k == 'selectedpages':
                            # Page selections are compared through
                            # SelectedPages membership, not by equality.
                            if v not in SelectedPages(savedargs[k]):
                                equal = False
                        elif k not in savedargs:
                            equal = False
                        elif v != savedargs[k]:
                            equal = False
                        if not equal:
                            break

                    if equal:
                        self.pickleLoaded = True
                        self.pagesCoords = self.savedconfig['pagesCoords']
            except Exception as e:
                # An unusable cache is not fatal: report it and parse afresh.
                print(e)

        self.fname = infile
        self.W = float(W)
        self.H = float(H)
        self.maxSplit = maxSplit
        self.outfile = outfile
        if self.outfile is None:
            outFilename, outExt = os.path.splitext(infile)
            self.outfile = outFilename + '-out' + outExt
            if self.selectedpages is not None and self.selectedpages != '':
                outFilename, outExt = os.path.splitext(self.outfile)
                self.outfile = '%s(%s)%s' % (outFilename, self.selectedpages,
                                             outExt)
        if os.path.isfile(self.outfile):
            i = 1
            outfile, outExt = os.path.splitext(self.outfile)
            while os.path.isfile("%s(%d)%s" % (outfile, i, outExt)):
                i += 1
            # Use the same "(i)" pattern that the loop just verified is free.
            # The original assigned "%s%d%s", a name that was never checked.
            self.outfile = "%s(%d)%s" % (outfile, i, outExt)

        self.password = password
        # NOTE(review): the original comment here read "1 base vs 2 base";
        # the -1 presumably converts between page-number bases. Confirm
        # against SelectedPages.getEndPage.
        self.endPage = self.pageRanges.getEndPage(
            30000) - 1

        self.inFile = open(self.fname, 'rb')
        self.parser = PDFParser(self.inFile)
        self.document = PDFDocument(self.parser)
        self.rsrcmgr = PDFResourceManager()
        self.laparams = LAParams()
        if not self.pickleLoaded:
            # Layout analysis is only needed when no cached coordinates
            # were loaded.
            self.device = PDFPageAggregator(self.rsrcmgr,
                                            laparams=self.laparams)
            self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
            self.pagesEnumerator = enumerate(
                PDFPage.create_pages(self.document))
Example #47
0
from pdfminer.layout import *
from pdfminer.converter import PDFPageAggregator

fp = open('D:\\temp\\pdf_html\\data\\sub\\1202098108.pdf', 'rb')
#来创建一个pdf文档分析器
parser = PDFParser(fp)
#创建一个PDF文档对象存储文档结构
document = PDFDocument(parser)
# 检查文件是否允许文本提取
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
else:
    # 创建一个PDF资源管理器对象来存储共赏资源
    rsrcmgr = PDFResourceManager()
    # 设定参数进行分析
    laparams = LAParams()
    # 创建一个PDF设备对象
    # device=PDFDevice(rsrcmgr)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # 创建一个PDF解释器对象
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # 处理每一页
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # 接受该页面的LTPage对象
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                print x, x.get_text()
                # with open('a.txt','a') as f:
                #     f.write(x.get_text().encode('utf-8')+'\n')
Example #48
0
    def get_columns(fname):
        """Find text boxes and candidate column-separator rectangles in a PDF.

        Each page starts with one candidate rectangle spanning x 150..350 over
        the full page height (the code assumes a single rectangle that
        partitions the page into two sections). Every horizontal text box that
        overlaps a candidate trims or splits it. Per page, either the one
        candidate that still spans the full height is kept, or the tallest
        candidate touching the bottom edge and the tallest touching the top
        edge.

        Pages are stacked in one coordinate system of 900 units per page, with
        the first page on top. Text-box ``bbox`` values are rewritten in place
        into that system.

        :param fname: path of the PDF file
        :return: ``(text_boxes, part_rects, num_pages)`` where ``text_boxes``
            are all horizontal text boxes of the document and ``part_rects``
            are the retained rectangles as ``(x0, y0, x1, y1)`` tuples. When
            the column score is below 0.4 only the first rectangle (if any)
            is returned.
        """
        leftmost_x = 150
        rightmost_x = 350
        minimum_height_top = 150
        minimum_height_bottom = 250

        round_up = 4

        def strips_above_and_below(part_rect, bbox, allow_top, allow_bottom):
            """Return the parts of part_rect above/below bbox that are tall enough."""
            strips = []
            if allow_top and (part_rect[3] - bbox[3]) > minimum_height_top:
                strips.append(
                    (part_rect[0], bbox[3], part_rect[2], part_rect[3]))
            if allow_bottom and (bbox[1] -
                                 part_rect[1]) > minimum_height_bottom:
                strips.append(
                    (part_rect[0], part_rect[1], part_rect[2], bbox[1]))
            return strips

        doc_part_rects = []
        doc_text_boxes = []

        # Pages are read from the open file while they are processed, so the
        # whole page loop stays inside the `with` block, which closes the file
        # afterwards.
        with open(fname, 'rb') as document:
            rsrcmgr = PDFResourceManager()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            pages = list(PDFPage.get_pages(document))
            num_pages = len(pages)

            for page_num, page in enumerate(pages):
                interpreter.process_page(page)
                layout = device.get_result()

                page_text_boxes = []
                page_part_rects = []
                doc_text_boxes.append(page_text_boxes)
                doc_part_rects.append(page_part_rects)

                # Vertical extent of this page in the stacked coordinates.
                topmost_y = 900 * (num_pages - page_num)
                bottommost_y = 900 * (num_pages - page_num - 1)

                part_rects = [(leftmost_x, bottommost_y, rightmost_x,
                               topmost_y)]
                # Narrowest elements first.
                layout = sorted(layout,
                                key=lambda element:
                                (element.bbox[2] - element.bbox[0]))

                # Let each text box eliminate or trim the candidates it
                # overlaps.
                for element in layout:
                    if not isinstance(element, LTTextBoxHorizontal):
                        continue

                    # NOTE(review): under Python 2 integer division this snaps
                    # y to a multiple of round_up; under Python 3 "/" is true
                    # division and only the int() truncation has an effect.
                    # Left unchanged - confirm the intended behaviour.
                    element.bbox = list(element.bbox)
                    element.bbox[1] = (int(element.bbox[1]) /
                                       round_up) * round_up + bottommost_y
                    element.bbox[3] = (int(element.bbox[3]) /
                                       round_up) * round_up + bottommost_y
                    bbox = element.bbox

                    page_text_boxes.append(element)

                    new_part_rects = []
                    for part_rect in part_rects:
                        if not overlap_rects(part_rect, bbox):
                            new_part_rects.append(part_rect)
                            continue

                        at_top = part_rect[3] == topmost_y
                        at_bottom = part_rect[1] == bottommost_y

                        if bbox[0] <= part_rect[0] and bbox[2] >= part_rect[2]:
                            # The box covers the candidate's full width: only
                            # the strips above and below it can survive.
                            new_part_rects.extend(
                                strips_above_and_below(part_rect, bbox, True,
                                                       True))

                        elif bbox[0] <= part_rect[0] and bbox[2] > part_rect[0]:
                            # The box covers the left side: keep the right
                            # remainder.
                            new_part_rects.append(
                                (bbox[2], part_rect[1], part_rect[2],
                                 part_rect[3]))
                            new_part_rects.extend(
                                strips_above_and_below(part_rect, bbox, at_top,
                                                       at_bottom))

                        elif bbox[0] < part_rect[2] and bbox[2] >= part_rect[2]:
                            # The box covers the right side: keep the left
                            # remainder.
                            new_part_rects.append(
                                (part_rect[0], part_rect[1], bbox[0],
                                 part_rect[3]))
                            new_part_rects.extend(
                                strips_above_and_below(part_rect, bbox, at_top,
                                                       at_bottom))

                        elif bbox[0] > part_rect[0] and bbox[2] < part_rect[2]:
                            # The box lies strictly inside horizontally: keep
                            # both side remainders.
                            new_part_rects.append(
                                (part_rect[0], part_rect[1], bbox[0],
                                 part_rect[3]))
                            new_part_rects.append(
                                (bbox[2], part_rect[1], part_rect[2],
                                 part_rect[3]))
                            new_part_rects.extend(
                                strips_above_and_below(part_rect, bbox, at_top,
                                                       at_bottom))

                        elif bbox[0] == part_rect[2] or bbox[2] == part_rect[0]:
                            # The box only touches a vertical edge.
                            new_part_rects.append(part_rect)
                        else:
                            print(part_rect)
                            print(bbox)
                            raise Exception(
                                "Unhandled case in overlaping rectangles")

                    part_rects = new_part_rects

                largest_lower_rect_height = 0
                largest_upper_rect_height = 0
                largest_lower_rect = None
                largest_upper_rect = None
                single_largest_rect_present = False
                single_largest_rect = None

                for part_rect in part_rects:
                    if (part_rect[1] == bottommost_y
                            and part_rect[3] == topmost_y):
                        # A candidate spanning the full page height wins.
                        single_largest_rect_present = True
                        single_largest_rect = part_rect
                        break
                    elif part_rect[1] == bottommost_y and (
                            part_rect[3] -
                            part_rect[1]) > largest_lower_rect_height:
                        largest_lower_rect = part_rect
                        largest_lower_rect_height = (part_rect[3] -
                                                     part_rect[1])
                    elif part_rect[3] == topmost_y and (
                            part_rect[3] -
                            part_rect[1]) > largest_upper_rect_height:
                        largest_upper_rect = part_rect
                        largest_upper_rect_height = (part_rect[3] -
                                                     part_rect[1])

                if single_largest_rect_present:
                    page_part_rects.append(single_largest_rect)
                else:
                    if largest_lower_rect:
                        page_part_rects.append(largest_lower_rect)
                    if largest_upper_rect:
                        page_part_rects.append(largest_upper_rect)

        flat_doc_text_boxes = []
        flat_doc_part_rects = []

        column_score = 0
        for i in range(num_pages):
            page_boxes = doc_text_boxes[i]
            if page_boxes:
                text_section_height_score = (
                    max(element.bbox[3] for element in page_boxes) -
                    min(element.bbox[1] for element in page_boxes)) / 900
            else:
                # A page without text boxes contributes nothing to the score
                # (max()/min() of an empty list would raise).
                text_section_height_score = 0
            for part_rect in doc_part_rects[i]:
                column_score += (part_rect[3] -
                                 part_rect[1]) * text_section_height_score
                flat_doc_part_rects.append(part_rect)
            flat_doc_text_boxes.extend(page_boxes)

        # Guard against a document without pages.
        column_score = column_score / (900 * num_pages) if num_pages else 0
        if column_score < 0.4:
            # Keep only the first rectangle; the slice also copes with an
            # empty list.
            flat_doc_part_rects = flat_doc_part_rects[:1]

        return flat_doc_text_boxes, flat_doc_part_rects, num_pages
 def render_image(self, name, stream):
     """Delegate to PDFPageAggregator.render_image unless no image writer is set.

     Always returns None.
     """
     if self.imagewriter is not None:
         PDFPageAggregator.render_image(self, name, stream)
Example #50
0
 def __init__(self, filename, laparams = None):
     """Open ``filename`` for binary reading and build the layout pipeline.

     Stores the open file as ``self.fp``, a PDFPageAggregator as
     ``self.device`` and a PDFPageInterpreter as ``self.interpreter``, both
     sharing one PDFResourceManagerNew. ``self.val`` starts as an empty dict.
     """
     self.fp = open(filename, 'rb')
     rsrcmgr = PDFResourceManagerNew()
     aggregator = PDFPageAggregator(rsrcmgr, laparams=laparams)
     self.device = aggregator
     self.interpreter = PDFPageInterpreter(rsrcmgr, aggregator)
     self.val = {}
Example #51
0
    def load_file_text(self, import_file):
        """ Import the text of a file and store it as a new source.

        Supported types, chosen by file extension: odt, docx, epub, pdf,
        html/htm. If none of these produced any text, the file is read as
        plain text. A warning dialog is shown and nothing is stored when the
        file cannot be read, yields no text, or its name duplicates an
        existing source. Otherwise the text is inserted into the ``source``
        table, appended to ``self.source`` and reported in
        ``self.parent_textEdit``.

        :param import_file: path of the file to import; the part after the
            last "/" is used as the source name
        :return: None
        """

        text = ""
        lowered = import_file.lower()

        # Import from odt
        if lowered.endswith(".odt"):
            text = self.convert_odt_to_text(import_file)
        # Import from docx
        if lowered.endswith(".docx"):
            document = opendocx(import_file)
            text = "\n".join(getdocumenttext(document))
        # Import from epub
        if lowered.endswith(".epub"):
            book = epub.read_epub(import_file)
            for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                body_html = d.get_body_content().decode('utf-8')
                text += html_to_text(body_html) + "\n"
        # Import from PDF
        if lowered.endswith(".pdf"):
            # The with-block closes the file once every page is processed.
            with open(import_file, 'rb') as fp:
                parser = PDFParser(fp)
                doc = PDFDocument()
                parser.set_document(doc)
                doc.set_parser(parser)
                # potential error with encrypted PDF
                doc.initialize('')
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                laparams.char_margin = 1.0
                laparams.word_margin = 1.0
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                pieces = []
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    for lt_obj in device.get_result():
                        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                            pieces.append(lt_obj.get_text())
                text += "".join(pieces)
        # Import from html
        if lowered.endswith((".html", ".htm")):
            with open(import_file, "r") as sourcefile:
                text = html_to_text(sourcefile.read())
            # The original always showed a "0 lines not imported" warning
            # here although nothing can be skipped in this branch; that
            # spurious dialog is removed.
        # Nothing imported so far: try reading the file as plain text.
        if text == "":
            try:
                with open(import_file, "r") as sourcefile:
                    text = sourcefile.read()
            except Exception as e:
                QtWidgets.QMessageBox.warning(None, _('Warning'),
                    _("Cannot import ") + str(import_file) + "\n" + str(e))
                return
            # Strip a leading byte-order mark (associated with notepad files).
            # The BOM is one character, so compare and cut exactly one.
            if text.startswith("\ufeff"):
                text = text[1:]
        # Import as a text file produced nothing either.
        if text == "":
            # No exception object exists at this point (the name bound by
            # "except ... as e" is unbound outside its handler), so only the
            # file name is reported.
            QtWidgets.QMessageBox.warning(None, _('Warning'),
                _("Cannot import ") + str(import_file))
            return
        # Final checks: check for duplicated filename and update model, widget and database
        filename = import_file.split("/")[-1]
        if any(d['name'] == filename for d in self.source):
            QtWidgets.QMessageBox.warning(None, _('Duplicate file'),
                _("Duplicate filename.\nFile not imported"))
            return
        entry = {'name': filename, 'id': -1, 'fulltext': text, 'mediapath': None, 'memo': "",
        'owner': self.settings['codername'], 'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        cur = self.settings['conn'].cursor()
        cur.execute("insert into source(name,fulltext,mediapath,memo,owner,date) values(?,?,?,?,?,?)",
            (entry['name'],  entry['fulltext'], entry['mediapath'], entry['memo'], entry['owner'], entry['date']))
        self.settings['conn'].commit()
        # Fetch the id the database assigned to the new row.
        cur.execute("select last_insert_rowid()")
        entry['id'] = cur.fetchone()[0]
        self.parent_textEdit.append(entry['name'] + _(" imported."))
        self.source.append(entry)
Example #52
0
 def __init__(self, rsrcmgr, pageno=1, laparams=None):
     """Initialise the PDFPageAggregator base, then the row and page state.

     ``self.rows`` starts as an empty list and ``self.page_number`` as 0.
     """
     PDFPageAggregator.__init__(
         self, rsrcmgr, pageno=pageno, laparams=laparams)
     self.rows = list()
     self.page_number = 0
Example #53
0
parser = PDFParser(fp)
# # Create a PDF document object that stores the document structure.
doc = PDFDocument()
# # Connect the parser and document objects.
parser.set_document(doc)
doc.set_parser(parser)
# Supply the password for initialization.
# # (If no password is set, give an empty string.)
# doc.initialize(password)
# # Check if the document allows text extraction. If not, abort.
#if not doc.is_extractable:
#     raise PDFTextExtractionNotAllowed
#     # Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
#     # Create a PDF device object.
#device = PDFDevice(rsrcmgr)
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
#     # Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
#     # Process each page contained in the document.
text_content = []
for page in doc.get_pages():
    interpreter.process_page(page)
    lt_objs = device.get_result()
    for lt_obj in lt_objs:
        if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
            #page_text = update_page_text_hash(page_text, lt_obj)
            print lt_obj.get_text()
            print "\n \n new =================="
Example #54
0
class StatementReader:
    """
    Reader Class of a pdf statement

    The file is opened in the constructor and must be released with
    :meth:`close_statement`.
    """
    def __init__(self, file: Union[str, Path]):
        """
        Open the statement and set up the pdfminer layout pipeline
        :param file: path of the pdf statement
        """
        self.start_balance = None
        self.transaction_list = []
        self.f = open(file, 'rb')
        resource_manager = PDFResourceManager()
        params = LAParams()
        self.device = PDFPageAggregator(resource_manager, laparams=params)
        self.interpreter = PDFPageInterpreter(resource_manager, self.device)

    def read_statement(self):
        """
        Read the pdf statement pages
        :return: statement pages, one string iterator per page
        """
        # Process each page contained in the statement.
        page_list = []
        for page in PDFPage.get_pages(self.f):
            str_list = self.read_page(page)

            page_list.append(str_list)
        return page_list

    def read_page(self, page: PDFPage):
        """
        Read a page from the statement
        :param page: statement page
        :return: iterator over the strings of the page, sorted by row and
            column; empty when the page holds no characters
        """
        characters = []
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        for box in layout:
            if isinstance(box, LTTextBoxHorizontal):
                characters.extend(extract_characters(box))
        # Create list of characters
        char_list = [
            Char(char) for char in characters if isinstance(char, LTChar)
        ]
        # A page without characters has no rows to build; indexing
        # char_list[0] below would raise IndexError.
        if not char_list:
            return iter([])
        char_list = sorted(char_list, key=lambda char: char.y0, reverse=True)
        # Attribute a row number to each character: a new row starts when the
        # vertical gap to the previous character exceeds half a char height.
        char_list[0].row = 0
        for i in range(1, len(char_list)):
            if (char_list[i - 1].y0 - char_list[i].y0) > CHAR_HEIGHT / 2:
                char_list[i].row = char_list[i - 1].row + 1
            else:
                char_list[i].row = char_list[i - 1].row
        char_list = sorted(char_list, key=lambda char: (char.row, char.x0))
        # Create list of strings: consecutive characters sharing row and
        # column are merged into one String.
        # NOTE(review): the scan starts at index 1, so the text of
        # char_list[0] is never added, and the String still being built when
        # the characters run out is never appended. Kept as in the original -
        # confirm whether this is intended.
        str_list = []
        previous_row = char_list[0].row
        previous_col = char_list[0].col
        i = 1
        while i < len(char_list):
            current_row = char_list[i].row
            current_col = char_list[i].col
            current_col_name = char_list[i].col_name
            string = String(current_row, current_col, current_col_name)
            while i < len(char_list):
                if (char_list[i].row == previous_row
                        and char_list[i].col == previous_col):
                    # A gap wider than CHAR_WIDTH becomes a space.
                    if (char_list[i].x0 - char_list[i - 1].x1) > CHAR_WIDTH:
                        string.text = ' '.join(
                            (string.text, char_list[i].text))
                    else:
                        string.text = ''.join((string.text, char_list[i].text))
                else:
                    # Row or column changed: close the current string. i is
                    # not advanced, so this character starts the next string.
                    previous_row = char_list[i].row
                    previous_col = char_list[i].col
                    string.clean()
                    str_list.append(string)
                    break
                i = i + 1
        str_list = iter(sorted(str_list, key=lambda x: (x.row, x.col)))
        return str_list

    def get_statement_details(self):
        """
        Map the strings from the all the statement pages to attributes
        """
        page_list = self.read_statement()
        for str_list in page_list:
            self.get_transaction_details(str_list)

    def get_transaction_details(self, str_list: Iterator[String]):
        """
        Map the strings from the page to attributes

        Strings are skipped until 'BALANCE BROUGHT FORWARD'; the rows between
        it and 'BALANCE CARRIED FORWARD' are turned into entries of
        ``self.transaction_list``. The first brought-forward balance seen is
        stored in ``self.start_balance``.
        :param str_list: string list of a page
        :raises ValueError: when a string has an unknown column name
        """
        while True:
            string = next(str_list, None)
            if string is None:
                break
            # First BALANCE BROUGHT FORWARD
            elif string.text == 'BALANCE BROUGHT FORWARD':
                string = next(str_list)
                if self.start_balance is None:
                    # Some time, there is a '.' in the first line so we pass it
                    if string.text == '.':
                        string = next(str_list)
                    self.start_balance = to_float(string.text)
                string = next(str_list)
                # Last BALANCE BROUGHT FORWARD
                while string.text != 'BALANCE CARRIED FORWARD':
                    current_row = string.row
                    new_transaction = False
                    date = None
                    method_symbol = None
                    entity = None
                    amount = 0
                    # Consume every string of the current row.
                    while string.row == current_row:
                        if string.text == 'BALANCE CARRIED FORWARD':
                            break
                        if string.col_name == 'date':
                            date = to_date_str(to_date(string.text))
                        elif string.col_name == 'payment_type':
                            # A payment type marks the start of a transaction.
                            method_symbol = string.text
                            new_transaction = True
                        elif string.col_name == 'entity':
                            entity = string.text
                        elif string.col_name == 'paid_out':
                            amount = amount - to_float(string.text)
                        elif string.col_name == 'paid_in':
                            amount = amount + to_float(string.text)
                        elif string.col_name == 'balance':
                            pass
                        else:
                            raise ValueError('col name not found')
                        string = next(str_list)
                    else:
                        # Runs only when the row ended normally (not via the
                        # break above): commit what the row described.
                        if new_transaction:
                            if date is None:
                                # Rows without a date reuse the previous one.
                                prev_transaction = self.transaction_list[-1]
                                date = prev_transaction['date']
                            transaction = dict(date=date,
                                               method=METHOD[method_symbol],
                                               method_symbol=method_symbol,
                                               entity=entity,
                                               amount=amount,
                                               ccy=CCY,
                                               account=ACCOUNT)
                            self.transaction_list.append(transaction)
                        else:
                            # Continuation row: extend the previous
                            # transaction's entity and replace its amount.
                            prev_transaction = self.transaction_list[-1]
                            prev_transaction['amount'] = amount
                            prev_transaction['entity'] = ' '.join(
                                (prev_transaction['entity'], entity))
                            self.transaction_list[-1] = prev_transaction
                else:
                    # 'BALANCE CARRIED FORWARD' reached: the page is done.
                    break

    def close_statement(self):
        """
        Close the pdf statement
        """
        self.f.close()
        self.device.close()
Example #55
0
def parse_pdf(pdf_path):
    """
    Read a PDF file and append its text to a sibling ``.txt`` file.

    The output path is ``pdf_path`` with its last three characters replaced
    by ``txt``; it is opened in append mode, so repeated runs accumulate.
    The text of every ``LTTextBoxHorizontal`` is written followed by a
    newline.  For ``LTFigure`` elements the text of their direct ``LTChar``
    children is written with no separator.  A summary of object counts is
    printed at the end.

    TODO: if the text cannot be read directly, try Tesseract (OCR library).

    :param pdf_path: path of the PDF file to read
    :return: None
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    # The input file is managed by ``with`` so it is closed on every path,
    # including when PDFTextExtractionNotAllowed is raised.
    with open(pdf_path, 'rb') as fp:  # open in binary read mode
        # Create a PDF parser from the file object.
        parser = PDFParser(fp)
        # Create a PDF document.
        doc = PDFDocument()
        # Connect the parser and the document object.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document; no password is supplied.
        doc.initialize()

        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # PDF device object (page aggregator).
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Counters for pages, images, curves, figures and horizontal
        # text boxes.
        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0

        # Destination text file, shared by both writing branches below.
        new_path = pdf_path[:-3] + 'txt'

        # Process the pages one at a time.
        for page in doc.get_pages():
            num_page += 1
            interpreter.process_page(page)
            # Receive the LTPage object for this page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTImage):  # image object
                    num_image += 1
                if isinstance(x, LTCurve):  # curve object
                    num_curve += 1
                if isinstance(x, LTFigure):  # figure object
                    # 2018.11.27: some PDF text is recognised as LTFigure
                    # objects, so that text is added to the result as well.
                    num_figure += 1
                    with open(new_path, 'a', encoding='utf8') as f:
                        # NOTE(review): the return value is discarded; the
                        # call is kept from the original - confirm whether
                        # it is needed.
                        x.get_textboxes()
                        for x_in in x:
                            if isinstance(x_in, LTChar):
                                f.write(x_in.get_text())

                if isinstance(x, LTTextBoxHorizontal):  # text content
                    num_TextBoxHorizontal += 1
                    with open(new_path, 'a', encoding='utf8') as f:
                        f.write(x.get_text() + '\n')

        # NOTE: num_figure is counted above but is not part of the summary.
        print('对象数量:\n','页面数:%s\n'%num_page,'图片数:%s\n'%num_image,'曲线数:%s\n'%num_curve,'水平文本框:%s\n'
              %num_TextBoxHorizontal)
Example #56
0
def parse(fileName):
    """
    Stamp a QR code onto selected pages of an uploaded PDF and save them.

    Reads ``upload_path + fileName + ".pdf"``.  For every page whose
    department code (``bgGetProductDp`` applied to the page's horizontal
    text boxes) is one of CWF, CWFS, CWFH, CWFN or CSB, an ``HmProduct`` is
    filled in from the page text.  When it has a uuid, the page is merged
    with the first page of the PDF produced by ``hmCreateQRCode`` and added
    to the output.  The output is written to ``download_path`` under a
    timestamped name built from the first non-empty sales order number.

    All file handles are released and the temporary QR-code PDFs removed
    even when processing fails part-way through.

    :param fileName: name of the uploaded PDF, without the ".pdf" suffix
    :return: file name (not the full path) of the generated PDF
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction
    """
    text_path = upload_path + fileName + ".pdf"
    hmPdfSaveName = ""
    # Department codes whose pages are kept; all other pages are skipped.
    gtList = ['CWF','CWFS','CWFH','CWFN','CSB']
    # Temporary QR-code PDFs.  As in the original code they stay open until
    # the output has been written, and are then closed and deleted.
    openFileArr = []

    with open(text_path, 'rb') as fileOpen:
        doc = PDFDocument()
        parser = PDFParser(fileOpen)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        # Original file, read a second time through PyPDF2 for page merging.
        hmPdfReaderPDF = PyPDF2.PdfFileReader(fileOpen)

        # Output document to be written.
        hmPdfWriter = PyPDF2.PdfFileWriter()
        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager, device and interpreter for layout analysis.
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        try:
            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()

                # Full text of the page: all horizontal text boxes joined.
                pdfTxt = ''.join(
                    x.get_text() for x in layout
                    if isinstance(x, LTTextBoxHorizontal))

                dpText = bgGetProductDp(pdfTxt)
                if dpText not in gtList:
                    continue

                # The helpers below are called in the original order; some
                # receive hmPd and may depend on fields set earlier.
                hmPd = HmProduct()
                # Production workshop.
                hmPd.productDp = dpText
                # Production order number.
                hmPd.productCasNum = bgGetProductInvoicesNum(pdfTxt)
                # Product number.
                hmPd.productNum = bgGetProductNumber(pdfTxt, hmPd)
                # Specification.
                hmPd.productSf = bgGetProductSpecification(pdfTxt, hmPd)
                # Sales order number.
                hmPd.productSealNum = bgGetProductSealNum(pdfTxt)
                # Order quantity / unit.
                hmPd.productCount = getHmProductCount(pdfTxt, hmPd)
                # Customer number.
                hmPd.productGuest = bgGetProductGuest(pdfTxt)
                # Product batch.
                hmPd.productBatch = bgGetProductBatch(pdfTxt)

                # Product description (Chinese).
                hmPd.productRamk = bgGetProductDetilRamk(pdfTxt, hmPd)
                # Generate the production order uuid.
                hmPd.hm_pd_uuid = bgGetPageMd5(hmPd)

                # Update the production order date from the master table.
                #updateOutDate(hmPd,pdfTxt)

                # ---------------- 1. generate file: START ----------------
                payStr = ''
                # layout.pageid is decremented to index into the PyPDF2
                # reader.
                layoutPageId = layout.pageid - 1

                # Generate the 'E' sheet.
                if hmPd.hm_pd_uuid:
                    newPage = hmPdfReaderPDF.getPage(layoutPageId)
                    ePatch = hmCreateQRCode(hmPd, 'E', payStr)
                    eMarkFile = open(ePatch, 'rb')
                    # Registered immediately so it is cleaned up even if the
                    # merge below fails.
                    openFileArr.append(eMarkFile)
                    pdfECodePage = PyPDF2.PdfFileReader(eMarkFile)
                    newPage.mergePage(pdfECodePage.getPage(0))
                    hmPdfWriter.addPage(newPage)
                    del newPage
                    del pdfECodePage
                    gc.collect()

                # Use the sales order number as the file name.
                if hmPdfSaveName == "":
                    hmPdfSaveName = hmPd.productSealNum
                # ---------------- 1. generate file: END ----------------

            # Save the result once every page has been processed.
            nowTimeStr = datetime.datetime.now().strftime("%Y%m%d%H%M%S_s")
            hmPdfSaveName = nowTimeStr + "_" + hmPdfSaveName + ".pdf"
            hmPdfSavePath = download_path + hmPdfSaveName
            with open(hmPdfSavePath, 'wb') as resultPdfFile:
                hmPdfWriter.write(resultPdfFile)
        finally:
            # Close and delete the temporary QR-code PDFs on every path.
            for closeItem in openFileArr:
                closeItem.close()
                os.remove(closeItem.name)
            openFileArr.clear()

    return hmPdfSaveName
Example #57
0
def extract_text_from_pdf(pdf_path):
    """Extract parts-list blocks from a PDF into a nested dictionary.

    Pages whose zero-based index lies strictly between 31 and 923 are laid
    out with PDFMiner.  Every LTTextBox / LTTextLine element is fed through
    a small state machine driven by marker texts: "NOTE", "PART NUMBER",
    "PART NAME", "QTY" and "SEE" open a table column, and "<END>" closes the
    current block.

    While no table column is open, element texts accumulate under the
    block's "Super" key.  At "<END>" the collected "Part Numbers", "QTYs"
    and "PART NAMEs" columns are combined into a "Parts/Components" mapping
    (part name -> {"q": quantity, "p/n": part number}) when their sliced
    lengths agree, and the raw columns are deleted.  If a "Super" entry
    starts with "S/N", the block is re-keyed under the "Super" entry at the
    preceding index and its "Metadata_<n>" key is removed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The dictionary of blocks when it is non-empty; otherwise the
        function falls through and returns None implicitly.
    """
    new_dict = {}  # extracted data as key/value pairs (the return value)
    lines = []  # alternative flat storage; not read again in this function
    counter = 1  # incremented each time an <END> marker closes a block
    a = []  # accumulator list for whichever section is currently open

    # State flags: `table` gates all column flags; the others select which
    # column the current text is appended to.
    table = False
    partNo = False
    notes = False
    qty = False
    partname = False
    see = False
    ending = True  # set below but never read in this function
    new_dict["Metadata_%d" % counter] = {}

    # PDFMiner objects
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()  # only closed at the end, never written
    codec = 'utf-8'  # not used below
    laparams = LAParams()
    converter = PDFPageAggregator(resource_manager, laparams=laparams)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    w = 0  # number of text elements processed; not read elsewhere

    with open(pdf_path, 'rb') as fh:
        #text_from_pdf = open('text2FromPdf.txt','w')
        for pageNumber, page in enumerate(
                PDFPage.get_pages(fh, caching=True, check_extractable=True)):
            # Hard-coded range: only zero-based page indices 32..922.
            if pageNumber > 31 and pageNumber < 923:
                page_interpreter.process_page(page)
                layout = converter.get_result()
                for element in layout:
                    if isinstance(element, LTTextBox) or isinstance(
                            element, LTTextLine):
                        # NOTE(review): extend() with a str adds one list
                        # item per character.
                        lines.extend(element.get_text().strip())

                        # NOTE(review): split()[0] raises IndexError when the
                        # element text is empty or whitespace-only.
                        if (element.get_text().split()[0] == "<END>"):
                            # End of block: finalise the current Metadata
                            # entry.  Only `table` is cleared; the column
                            # flags keep their last value and are gated by
                            # `table` further down.
                            # NOTE(review): the lookups below raise KeyError
                            # if the block never stored "Part Numbers",
                            # "QTYs", "PART NAMEs" or "Super".
                            #print(element.get_text().strip())
                            a = []
                            table = False
                            #see=False
                            #notes=False
                            #partNo=False
                            #qty=False
                            #partname=False
                            ending = True
                            # Drop the first entry - presumably the column
                            # header; TODO confirm.
                            pno = new_dict["Metadata_%d" %
                                           counter]["Part Numbers"][
                                               1:]  #Delete??,check last iter
                            #new_dict["Metadata_%d" %counter]["Part Numbers"]=pno
                            check = len(pno)
                            # Take as many quantities as there are part
                            # numbers, skipping the first entry.
                            q = new_dict["Metadata_%d" %
                                         counter]["QTYs"][1:check + 1]
                            #new_dict["Metadata_%d" %counter]["QTYs"]=q
                            # Part names skip the first three entries - the
                            # reason is not visible here; TODO confirm.
                            pname = new_dict["Metadata_%d" %
                                             counter]["PART NAMEs"][3:check +
                                                                    3]
                            #new_dict["Metadata_%d" %counter]["PART NAMEs"]=pname
                            # Only build the combined mapping when the three
                            # columns line up one-to-one.
                            if (len(pno) == len(q) and len(pno) == len(pname)):
                                new_dict3 = {
                                    i: {
                                        "q": j,
                                        "p/n": k
                                    }
                                    for i, j, k in zip(pname, q, pno)
                                }
                                #print(new_dict3)
                                new_dict[
                                    "Metadata_%d" %
                                    counter]["Parts/Components"] = new_dict3

                            # Delete the raw table columns
                            #del new_dict["Metadata_%d" %counter]["Notes"]
                            del new_dict["Metadata_%d" %
                                         counter]["Part Numbers"]
                            del new_dict["Metadata_%d" % counter]["QTYs"]
                            del new_dict["Metadata_%d" % counter]["PART NAMEs"]

                            # Replace the Metadata_<counter> key with the
                            # system name found in the "Super" texts.

                            super_list = new_dict["Metadata_%d" %
                                                  counter]["Super"]
                            # Indices of "Super" entries starting with "PART"
                            sup_idx = [
                                i for i, item in enumerate(super_list)
                                if re.search('^PART', item)
                            ]
                            # Indices of "Super" entries starting with "S/N"
                            sn_idx = [
                                i for i, item in enumerate(super_list)
                                if re.search('^S/N', item)
                            ]

                            if sup_idx:
                                # First "PART..." entry is stored as "Top".
                                sup_idx = sup_idx[0]
                                #print(sup_idx)
                                #print(new_dict["Metadata_%d" %counter]["Super"][sup_idx])
                                new_dict["Metadata_%d" %
                                         counter]["Top"] = new_dict[
                                             "Metadata_%d" %
                                             counter]["Super"][sup_idx]
                            if sn_idx:
                                # First "S/N..." entry is the serial number;
                                # the entry at the preceding index becomes
                                # the component name and the new block key.
                                # NOTE(review): when sn_idx is 0, this_idx is
                                # -1 and selects the last "Super" entry.
                                sn_idx = sn_idx[0]
                                #print(sn_idx)
                                this_idx = sn_idx - 1
                                new_dict["Metadata_%d" %
                                         counter]["Serial_No"] = new_dict[
                                             "Metadata_%d" %
                                             counter]["Super"][sn_idx]
                                newkey = new_dict["Metadata_%d" %
                                                  counter]["Super"][this_idx]
                                new_dict["Metadata_%d" %
                                         counter]["Component"] = newkey
                                # Both keys reference the same dict, so the
                                # "Super" deletion below also affects the
                                # entry stored under newkey.
                                new_dict[newkey] = new_dict["Metadata_%d" %
                                                            counter]
                                del new_dict["Metadata_%d" % counter]["Super"]
                                del new_dict["Metadata_%d" % counter]

                            # Start a fresh, empty block.
                            counter = counter + 1
                            new_dict["Metadata_%d" % counter] = {}

                        elif (element.get_text().strip()) == "NOTE":
                            # Start of the notes column.

                            a = []
                            table = True
                            notes = True
                            partNo = False
                            qty = False
                            partname = False
                            see = False
                            ending = False

                        elif (element.get_text().split()[0]) == "PART" and len(
                                element.get_text().split()) > 1:
                            if (element.get_text().split()[1]) == "NUMBER":
                                # Start of the part-number column.
                                # NOTE(review): partname is not cleared here
                                # and keeps its previous value.
                                a = []
                                partNo = True
                                table = True
                                notes = False
                                qty = False
                                see = False
                                ending = False
                            elif (element.get_text().split()[1]) == "NAME":
                                # Start of the part-name column.
                                # NOTE(review): partNo is not cleared here
                                # and keeps its previous value.
                                a = []
                                partname = True
                                table = True
                                notes = False
                                qty = False
                                see = False
                                ending = False

                        elif (element.get_text().strip().split()[0]) == "QTY":
                            # Start of the quantity column.
                            a = []
                            qty = True
                            partNo = False
                            notes = False
                            partname = False
                            see = False
                            table = True
                            ending = False

                        #elif(element.get_text().strip())=="PART NAME":
                        # a=[];
                        #partname=True
                        #partNo=False
                        #notes=False
                        #qty=False
                        #see=False

                        elif (element.get_text().strip().split()[0]) == "SEE":
                            # Start of the "SEE" section.
                            #print(element.get_text().strip().split()[0])
                            a = []
                            see = True
                            partNo = False
                            notes = False
                            qty = False
                            partname = False
                            table = True
                            ending = False

                        # Outside any table column (and not on the <END>
                        # marker itself) the text is appended to the block's
                        # "Super" list.
                        if table == False and element.get_text().split(
                        )[0] != "<END>":
                            a.append(element.get_text().strip())
                            new_dict["Metadata_%d" % counter]["Super"] = a
                        w = w + 1
                        # Inside a table the element's lines are appended to
                        # `a`.  This also runs for the marker element itself,
                        # so the marker text is the first item of the column.
                        if notes and table:
                            # Notes are accumulated but not stored.
                            a.extend(element.get_text().strip().split('\n'))
                            #new_dict["Metadata_%d" %counter]["Notes"]=a
                        if partNo and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" %
                                     counter]["Part Numbers"] = a
                        if qty and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" % counter]["QTYs"] = a
                        if partname and table:
                            a.extend(element.get_text().strip().split('\n'))
                            new_dict["Metadata_%d" % counter]["PART NAMEs"] = a
                        if see and table:
                            # append(), not extend(): the split lines go in
                            # as one nested list.  Accumulated but not stored.
                            a.append(element.get_text().strip().split('\n'))
                            #new_dict["Metadata_%d" %counter]["SEE PAGE"]=a

    # Close open handles
    converter.close()
    fake_file_handle.close()

    if new_dict:
        return new_dict
Example #58
0
 def __init__(self):
     """Build the pdfminer pipeline objects and keep them on the instance.

     One resource manager and one set of layout parameters are shared by the
     page aggregator (the device) and by the page interpreter that drives
     it.  The analyzer slot starts out as None.
     """
     manager = PDFResourceManager()
     layout_params = LAParams()
     aggregator = PDFPageAggregator(rsrcmgr=manager, laparams=layout_params)
     interpreter = PDFPageInterpreter(rsrcmgr=manager, device=aggregator)

     self.__resources_manager = manager
     self.__params_manager = layout_params
     self.__aggregator = aggregator
     self.__interpreter = interpreter
     self.__analyzer = None
Example #59
0
class PdfParser():
    """
    Base PDF parser that pairs with PaperForSave.

    Every concrete PdfParser is expected to inherit from this class.
    """

    # Sentinel meaning "parse_page_numbers was not supplied".  None cannot be
    # used for that purpose because None is itself a meaningful value
    # (parse up to the last page).
    _UNSET = object()

    def __init__(self,
                 conference_name,
                 start_patterns=None,
                 end_patterns=None,
                 title_position_number=2,
                 parse_page_numbers=_UNSET,
                 column_number=2,
                 paper_data_class=None):
        """
        Parameters
        ----------
        conference_name: str
            Name of the conference or proceedings.
        start_patterns: dict of patterns
            Start positions of the text kept in the Paper object.
            Defaults to ``{"all": re.compile(".*")}``.
        end_patterns: dict of patterns
            End positions of the text kept in the Paper object; a value of
            None means "until the end".  Defaults to ``{"all": None}``.
        title_position_number: int
            Index (after sorting) of the textbox that holds the title.
        parse_page_numbers: list of int
            Pages to parse; None means "until the end".  Defaults to ``[0]``.
        column_number: int
            Number of text columns on a page; only 1 and 2 are supported.
        paper_data_class: Paper class
            Paper object given directly as the strategy.  Defaults to a new
            ``PaperForSave()`` per parser.

        Raises
        ------
        ValueError
            If the start and end patterns do not share the same keys, or if
            ``column_number`` is neither 1 nor 2.
        """
        # Defaults are created per instance, so they are never shared
        # between parsers (mutable default arguments would be).
        if start_patterns is None:
            start_patterns = {"all": re.compile(".*")}
        if end_patterns is None:
            end_patterns = {"all": None}
        if parse_page_numbers is PdfParser._UNSET:
            parse_page_numbers = [0]

        # Validate the arguments before building any pdfminer objects.
        if set(start_patterns.keys()) != set(end_patterns.keys()):
            raise ValueError(
                "start patterns and eend patterns are not correspondding")

        if column_number == 1:
            sort_func_class = SortTextbox
        elif column_number == 2:
            sort_func_class = SortTextbox2Column
        else:
            raise ValueError("The column rather than two is not defined")

        if paper_data_class is None:
            paper_data_class = PaperForSave()

        self.conference_name = conference_name

        self.title_position_number = title_position_number
        self.parse_page_numbers = parse_page_numbers
        self.column_number = column_number

        self.paper_data_class = paper_data_class

        self.start_patterns = start_patterns
        self.end_patterns = end_patterns

        # Create the objects needed for parsing.
        # Layout-analysis parameters; vertical text detection is enabled.
        laparams = LAParams(detect_vertical=True)

        # Resource manager for shared resources.
        resource_manager = PDFResourceManager(caching=False)

        # Page aggregator that collects the layout of each page.
        self.device = PDFPageAggregator(resource_manager, laparams=laparams)

        # Interpreter that feeds pages to the device.
        self.interpreter = PDFPageInterpreter(resource_manager, self.device)

        # The sort-key class is kept as a variable and instantiated per page.
        self.SortFuncClass = sort_func_class

    def parse(self, pdf_file_path):
        """
        Parse the configured pages of a PDF file.

        Overriding this method is prohibited in principle.

        Parameters
        ----------
        pdf_file_path: path object
            Path of the PDF; it must provide a ``.stem`` attribute and be
            accepted by ``open``.

        Returns
        -------
        The value returned by ``paper_data_class.parse_by_textboxes`` for the
        last page processed, or None when no page was processed.
        """
        # Kept on the instance so parse_info() can read it.
        self.pdf_file_name = str(pdf_file_path.stem)

        # Initialised so that an empty page selection returns None instead of
        # raising UnboundLocalError at the return statement.
        paper = None

        with open(pdf_file_path, "rb") as f:
            for page in PDFPage.get_pages(f, pagenos=self.parse_page_numbers):
                self.interpreter.process_page(page)  # process the page
                layout = self.device.get_result()  # get the LTPage object
                text_boxes = find_textboxes_recursively(layout)

                # Sort the text boxes by their coordinates (multi-key sort).
                # Inefficient, because at least this whole page has to be
                # read first.
                sort_func = self.SortFuncClass(layout_x0=layout.x0,
                                               layout_x1=layout.x1)
                text_boxes.sort(key=sort_func)

                info_dict = self.parse_info()
                paper = self.paper_data_class.parse_by_textboxes(
                    text_boxes, info_dict)

        return paper

    def parse_info(self):
        """
        Build the info dictionary handed to the paper object.

        Must be overridden when the Paper object in use requires it.
        """
        info_dict = {}
        info_dict["conf_name"] = self.conference_name
        info_dict["pdf_name"] = self.pdf_file_name
        info_dict["start_patterns"] = self.start_patterns
        info_dict["end_patterns"] = self.end_patterns
        info_dict["title_position_number"] = self.title_position_number
        return info_dict
def pdf2String(request):
    """
    View that converts an uploaded PDF into plain text.

    For a POST request the file uploaded under the "pdf2trans" field is
    parsed with pdfminer, the text of every horizontal text box is
    collected, and the line breaks are normalised.  The result is rendered
    into 'pdf2String.html' as ``transfered_str``.  When no file was uploaded
    or the conversion fails, the failure message is rendered instead.  Any
    other request method renders the template without a context.
    """
    if request.method != 'POST':
        return render(request, 'pdf2String.html')

    print("pdf2String start!")
    # Uploaded file; None when no file was sent.
    myFile = request.FILES.get("pdf2trans", None)
    print(myFile)
    # Kept in its own variable so that it is still available in the error
    # path; previously it was overwritten with '' before the conversion ran,
    # so failures rendered an empty or partial string.
    failure_message = '文件转换失败'
    if not myFile:
        return render(request, 'pdf2String.html',
                      {'transfered_str': failure_message})
    try:
        from pdfminer.pdfparser import PDFParser, PDFDocument
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.converter import PDFPageAggregator
        from pdfminer.layout import LTTextBoxHorizontal, LAParams
        from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
        praser = PDFParser(myFile)
        # Create a PDF document.
        doc = PDFDocument()
        # Connect the parser and the document object.
        praser.set_document(doc)
        doc.set_parser(praser)
        # Initialise the document; no password is supplied.
        doc.initialize()
        # Abort when the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # PDF device object (page aggregator).
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Text chunks are collected in a list and joined once.
        text_parts = []
        # Process the pages one at a time.
        for page in doc.get_pages():
            interpreter.process_page(page)
            # The LTPage holds the objects parsed from the page; only
            # horizontal text boxes contribute text here.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    text_parts.append(x.get_text())
        transfered_str = ''.join(text_parts)

        # Post-processing of the extracted text.
        print("before change: \n" + transfered_str)
        # A newline next to whitespace is replaced by a temporary marker.
        transfered_str = re.sub(r'\n\s', '许相虎', transfered_str)
        transfered_str = re.sub(r'\s\n', '许相虎', transfered_str)
        print("after change: \n" + transfered_str)
        # Remaining line breaks are removed.
        transfered_str = re.sub(r'\n|\r', '', transfered_str)
        # Markers are turned back into newlines.
        transfered_str = re.sub(r'许相虎', '\n', transfered_str)
        # Runs of four or more whitespace characters become a newline.
        transfered_str = re.sub(r'\s{4,}', '\n', transfered_str)
        print("finally: \n" + transfered_str)
    except Exception:
        # Report the failure to the user but keep the cause in the log
        # instead of swallowing it silently.
        import logging
        logging.getLogger(__name__).exception("pdf2String conversion failed")
        return render(request, 'pdf2String.html',
                      {'transfered_str': failure_message})
    else:
        return render(request, 'pdf2String.html',
                      {'transfered_str': transfered_str})