def getPageLayouts(fpath, password=''):
    '''Takes a path to a PDF file, extracts the text-like objects, and returns
    a list of LTPage layouts, one per page.'''
    page_layouts = []
    try:
        # The parser and doc pair form a "pipe" of sorts.
        with open(fpath, 'rb') as f1:
            parser = PDFParser(f1)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)

            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in doc.get_pages():
                    # Pages are parsed lazily from the underlying file object
                    # rather than from pre-loaded data: PDFs can be large, so
                    # pdfminer only reads what it needs when it needs it.
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        raise IOError("issue with loading file, please try again")
    return page_layouts
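A minimal usage sketch (assuming the same pdfminer imports used above; 'sample.pdf' is a placeholder path):

from pdfminer.layout import LTTextBox

for layout in getPageLayouts('sample.pdf'):
    for obj in layout:
        if isinstance(obj, LTTextBox):
            print(obj.get_text())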
Example #2
    def parse_pdf (self):
        self.report = Report (self.logger)
        fp = StringIO(self.raw_pdf)
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        try:
            doc.set_parser(parser)
            doc.initialize('')
            if not doc.is_extractable:
                raise RuntimeError("PDFTextExtractionNotAllowed")

            rsrcmgr = PDFResourceManager()
            laparams = LAParams(
                                char_margin=0.01,      # pdfminer default is 2.0
                                word_margin=0.2,       # pdfminer default is 0.1
                                line_margin=0.3,       # pdfminer default is 0.5
                                line_overlap=0.5       # pdfminer default is 0.5
                               )
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            page_y_offset = 0
            pages = []
            for page in doc.get_pages():
                pages.append(page)
            pages.reverse()

            # Detect pdf format
            for page in pages:
                interpreter.process_page(page)
                layout = device.get_result()
                if self.find_pdf_text(layout, "Job Start Date:"):
                    sheet = Sheet2()
                    break
            else:
                sheet = Sheet1()
                #sheet = Sheet2()  #  TESTING

            for page in pages:
                interpreter.process_page(page)
                layout = device.get_result()
                sheet.add_ltcontainer (layout, page_y_offset)
                page_y_offset += layout.y1

            self.report.extract_data (sheet)
        except Exception:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            trace = traceback.format_exception (exc_type, exc_value, exc_traceback)
            self.logger.error('%s'%''.join(trace))

        if self.logger.has_error():
            return None
        else:
            return self.report
Example #3
def readPdf(file):
    # Open a PDF file.
    fp = open(file, 'rb')

    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization (an encrypted file would need
    # PDFDocument(parser, password='...'); this example uses none).
    document = PDFDocument(parser)
    
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
        
    # Set parameters for analysis.
    laparams = LAParams(line_margin=0.1)
    
    pages = []

    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in islice(PDFPage.create_pages(document), 2):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        pages.append(layout)
        
    return pages
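A usage sketch for readPdf (placeholder path; note that the islice call above limits it to the first two pages):

from pdfminer.layout import LTTextBox

for page_layout in readPdf('sample.pdf'):
    for element in page_layout:
        if isinstance(element, LTTextBox):
            print(element.get_text())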
Example #4
def convert_pdf_table(pdf_file):
    pdf_file = open(pdf_file, 'rb')
    parser = PDFParser(pdf_file)
    document = PDFDocument(parser)

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()

    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    interpreter = PDFPageInterpreter(rsrcmgr, device)

    table = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page) 
        layout = device.get_result()
        page_table = tabulate_page(layout)
        header = page_table[0]
        rows = page_table[1:]
        for row in rows:
            row_dict = {}
            for item, detail in enumerate(row):
                if detail != '':
                    row_dict[header[item].lower()] =  detail
            table.append(row_dict)           
                
    return table
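convert_pdf_table relies on a tabulate_page helper that is not shown above. A minimal, hypothetical sketch of such a helper (assuming pdfminer.six layout classes) could group text lines into table rows by their y-coordinate:

from pdfminer.layout import LTTextContainer, LTTextLine

def tabulate_page(layout, y_tolerance=2):
    # Collect every text line on the page.
    lines = []
    for element in layout:
        if isinstance(element, LTTextContainer):
            for line in element:
                if isinstance(line, LTTextLine):
                    lines.append(line)
    # Bucket lines that share roughly the same baseline into rows.
    rows = {}
    for line in lines:
        key = round(line.y0 / y_tolerance)
        rows.setdefault(key, []).append(line)
    # Emit rows top-to-bottom, cells left-to-right.
    table = []
    for key in sorted(rows, reverse=True):
        row = sorted(rows[key], key=lambda cell: cell.x0)
        table.append([cell.get_text().strip() for cell in row])
    return table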
Example #5
def parsepdf(filename):
    fp = open(filename, 'rb')
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Create a PDF device object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    found_randers = False
    found_aarhus = False
    _randers = []
    headings = [u'Ledige lejligheder\n',u'afd. adresse\n',u'rum m2\n',u'leje \n',
                u'a\xb4c varme a\xb4c vand\n',u'indskud\n',u'ledig pr.\n',u'bem\xe6rkning\n'
                ]
    location_map = OrderedDict()
    header_ycord = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()

        for obj in layout._objs:
            # print obj
            if isinstance(obj,LTTextBoxHorizontal):
                for o in obj._objs:
                    y0 = o.y0
                    # print o
                    if isinstance(o,LTTextLineHorizontal) and obj.get_text() not in headings:

                        if y0 not in header_ycord:
                            if y0 in location_map :
                                objs = location_map.get(y0)
                            else:
                                objs = []
                            string_val = o.get_text().encode('ascii', 'ignore').decode('ascii')
                            string_val = string_val.replace('\n', '')
                            objs.append(string_val)
                            location_map[y0] = objs
                    else:
                        if y0 not in header_ycord:
                            header_ycord.append(y0)

    for key in location_map:
        print('**************************')
        print(location_map.get(key))
        print('**************************')
    print('Total Rows = %s' % len(location_map))
Example #6
def convertWithCoordinatesPara(fname, pages=None):
  fontSize = {}
  pdfText = []

  print(fname)
  if not pages:
    pagenums = set()
  else:
    pagenums = set(pages)

  infile = open(fname, 'rb')

  parser = PDFParser(infile)
  document = PDFDocument(parser)

  laparams = LAParams()

  manager = PDFResourceManager()
  device = PDFPageAggregator(manager, laparams=laparams)

  interpreter = PDFPageInterpreter(manager, device)

  for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    layout = device.get_result()
    
    parse_obj_para(layout._objs, fontSize, pdfText)

  return {'fontSize': fontSize, 'pdfText': pdfText}
Example #7
def getTemPdf(file):
        parser = PDFParser(file)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        else:
            f = open('result\\' + 'tem_pdf', 'w')
            f.write(''.encode('utf-8'))
            f.close()
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        with open('result\\' + 'tem_pdf', 'a') as f:
                            sentence = x.get_text()
                            f.write(sentence.encode('utf-8') + '\n')

            return_tem_pdf = open('result\\' + 'tem_pdf', 'rb')
            return return_tem_pdf
Example #8
    def _GetFromPdf(self,pdf):
        '''
        Reference: http://www.unixuser.org/~euske/python/pdfminer/programming.html
        '''
        fp = open(pdf, 'rb')
        # Create a pdf parser from the file object
        parser = PDFParser(fp)
        # Create a PDF document
        doc = PDFDocument(parser)
        # Connect the parser and the document object
        parser.set_document(doc)
        # Check whether the document allows text extraction; abort if not
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager to hold shared resources
        rsrcmgr = PDFResourceManager()
        # Create a PDF device (page aggregator) object
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextContainer):
                    print(x.get_text())
Example #9
def pdf2txt(data,save_path):

    parser = PDFParser(data)

    document = PDFDocument(parser)

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        #
        rsrcmgr = PDFResourceManager()

        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr,laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr,device)
        #
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            for line in layout:
                try:
                    if(isinstance(line,LTTextBoxHorizontal)):
                        with open('%s'%(save_path),'a') as f:
                            f.write(line.get_text().encode('utf-8') + '\n')
                except Exception:
                    print("failed!")
Example #10
def extract_text_elements_from_pdf(path, j=nulljob):
    """Opens a PDF and extract every element that is text based (LTText).
    """
    fp = open(path, 'rb')
    doc = PDFDocument(caching=True)
    parser = PDFParser(fp)
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize()
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(all_texts=True, paragraph_indent=5, heuristic_word_margin=True)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = []
    all_elements = []
    enumerated_pages = list(enumerate(doc.get_pages()))
    progress_msg = "Reading page %i of %i"
    for pageno, page in j.iter_with_progress(enumerated_pages, progress_msg):
        interpreter.process_page(page)
        page_layout = device.get_result()
        pages.append(Page(page_layout.width, page_layout.height))
        textboxes = extract_textboxes(page_layout)
        elements = [create_element(box) for box in textboxes]
        merge_oneletter_elems(elements)
        for i, elem in enumerate(elements):
            elem.page = pageno
            elem.order = i
        all_elements += elements
    return pages, all_elements
Example #11
def get_result_from_file(filename):
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    fp = open(filename, "rb")
    parser = PDFParser(fp)
    document = PDFDocument(parser)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.detect_vertical = True
    laparams.line_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    page_index = 0
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        bounding_box = get_bounding_box(layout)
        labels = get_text_labels(layout)
        result["pages"].append({"index": page_index, "bounding_box": bounding_box, "labels": labels})
        page_index += 1
    fp.close()
    return result
Example #12
def get_num(source_file):
    fp = open(source_file,'rb')
    # fp = StringIO(source_file)
    # Create a PDF parser object
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Pass the password here if the PDF is encrypted; otherwise omit it.
    document = PDFDocument(parser)
    # Check whether the file allows text extraction
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object to store shared resources
    rsrcmgr = PDFResourceManager()
    # Create a PDF device (page aggregator) object
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page in the document
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()

        for n,l in enumerate(layout):
            if isinstance(l,LTTextBox):
                text = l.get_text()

                if n == 0:
                    pass

                elif n == 1:
                    num = text.split(":")[1].replace("\n",'')
                    return num
                else:
                    break
Example #13
def extract_layout_by_page(pdf_path):
    """
    See:
    - https://euske.github.io/pdfminer/programming.html
    - http://denis.papathanasiou.org/posts/2010.08.04.post.html
    """
    laparams = LAParams()

    fp = open(pdf_path, 'rb')
    parser = PDFParser(fp)
    document = PDFDocument(parser)

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    layouts = []
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layouts.append(device.get_result())

    return layouts
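A usage sketch (placeholder path); each returned item is an LTPage:

layouts = extract_layout_by_page('sample.pdf')
print('%d pages parsed' % len(layouts))
for layout in layouts:
    print(layout.pageid, layout.width, layout.height)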
Example #14
class Document(object):

    def __init__(self, filename):
        self.file = open(filename, 'rb')

        rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(rsrcmgr)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

    def extract(self):
        pages = {}

        for page in PDFPage.get_pages(self.file):
            self.interpreter.process_page(page)
            layout = self.device.get_result()
            pages[layout.pageid] = layout

        return pages

    @staticmethod
    def to_markdown(path, template):
        doc = Document(path)
        out = os.path.splitext(path)[0] + '.md'

        with open(out, 'w') as output:
            for page_num, page in doc.extract().items():
                page_element = Element()
                for element in page_element.parse(page, page_num, template):
                    output.write(template.to_markdown(element).encode('utf-8'))
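A usage sketch for the Document class above (placeholder path; Element and the markdown template come from the surrounding project and are not shown here):

doc = Document('sample.pdf')
for page_id, layout in doc.extract().items():
    print(page_id, layout.width, layout.height)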
Example #15
def get_layout(url, pages=None):
    """
    The layout is an object of pdfminer corresponding to the tree structure of
    a pdf. More information about the layout here:
    http://www.unixuser.org/~euske/python/pdfminer/programming.html
    :param url: path (str) of the pdf file to be analysed
    :param pages: list (int) of pages of which you want the layout.
    Beware
    that
    the first page of the pdf correspond to number 0, even if its id is 1
    :return layouts: List of layouts (One layout per page).
    """
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)

    # Set parameters for analysis.
    laparams = LAParams()
    manager = PDFResourceManager()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(manager, laparams=laparams)
    interpreter = PDFPageInterpreter(manager, device)
    layouts = []
    with open(url, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenos=pagenums):
            interpreter.process_page(page)
            layouts.append(device.get_result())
    device.close()

    return layouts
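A usage sketch for get_layout (placeholder path; remember that page numbers are 0-indexed here):

from pdfminer.layout import LTTextBox

for layout in get_layout('sample.pdf', pages=[0]):
    boxes = [obj for obj in layout if isinstance(obj, LTTextBox)]
    print('page %d: %d text boxes' % (layout.pageid, len(boxes)))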
Example #16
def setup(path):
	# Open a PDF file.
	fp = open(path, 'rb')
	# Create a PDF parser object associated with the file object.
	parser = PDFParser(fp)
	# Create a PDF document object that stores the document structure.
	# Supply the password for initialization.
	document = PDFDocument(parser)
	# Check if the document allows text extraction. If not, abort.
	if not document.is_extractable:
	    raise PDFTextExtractionNotAllowed
	# Create a PDF resource manager object that stores shared resources.
	rsrcmgr = PDFResourceManager()
	# Set parameters for analysis.
	laparams = LAParams()
	# Create a PDF device object.
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	# Create a PDF interpreter object.
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	# Process each page contained in the document.

	# now extract dialogue from the remaining pages
	for i, page in enumerate(PDFPage.create_pages(document)):
		# skip the title page
		if i > 0:
			# process page with interpreter
			interpreter.process_page(page)
			# get layout info
			layout = device.get_result()
			# iterate through layout objects
			for obj in layout:
				# we only want to bother with LTTextBox and LTTextLine
				if isinstance(obj, LTTextBox) or isinstance(obj, LTTextLine):
					# only extract text segments within a certain margin range
					if obj.bbox[0] > DIALOGUE_BBOX_MIN and obj.bbox[0] < DIALOGUE_BBOX_MAX:
						# need to convert unicode characters
						converted = unicodedata.normalize('NFKD', obj.get_text()).encode('ascii', 'ignore')
						print(converted)
Example #17
def pdf_to_text(page_object):
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.initialize('')
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    # enumerate() yields the page number alongside each page object
    for i, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        for obj in layout:
            if isinstance(obj, (LTTextBox, LTTextLine)):
                text_content.append(obj.get_text())
    return text_content
Example #18
def parsing(pdfPath, pdfFileName):
    fp = open(pdfPath + '\\' + pdfFileName, 'rb')
    parser      = PDFParser(fp)
    document    = PDFDocument(parser)
    rsrcmgr     = PDFResourceManager()
    laparams    = LAParams()
    device      = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pathOut     = r'C:\Projects\PDFparser\pageLayout'
    layoutName  = pdfFileName.split('.', 1)[0].replace(' ','_')
    
    # Create a folder for each pdf file layout
    if not os.path.exists(pathOut + '\\' + layoutName):
        os.makedirs(pathOut + '\\' + layoutName)
    for pageNum, page in enumerate(PDFPage.create_pages(document)):
        interpreter.process_page(page)
        layout = device.get_result()
        parse_layout(layout)

        # .pmlo stands for PDFminer Layout
        with open(pathOut + '\\' + layoutName + '\\' + str(pageNum + 1) + '.pmlo', 'w') as fileOut:
            for line in layoutStream:
                fileOut.write(str(line))

        #Start a new page
        del layoutStream[:]

    fp.close()  
Example #19
    def Parse(self):
        # First check whether a cache file exists and whether it is fresh enough
        if not os.path.exists(parseCacheDir):
            os.makedirs(parseCacheDir)
        cacheFile = os.path.join(parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
        foundCache = (os.path.isfile(cacheFile) and \
                      os.path.getsize(cacheFile) > 0 and \
                      os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
        if (foundCache):
            fp = open(cacheFile, 'rb')
            self.RawData = pickle.load(fp)
            fp.close()
        else:
            fp = open(self.pdfFileName, 'rb')
            for page in PDFPage.get_pages(fp, None, maxpages=1):
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                layout = device.get_result()
                self.__readobj(layout._objs)
                for category in self.RawData.values():
                    self.__reverseYaxis(category, layout.bbox[3])
                cacheFp = open(cacheFile, 'wb')
                pickle.dump(self.RawData, cacheFp)
                cacheFp.close()
            fp.close()

        self.__calculateBoundary()
        self.__assignCharsAndLinesToCell()
        self.__processCells()
        return (self.effectiveFrom, self.__getResult())
Example #20
def get_layout(path):
	'''returns a list of page layouts from which every character in the document and its location can be read'''

	rsrcmgr = PDFResourceManager()
	laparams = LAParams()

	fp = open(path, 'rb')
	password = ""
	maxpages = 0
	caching = True
	pagenos=set()

	layout = []
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
		interpreter.process_page(page)
		layout.append(  device.get_result()  )
	fp.close()
	device.close()

	return layout
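A follow-up sketch (placeholder path): walking the returned layouts down to individual LTChar objects gives each character together with its bounding box, which is what the docstring above refers to.

from pdfminer.layout import LTChar, LTTextBox

for page_layout in get_layout('sample.pdf'):
    for box in page_layout:
        if isinstance(box, LTTextBox):
            for line in box:
                for char in line:
                    if isinstance(char, LTChar):
                        print(char.get_text(), char.bbox)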
Example #21
def parse_pdf(pdf_url):

    remote_file = urllib.request.urlopen(pdf_url).read()
    memory_file = io.BytesIO(remote_file)
    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # Warning: this step can sometimes fail if there is an error in the pdf
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for pageIdx, page in enumerate(doc.get_pages()):
        ret.append([])
        interpreter.process_page(page)
        layout = device.get_result()
        for idx, lt_obj in enumerate(layout):
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                if len(lt_obj.get_text().strip()) > 0:
                    ret[pageIdx].append((lt_obj.get_text().splitlines()))
    return ret
Example #22
def pdf_to_txt(in_file):
	""" turn a PDF file to a TXT file (roughly processed)
	"""
	# Open a PDF file.
	fp = open(in_file, 'rb')
	# Create a PDF parser object associated with the file object.
	parser = PDFParser(fp)
	# Create a PDF document object that stores the document structure.
	document = PDFDocument(parser)
	# Check if the document allows text extraction. If not, abort.
	if not document.is_extractable:
		raise PDFTextExtractionNotAllowed
	# Set parameters for analysis.
	laparams = LAParams()
	# Create a PDF resource manager object that stores shared resources.
	rsrcmgr = PDFResourceManager()
	# Create a PDF page aggregator object.
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	# Create a PDF interpreter object.
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	for page in PDFPage.create_pages(document):
		interpreter.process_page(page)
		# Receive the LTPage object for the page.
		layout = device.get_result()
		for klass in layout:
			if isinstance(klass, LTTextBoxHorizontal):
				out_file = in_file[:-3] + 'txt'
				with open(out_file, 'a', encoding='utf-8') as dst_file:
					dst_file.write(klass.get_text() + '\n')
	return None
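A usage sketch (hypothetical file name); the function appends the extracted text to a .txt file next to the input:

pdf_to_txt('report.pdf')
with open('report.txt', encoding='utf-8') as fh:
    print(fh.read()[:200])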
Example #23
def extract_pdf(path, languages=None):
    """ Extract content from a PDF file. This will attempt to use PyPDF2
    to extract textual content first. If none is found, it'll send the file
    through OCR. """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        parser = PDFParser(fh)
        doc = PDFDocument(parser, '')
        result = {'pages': []}
        if len(doc.info):
            for k, v in doc.info[-1].items():
                k = k.lower().strip()
                if k != 'pages':
                    result[k] = safe_text(v)

        if not doc.is_extractable:
            log.warning("PDF not extractable: %s", path)
            return result

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            text = _convert_page(layout, languages)
            result['pages'].append(text)
        device.close()
        return result
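A usage sketch (placeholder path; safe_text, log and _convert_page are helpers from the surrounding module and are not shown here). Metadata keys, when present, are lower-cased:

result = extract_pdf('sample.pdf')
print(result.get('title'))
print('%d pages extracted' % len(result['pages']))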
Example #24
def parse_pdf(fname):
    fp = open(fname, 'rb')
    # Create a pdf document parser
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)
    # Check whether the file allows text extraction
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager object to store shared resources
        rsrcmgr = PDFResourceManager()
        # Set the parameters for analysis
        laparams = LAParams()
        # Create a PDF device object
        # device=PDFDevice(rsrcmgr)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page

        contents = []
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for this page
            layout = device.get_result()
            for x in layout:
                if(isinstance(x, LTTextBoxHorizontal)):
                    content = x.get_text().strip()

                    # print type(content)
                    # print content
                    if content:
                        contents.append(content)
        return contents
Example #25
def extract_text(doc, config):
    rsrcmanager = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmanager, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmanager, device)

    pages = doc.get_pages()

    if config.page is not None:
        # take only 1 page
        # note: use config.page - 1 because the slice is 0-indexed, while PDF pages are numbered from 1
        pages = [next(itertools.islice(pages, config.page - 1, None), None)]

    for page in pages:
        interpreter.process_page(page)
        layout = device.get_result()

        text = []
        for obj in layout:
            if isinstance(obj, LTTextBox):
                for line in obj:
                    # coord = ((line.x0, line.y0), (line.x1, line.y1))
                    text.append(line)
            elif isinstance(obj, LTTextLine):
                assert False, "Expected no lines at top of tree"
            else:
                pass

        yield text
Example #26
    def generateFileContent(self):

        import tempfile
        import urllib

        abbreviationsPdfUrl = u"http://www.realacademiagalega.org/c/document_library/get_file?uuid=f29e6ce1-9ac5-42e3-8c15-73c4b9b5f48b&groupId=10157"
        temporaryFile = tempfile.NamedTemporaryFile()
        urllib.urlretrieve(abbreviationsPdfUrl, temporaryFile.name)

        entries = set()
        fileObject = open(temporaryFile.name, "rb")
        parser = PDFParser(fileObject)
        document = PDFDocument(parser)
        resourceManager = PDFResourceManager()
        device = PDFPageAggregator(resourceManager)
        interpreter = PDFPageInterpreter(resourceManager, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            objects = [object for object in layout if not isinstance(object, LTRect) and not isinstance(object, LTCurve)]
            params = LAParams()
            for line in layout.group_objects(params, objects):
                text = line.get_text()
                if u":" in text:
                    entry = text.split(u":")[0]
                    entry = entry.strip()
                    entry = entry.replace(u"..", ".")
                    entries.add(entry)

        dictionary  = u"# Abreviaturas empregadas no Dicionario da Real Academia Galega\n"
        dictionary += u"# http://www.realacademiagalega.org/abreviaturas\n"
        dictionary += u"\n"
        for entry in formatEntriesForDictionary(entries, u"abreviatura"):
            dictionary += entry
        return dictionary
Example #27
def parse_pages(pdf_buffer, password):
    """
    With an PDF buffer object, get the pages, parse each one, and return the entire pdf text
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(pdf_buffer)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password)

    resource_manager = PDFResourceManager()
    la_params = LAParams()
    device = PDFPageAggregator(resource_manager, laparams=la_params)
    interpreter = PDFPageInterpreter(resource_manager, device)

    text_content = []  # a list of strings, each representing text collected from each page of the doc
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        # receive the LTPage object for this page
        layout = device.get_result()
        # layout is an LTPage object which may contain
        #  child objects like LTTextBox, LTFigure, LTImage, etc.
        text_content.append(parse_lt_objects(layout._objs))  # pylint: disable=protected-access

    return text_content
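A usage sketch (placeholder path, empty password; parse_lt_objects is a helper from the surrounding module and is not shown here):

with open('sample.pdf', 'rb') as fh:
    pages_text = parse_pages(fh, '')
print('%d pages parsed' % len(pages_text))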
Example #28
def pdf2text(path,save_file):
    '''
    Read a local pdf document and save its text to a txt file.
    :param path: source pdf file
    :param save_file: name of the txt file to save; with no path it is saved next to the script
    :return: None
    '''

    # Create the parser from a binary file object
    fp = open(path, 'rb')
    parser = PDFParser(fp)
    # Document structure object
    document = PDFDocument(parser)

    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        rsrcmgr = PDFResourceManager()

        laparams = LAParams()

        device = PDFPageAggregator(rsrcmgr,laparams = laparams)

        interpreter = PDFPageInterpreter(rsrcmgr,device)
        # Process each page
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

            layout = device.get_result()

            for line in layout:
                if(isinstance(line,LTTextBoxHorizontal)):
                    with open('%s' %(save_file),'a') as f:
                        f.write(line.get_text().encode('utf-8'))
Example #29
def Layout():
    # Set parameters for analysis.
    with open('/home/chris/Documents/Literature/Donghun_ACSNano_2014', 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        print(rsrcmgr)
           
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout = device.get_result()
            
        return layout
Example #30
def read_invoice_pdfminer3k(pdfFile):
    fp = open(os.path.join(invoice_path, pdfFile), "rb")

    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)

    doc.initialize("")
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process each page contained in the document.
    invoice_text = ""
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                invoice_text += lt_obj.get_text()

    # Extract client info from the string extracted from pdf
    client = extract_info(invoice_text, client_start, client_end)
    print("client :" + client)

    # Extract invoice no from the pdf file name
    invoice_no = extract_info(str(pdfFile), invoice_start, invoice_end)
    print("invoice no :" + invoice_no)

    # Pass the client info and invoice no to the method which writes to excel file
    write_excel(client, invoice_no)
Example #31
def pdfread(pdfPath):
    with open(pdfPath, 'rb') as fp:
        try:

            print(pdfPath)
            # Create a PDF parser from the file object
            parser = PDFParser(fp)
            # Create a PDF document
            doc = PDFDocument()
            # Connect the parser and the document to each other
            parser.set_document(doc)
            doc.set_parser(parser)
            # Provide the initialization password; defaults to empty if there is none
            doc.initialize()
            # Check whether the document can be converted to text; skip it if not
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed
            else:
                # Create the PDF resource manager that manages shared resources
                rsrcmagr = PDFResourceManager()
                # Create a PDF device object
                laparams = LAParams()
                # Aggregate the resource manager and the device object
                device = PDFPageAggregator(rsrcmagr, laparams=laparams)
                # Create a PDF interpreter object
                interpreter = PDFPageInterpreter(rsrcmagr, device)
                allContent = ''
                last_para = ''
                result = ''
                for page in doc.get_pages():
                    interpreter.process_page(page)
                    # Receive the LTPage object for this page
                    layout = device.get_result()
                    # layout is an LTPage object holding the objects parsed from the page,
                    # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal and so on;
                    # to get the text you read the object's text attribute

                    for x in layout:
                        try:
                            if (isinstance(x, LTTextBoxHorizontal)):
                                result = x.get_text()
                                # Strip the newlines introduced when reading the pdf
                                result = result.replace('\n', '')
                                # Remove bracketed reference citations like [12]
                                result = re.sub('\[(\d+\,* ?-?)+\]', '',
                                                result)
                                # Put each unordered-list bullet on a new line
                                result = result.replace('∙', '\n∙')
                                # Stop once the references section is reached
                                if re.findall(
                                        '^references?',
                                        last_para.lower().replace(
                                            ' ', '')) != [] or re.findall(
                                                '^references?',
                                                result.lower().replace(
                                                    ' ', '')) != []:
                                    return allContent
                                # Drop footers, page numbers, headers and tables with too little content
                                if re.findall(
                                        '^Authorized licensed use limited to:',
                                        result) == [] and re.findall(
                                            '©', result) == [] and re.findall(
                                                'Publication date',
                                                result) == [] and re.findall(
                                                    '\d\:\d', result
                                                ) == [] and re.findall(
                                                    '(et al.)$', result
                                                ) == [] and len(result) > 5:
                                    allContent = allContent + '\n' + result
                                # print(result)
                        except Exception as e:
                            print(e)
                        last_para = result
                return allContent
        except Exception as e:
            print('Failed to read the document: ' + str(e))
Example #32
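# This fragment assumes the usual pdfminer setup happened earlier in the
# script, roughly along these lines (the path is a placeholder):
#   fp = open('sample.pdf', 'rb')
#   parser = PDFParser(fp)
#   document = PDFDocument()
#   parser.set_document(document)
#   document.set_parser(parser)
#   document.initialize('')
#   rsrcmgr = PDFResourceManager()
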
# Set the parameters for analysis
laparams = LAParams()
laparams.char_margin = 1.0
laparams.word_margin = 1.0

# Create a PDF device object
# device=PDFDevice(rsrcmgr)
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object
interpreter = PDFPageInterpreter(rsrcmgr, device)

extracted_text = ''

for page in document.get_pages():
    interpreter.process_page(page)

    # Receive the LTPage object for this page
    layout = device.get_result()  # return text image line curve
    for x in layout:
        if isinstance(x, LTText) or isinstance(x, LTTextBox) or isinstance(
                x, LTTextLine):
            if x.get_text().__contains__("实际控制人"):
                print(x.get_text())
                extracted_text += x.get_text()

with open('pdf_result.txt', "wb") as txt_file:
    txt_file.write(extracted_text.encode("utf-8"))

fp.close()
Example #33
def parse(_path):
    # Determine whether the path refers to a local file
    result = re.match("http", _path)
    if result is None:
        fp = open(_path, 'rb')  # open the local pdf file in binary read mode
    else:
        request = Request(url=_path,
                          headers={'User-Agent': random.choice(user_agent)
                                   })  # pick a random element from the user_agent list
        fp = urlopen(request)  # open the online PDF document

    # Create a pdf document parser from the file object
    praser_pdf = PDFParser(fp)

    # Create a PDF document
    doc = PDFDocument()

    # Connect the parser and the document object
    praser_pdf.set_document(doc)
    doc.set_parser(praser_pdf)

    # Provide the initialization password, e.g. doc.initialize("123456");
    # if there is no password, just pass an empty string.
    # This is where decryption happens.
    doc.initialize()

    # Check whether the document allows text conversion; skip it if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager to manage shared resources
        rsrcmgr = PDFResourceManager()

        # Create a layout-parameter object
        laparams = LAParams()
        # retstr = io.StringIO()

        # Create the aggregator
        # device = TextConverter(rsrcmgr, retstr,laparams=laparams)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Create a PDF page interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Loop over the page list, processing one page at a time
        # doc.get_pages() returns the list of pages
        for page in doc.get_pages():
            # Read the page with the interpreter
            interpreter.process_page(page)

            # Get the content from the aggregator
            layout = device.get_result()

            # layout is an LTPage object holding the objects parsed from this page,
            # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal and so on;
            # to get the text you read the object's text attribute
            for out in layout:
                if (isinstance(out, LTTextBoxHorizontal)):
                    with open(
                            r'C:\Users\Administrator\Desktop\pdf测试\2014 Automatic Generation of the Domain Module from Electronic Textbooks.txt',
                            "a+",
                            encoding='utf-8') as f:

                        results = out.get_text()
                        print(results)
                        f.write(results + '\n')
        print("----转换成功----")
Example #34
class PDF(Container):
    cached_properties = Container.cached_properties + ["_pages"]

    def __init__(self,
                 stream,
                 pages=None,
                 laparams=None,
                 precision=0.001,
                 password=""):
        self.laparams = None if laparams == None else LAParams(**laparams)
        self.stream = stream
        self.pages_to_parse = pages
        self.precision = precision
        rsrcmgr = PDFResourceManager()
        self.doc = PDFDocument(PDFParser(stream), password=password)
        self.metadata = {}
        for info in self.doc.info:
            self.metadata.update(info)
        for k, v in self.metadata.items():
            if hasattr(v, "resolve"):
                v = v.resolve()
            if type(v) == list:
                self.metadata[k] = list(map(decode_text, v))
            elif isinstance(v, PSLiteral):
                self.metadata[k] = decode_text(v.name)
            elif isinstance(v, bool):
                self.metadata[k] = v
            else:
                self.metadata[k] = decode_text(v)
        self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)

    @classmethod
    def open(cls, path, **kwargs):
        return cls(open(path, "rb"), **kwargs)

    def process_page(self, page):
        self.interpreter.process_page(page)
        return self.device.get_result()

    @property
    def pages(self):
        if hasattr(self, "_pages"): return self._pages

        doctop = 0
        pp = self.pages_to_parse
        self._pages = []
        for i, page in enumerate(PDFPage.create_pages(self.doc)):
            page_number = i + 1
            if pp != None and page_number not in pp: continue
            p = Page(self,
                     page,
                     page_number=page_number,
                     initial_doctop=doctop)
            self._pages.append(p)
            doctop += p.height
        return self._pages

    def close(self):
        self.stream.close()

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self.flush_cache()
        self.close()

    @property
    def objects(self):
        if hasattr(self, "_objects"): return self._objects
        all_objects = {}
        for p in self.pages:
            for kind in p.objects.keys():
                all_objects[kind] = all_objects.get(kind, []) + p.objects[kind]
        self._objects = all_objects
        return self._objects
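A usage sketch for the PDF class above (placeholder path; Container, Page and decode_text come from the surrounding package and are not shown here):

with PDF.open('sample.pdf') as pdf:
    print(pdf.metadata)
    for page in pdf.pages:
        print(page.page_number, page.height)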
Example #35
    def get_signatures_from_pdf(self, path, year=''):
        codec = 'utf-8'
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr=rsrcmgr, laparams=laparams)
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        pages = PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True)

        temp_pages = []
        for page in pages:
            temp_pages.append(page)

        if not temp_pages:
            return

        first_page = temp_pages[0]
        interpreter.process_page(first_page)
        first_page_layout = device.get_result()
        regulations = self.get_document_info(first_page_layout)

        ignore_words = ['ΟI ΥΠΟΥΡΓΟI', 'ΤΑ ΜΕΛΗ', 'ΟΙ ΥΠΟΥΡΓΟΙ']

        if not regulations:
            return

        signature_sets = []

        # Start from the last page until all the required signature sets are found
        for page in reversed(temp_pages):
            # Get the page's layout
            interpreter.process_page(page)
            page_layout = device.get_result()

            # Split text into lines for easier parsing
            text_lines = self.text_from_layout_objects(page_layout).split("\n")

            # Boolean indicating whether we are currently in a signature set
            # Save the data found
            search_active = False
            persons = []
            names = []
            roles = []
            role = ""
            temp_name = ""

            for line in text_lines:
                line = line.strip()
                if search_active:
                    if self.is_break_point(line):
                        for index, name in enumerate(names):
                            current_role = roles[index] if index < len(
                                roles) else ""
                            persons.append({
                                'name':
                                name,
                                'role':
                                Helper.format_role(current_role)
                            })

                        # Continue searching at next point
                        role = ""
                        temp_name = ""
                        search_active = False

                        if persons:
                            signature_sets.append(persons)
                            persons = []

                            # Break if enough signature sets have been found. Otherwise we'll continue looking for
                            # more in the same page.
                            if len(signature_sets) == len(regulations):
                                break

                    normal_line = Helper.normalize_greek_name(line)

                    if normal_line in ignore_words:
                        continue

                    if '***' in line and normal_line:
                        if role:
                            roles.append(role)
                            role = ""

                        names.append(normal_line)
                    else:
                        role += line


                elif (year in line and Helper.date_match(year).match(line)) \
                        or (str(int(year) - 1) in line and Helper.date_match(str(int(year) - 1)).match(line)) \
                        or line == 'Οι Υπουργοί':
                    search_active = True

            # If the end of page has been reached we save the signatures
            if persons:
                signature_sets.append(persons)

            # When we find enough signature sets we stop parsing pages.
            if len(signature_sets) == len(regulations):
                break

        # Merge regulations and signature sets
        for index, signatures in enumerate(reversed(signature_sets)):
            if index >= len(regulations):
                return
            regulations[index]['signatures'] = signatures

        return regulations
Example #36
def get_data(setting) -> list:
    '''
    Initialization
    '''
    # Set the Layout Analysis parameters.
    laparams = LAParams()
    # Create the resource manager that manages shared resources.
    resource_manager = PDFResourceManager()
    # Create the PageAggregator object that collects pages.
    device = PDFPageAggregator(resource_manager, laparams=laparams)
    # Create the Interpreter object.
    interpreter = PDFPageInterpreter(resource_manager, device)
    pdf_archive_dir = setup_pdf_archive_dir()

    # pdf data
    patient_datas_pdf = []
    patient_datas_old = []
    ret_data = []
    '''
    Retrieve the list
    '''
    befor_tb_avg = 10000  # (top + bottom)/2

    # Read and process the split PDFs one at a time
    box_list = []
    for pdf_url in setting.pdf_urls:
        # Fetch the pdf
        pdf_path = os.path.join(pdf_archive_dir, pdf_url.split('/')[-1])
        print(pdf_url)
        with urllib.request.urlopen(pdf_url) as u:
            with open(pdf_path, 'bw') as o:
                o.write(u.read())

        with open(pdf_path, 'rb') as f:
            for page in PDFPage.get_pages(f):
                interpreter.process_page(page)
                layout = device.get_result()

                # Get the list of text boxes in the page.
                boxes = find_textboxes_recursively(layout)

                # Sort the text boxes by the coordinates of their top-left corners.
                # y1 (the Y coordinate) grows towards the top of the page, so its sign is flipped.
                boxes.sort(key=lambda b: (-b.y1, b.x0))

                for box in boxes:
                    if is_skip(box.get_text()) is True:
                        if box.get_text().find('#N/A') != -1:
                            box_list = []
                        continue
                    temp_tb_avg = (box.y1 + box.y0) / 2

                    if 15 < befor_tb_avg - temp_tb_avg or befor_tb_avg - temp_tb_avg < -15:
                        box_list.sort(key=lambda b: (b.x0))
                        if len(box_list) == 0:
                            befor_tb_avg = temp_tb_avg
                            box_list = []
                        elif box_list[0].get_text().find(
                                '-1') != -1 or box_list[0].get_text().find(
                                    '○') != -1 or box_list[0].get_text().find(
                                        '(cid:16089)1') != -1:
                            befor_tb_avg = temp_tb_avg
                            box_list = []
                        else:
                            temp_pd = patient_data(box_list)
                            temp_pd.parse_line()
                            if temp_pd.is_error is False:
                                patient_datas_pdf.append(temp_pd)
                                befor_tb_avg = temp_tb_avg
                                box_list = []
                            else:
                                print('error')
                                print(box_list)
                                befor_tb_avg = temp_tb_avg
                                box_list = []
                    befor_tb_avg = temp_tb_avg

                    box_list.append(box)

        # Process the data left over from the previous PDF file
        box_list.sort(key=lambda b: (b.x0))
        temp_pd = patient_data(box_list)
        temp_pd.parse_line()
        if temp_pd.is_error is False:
            patient_datas_pdf.append(temp_pd)
            befor_tb_avg = temp_tb_avg
            box_list = []
        else:
            print('error')
            print(box_list)
            befor_tb_avg = temp_tb_avg
            box_list = []

    # Process any remaining data at the end
    if len(box_list) == 0:
        befor_tb_avg = temp_tb_avg
        box_list = []
    else:
        temp_pd = patient_data(box_list)
        temp_pd.parse_line()
        if temp_pd.is_error is False:
            print(temp_pd.no)
            patient_datas_pdf.append(temp_pd)
            befor_tb_avg = temp_tb_avg
            box_list = []
        else:
            print('error')
            print(box_list)
            befor_tb_avg = temp_tb_avg
            box_list = []
    patient_datas_pdf.reverse()

    # Handle records that are no longer viewable online
    old_no_range = list(range(1, 12656))
    row_datas = []
    patient_datas_old = []
    with open(
            os.path.dirname(os.path.abspath(__file__)) + "/data/row_data.json",
            "r") as f:
        row_datas = json.load(f)
    for row_data in row_datas:
        if int(row_data['No']) not in old_no_range:
            continue
        temp_patient_data = patient_data()
        temp_patient_data.no = row_data['No']
        temp_patient_data.revealed_dt = dt.strptime(row_data['revealed_dt'],
                                                    '%Y-%m-%d')
        temp_patient_data.old = row_data['old']
        temp_patient_data.sex = row_data['sex']
        temp_patient_data.job = row_data['job']
        temp_patient_data.symptom = row_data['symptom']
        if row_data['appearance_dt'] == '':
            temp_patient_data.appearance_dt = None
        else:
            temp_patient_data.appearance_dt = dt.strptime(
                row_data['appearance_dt'], '%Y-%m-%d')
        if row_data['status_id'] in [1, 2, 3, 4]:
            temp_patient_data.status_id = 7
        else:
            temp_patient_data.status_id = row_data['status_id']
        patient_datas_old.append(temp_patient_data)

    patient_datas = patient_datas_old + patient_datas_pdf
    patient_datas_sorted = sorted(patient_datas, key=lambda x: int(x.no))

    for patient in patient_datas_sorted:
        ret_data.append(patient.export_dict())
    return ret_data
Example #37
class GetPic:

    def __init__(self, filename, password=''):
        """
        Initialization
        :param filename: path of the pdf
        :param password: password
        """
        # Open the file and create the document parser. Keep the handle open,
        # because pages are read from it lazily later on.
        self.fp = open(filename, 'rb')
        self.parser = PDFParser(self.fp)
        # Create the document
        self.doc = PDFDocument()
        # Connect the document with the document parser
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        # Initialize with the password, or an empty string if there is none
        self.doc.initialize(password)
        # Check that the document allows txt conversion; if not, raise
        if not self.doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        else:
            # Create the PDF resource manager that manages shared resources
            self.resource_manager = PDFResourceManager()
            # Create a PDF device object
            self.laparams = LAParams()
            self.device = PDFPageAggregator(self.resource_manager, laparams=self.laparams)
            # Create a PDF interpreter object
            self.interpreter = PDFPageInterpreter(self.resource_manager, self.device)
            # List of the pdf's page objects
            self.doc_pdfs = list(self.doc.get_pages())
        # Open the PDF with fitz to get an iterable of page image doc objects
        self.doc_pics = fitz.open(filename)

    def to_pic(self, doc, zoom, pg, pic_path):
        """
        Convert a single pdf page to a picture
        :param doc: the page's image doc object
        :param zoom: image zoom factor, type int; the larger the value, the higher the resolution
        :param pg: index of the object in doc_pics
        :param pic_path: path where the picture is saved
        :return: path of the picture
        """
        rotate = int(0)
        trans = fitz.Matrix(zoom, zoom).preRotate(rotate)
        pm = doc.getPixmap(matrix=trans, alpha=False)
        path = os.path.join(pic_path, str(pg)) + '.png'
        pm.writePNG(path)
        return path

    def get_pic_loc(self, tmp='', picPath=''):
        """
        Get the positions of the figures on each page and export the text
        :param doc: the pdf's doc object
        :return: a list of tuples of figure name and top/bottom y coordinates, plus the size of the current page
        """

        # Coordinates of figure titles
        loc_top = []
        # Coordinates of the "source" lines
        loc_bottom = []
        # Figure names together with the y1, y2 coordinates of the region to crop
        loc_named_pic = []
        # Walk all the LT objects on the page
        text_export = ''
        # Exported text

        page_count = self.doc_pics.pageCount
        canvas_size = (0.0, 0.0, 0.0, 0.0)

        for ii in range(page_count):
            doc_pdf = self.doc_pdfs[ii]
            doc_pic = self.doc_pics[ii]
            # Convert the current page to a PNG; the return value is the image path
            path = self.to_pic(doc_pic, 2, ii, picPath)

            self.interpreter.process_page(doc_pdf)
            layout = self.device.get_result()
            # size of the pdf page, tuple, (width, height)
            canvas_size = layout.bbox
            print('ii:', ii)

            for i in layout:
                # print('reading object', i)
                if hasattr(i, 'get_text'):
                    text = i.get_text().strip()
                    text_export += text
                    # Match the keyword patterns

                    # Relative height of each page
                    relativelyHeight = (page_count - ii - 1) * canvas_size[3]
                    # relativelyHeight = (page_count  - ii) * 1685/2

                    box = i.bbox  # recompute the relative coordinates

                    if re.search(r'^(图表*|表)(\s|\s*\d|\s*[::])', text):
                        # loc_top.append((i.bbox, text, ii))  # record the page number along with each match
                        if re.search(r'(\.)+\s*\d', text) is None:  # exclude table-of-contents matches
                            loc_top.append(((box[0], box[1] + relativelyHeight, box[2], box[3] + relativelyHeight), text,
                                            ii))  # record the page number along with each match

                    elif re.search(r'^((来源)|(资料来源)|(数据来源))(\s|[::])', text):
                        loc_bottom.append(((box[0], box[1] + relativelyHeight, box[2], box[3] + relativelyHeight), text,
                                           ii))  # record the page number along with each match
                    elif re.search(r'\n+((来源)|(资料来源)|(数据来源))(\s|[::])', text):
                        loc_bottom.append(((box[0], box[1] + relativelyHeight, box[2], box[3] + relativelyHeight), text,
                                           ii))  # record the page number along with each match
        locname = []

        image_compose(picPath, page_count)

        i0 = 0
        j0 = 0
        #        size_increase = 10

        # Recompute the height of the stitched image (width and height of big.png)
        canvas_size = (canvas_size[0], canvas_size[1], canvas_size[2], (int(canvas_size[3] + 0.5)) * (page_count))

        name = ''
        # print(loc_top)
        # print(loc_bottom)
        print('loc_top:', len(loc_top), ',loc_bottom', len(loc_bottom))
        # The logic below is a little tangled: walk loc_top and loc_bottom in y order to pair each caption with its source line

        if len(loc_top) == 1 and len(loc_bottom) == 0:
            try:
                name = locname[0][0][1]
            except:
                name = ''
        elif len(loc_top) > 0 and len(loc_bottom) > 0:
            while i0 <= len(loc_top) - 1 and j0 <= len(loc_bottom) - 1:
                #  print (i0,j0)
                if loc_top[i0][0][1] < loc_bottom[j0][0][1]:  # the source line sits above the caption (y grows from bottom to top, starting at 0)
                    bottom = [(0, loc_bottom[j0][0][1], canvas_size[2], loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                    locname.append([bottom, 1])
                    j0 += 1
                    continue
                is_binglie = 0  # flag: two figures side by side on the same row
                try:
                    if abs(loc_top[i0][0][1] - loc_top[i0 + 1][0][1]) < 10:  # y coordinates are nearly equal
                        is_binglie = 1
                except:
                    pass

                if is_binglie == 0:
                    if loc_top[i0][0][1] > loc_bottom[j0][0][1]:  # not side by side: the usual caption-above-source layout
                        top = [(0, loc_top[i0][0][1], canvas_size[2], loc_top[i0][0][3]),
                               loc_top[i0][1]]  # (x1, y1, x2, y2)
                        locname.append([top, 0])
                        i0 += 1
                    else:
                        bottom = [(0, loc_bottom[j0][0][1], canvas_size[2], loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                        locname.append([bottom, 1])
                        j0 += 1

                else:
                    is_binglie_laiyuan = 0
                    try:
                        if abs(loc_bottom[j0][0][1] - loc_bottom[j0 + 1][0][1]) < 10:
                            is_binglie_laiyuan = 2
                        else:
                            is_binglie_laiyuan = 1
                    except:
                        try:
                            loc_bottom[j0][0][1]
                            is_binglie_laiyuan = 1
                        except:
                            is_binglie_laiyuan = 0

                    if is_binglie_laiyuan == 2:
                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0], loc_top[i0][0][3]), loc_top[i0][1]]
                        locname.append([top1, 0])

                        bottom1 = [(0, loc_bottom[j0][0][1], loc_top[i0 + 1][0][0], loc_bottom[j0][0][3]),
                                   loc_bottom[j0][1]]
                        locname.append([bottom1, 1])

                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1], canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top2, 0])

                        bottom2 = [
                            (loc_top[i0 + 1][0][0], loc_bottom[j0 + 1][0][1], canvas_size[2], loc_bottom[j0 + 1][0][3]),
                            loc_bottom[j0 + 1][1]]
                        locname.append([bottom2, 1])
                        i0 += 2
                        j0 += 2
                    elif is_binglie_laiyuan == 1:

                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0], loc_top[i0][0][3]), loc_top[i0][1]]
                        locname.append([top1, 0])

                        bottom1 = [(0, loc_bottom[j0][0][1], loc_top[i0 + 1][0][0], loc_bottom[j0][0][3]),
                                   loc_bottom[j0][1]]
                        locname.append([bottom1, 1])

                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1], canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top2, 0])

                        bottom2 = [(loc_top[i0 + 1][0][0], loc_bottom[j0][0][1], canvas_size[2], loc_bottom[j0][0][3]),
                                   loc_bottom[j0][1]]
                        locname.append([bottom2, 1])
                        i0 += 2
                        j0 += 1
                    else:
                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0], loc_top[i0][0][3]), loc_top[i0][1]]
                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1], canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top1, 0])
                        locname.append([top2, 0])
                        i0 += 2

            if i0 == len(loc_top):
                while j0 <= len(loc_bottom) - 1:
                    locname.append([loc_bottom[j0], 1])
                    j0 += 1
            if j0 == len(loc_bottom):
                while i0 <= len(loc_top) - 1:
                    locname.append([loc_top[i0], 0])
                    i0 += 1

            k = 0
            loc_named_pic = []
            print('locname:',locname)

            '''
            Convert locname into loc_named_pic
            '''
            while k <= len(locname) - 1:
                #   print(k)
                if locname[0][1] == 1:  # the first entry is a source line: x1/x2 span the pdf width, y1 is the pdf top, y2 the source line
                    # changed: when the first entry is a source line, simply skip this data
                    x1 = canvas_size[0]
                    x2 = canvas_size[2]
                    y1 = canvas_size[3]
                    y2 = locname[0][0][0][3]
                    name = tmp
                    # loc_named_pic.append([name, (x1, y1, x2, y2)])
                    name = ''
                    k += 1

                if locname[k][1] == 0:  # found the first caption
                    name += locname[k][0][1]
                    if k + 1 < len(locname):  # k is a caption row
                        ii = k + 1
                        while ii < len(locname):  # ii scans ahead for the matching source line
                            if locname[ii][1] == 0:  # ii is not a source line
                                name += ' ' + locname[ii][0][1]
                                ii += 1
                            else:  # ii is a source line
                                x1 = locname[k][0][0][0]
                                x2 = locname[k][0][0][2]
                                y1 = locname[k][0][0][3]
                                y2 = locname[ii][0][0][1]
                                loc_named_pic.append([name, (x1, y1, x2, y2)])
                                name = ''
                                k = ii + 1
                                ii += 1
                                continue
                        k += 1
                    else:
                        k += 1
                else:
                    k += 1

        tmp = name

        return loc_named_pic, canvas_size, tmp

    def get_crops(self, pic_path, canvas_size, position, cropped_pic_name, cropped_pic_path):
        """
        Crop the image at the given position.
        :param pic_path: path of the image to crop
        :param canvas_size: size of the image in pdf coordinates, tuple, (0, 0, width, height)
        :param position: region to crop, tuple, (x1, y1, x2, y2)
        :param cropped_pic_name: name of the cropped image
        :param cropped_pic_path: directory in which to save the cropped image
        :return:
        """
        img = Image.open(pic_path)
        # Size of the current image, tuple (width, height)
        pic_size = img.size
        # Margin added around the crop region

        count = 0
        size_increase = 10
        # TODO: the coordinate conversion below has not been fully revised
        print('canvas_size', canvas_size)
        print('position', position)

        x1 = max(position[0] - size_increase, 0) * (pic_size[0] / canvas_size[2])
        x2 = min(position[2] + size_increase, canvas_size[2]) * (pic_size[0] / canvas_size[2])
        #  y1 = pic_size[1] * (1 - (position[0] + size_increase)/canvas_size[3])
        #  y2 = pic_size[1] * (1 - (position[1] - size_increase)/canvas_size[3])
        y1 = max(0, (1 - (position[1] + size_increase) / canvas_size[3]) * pic_size[1])
        y2 = min(pic_size[1], (1 - (position[3] - size_increase) / canvas_size[3]) * pic_size[1])

        print('crop')
        print(x1, x2, y1, y2)

        cropped_img = img.crop((x1, y1, x2, y2))
        cropped_pic_name = cropped_pic_name + str(count)
        # Strip characters that are invalid or awkward in file names
        for ch in ('/', '  ', '\\', ':', ':', '*', '?', '"', '<', '>', '|', '\n', '\r', '\f'):
            cropped_pic_name = cropped_pic_name.replace(ch, '')
        if len(cropped_pic_name) > 50:
            cropped_pic_name = cropped_pic_name[0:49]
        count += 1

        import random

        rand0 = str(random.randint(10000000, 99999999))
        text0 = []
        log0 = []
        try:
            path = os.path.join(cropped_pic_path, rand0) + '.png'
            cropped_img.save(path)
            text0 = cropped_pic_name + '|' + rand0 + '|' + str(x1) + '|' + str(x2) + '|' + str(y1) + '|' + str(y2)
            # print(text0)
            # print('cropped image saved:', cropped_pic_name)
            return text0, log0
        except:
            log0 = cropped_pic_path + cropped_pic_name
            print('failed to crop', cropped_pic_name)
            return text0, log0
            # pass

    def main(self, pic_path, cropped_pic_path, pgn=None, tmp=''):
        """
        Main entry point.
        :param pic_path: directory holding the rendered page images to crop from
        :param cropped_pic_path: directory in which to save the cropped images
        :param pgn: index of the page to take crops from
        :return:
        """
        text_total = []
        log_total = []
        topNumber = 0
        bottomNumber = 0
        if pgn is not None:
            # Get the doc objects for the current page
            # doc_pdf = self.doc_pdfs[pgn]
            # doc_pic = self.doc_pics[pgn]
            # Render the current page to PNG; the return value is the image path
            # path = self.to_pic(doc_pic, 2, pgn, pic_path)
            page_count = self.doc_pics.pageCount

            loc_name_pic, canvas_size, tmp = self.get_pic_loc(tmp=tmp, picPath=pic_path)

            print('loc_name_pic', loc_name_pic)
            print('canvas_size', canvas_size)
            print('tmp', tmp)
            print('page_count', page_count)

            # canvas_size = (canvas_size[0], canvas_size[1], canvas_size[2],(int(canvas_size[3]+0.5) ) * (page_count))
            print('canvas_size', canvas_size)

            if loc_name_pic:
                for i in loc_name_pic:
                    position = i[1]
                    cropped_pic_name = re.sub('/', '_', i[0])
                    # text1, log1 = self.get_crops(path, canvas_size, position, cropped_pic_name, cropped_pic_path)
                    text1, log1 = self.get_crops(pic_path + '\\big.png', canvas_size, position, cropped_pic_name,
                                                 cropped_pic_path)

                    if text1:
                        text1 = text1 + '|' + str(pgn)
                        text_total.append(text1)

                        # write to file

                    if log1:
                        log1 = log1 + '|error on page ' + str(pgn)
                        log_total.append(log1)
        return tmp, text_total, log_total, topNumber, bottomNumber
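A minimal usage sketch for the class above (the paths and page index are hypothetical, and it assumes the image_compose helper used by get_pic_loc is available):

# Hypothetical driver for GetPic: render the pages, stitch them into big.png via
# image_compose, then crop every captioned figure into cropped_pics/.
import os

pdf_file = 'report.pdf'        # hypothetical input file
pic_dir = 'pages'              # page PNGs and big.png end up here
crop_dir = 'cropped_pics'      # cropped figures end up here
os.makedirs(pic_dir, exist_ok=True)
os.makedirs(crop_dir, exist_ok=True)

extractor = GetPic(pdf_file)
tmp, texts, logs, _, _ = extractor.main(pic_dir, crop_dir, pgn=0)
for line in texts:
    print(line)  # caption|random id|x1|x2|y1|y2|page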
Example #38
def parse(pdf_path):
    global eps
    # Save the text content
    key = pdf_path.split('/')[-1]
    print('extracting from ', key)
    fp = open(pdf_path, 'rb')  # open in binary read mode
    # Create a pdf document parser from the file object
    parser = PDFParser(fp)
    # Create a PDF document
    doc = PDFDocument()
    # Connect the parser and the document object
    parser.set_document(doc)
    doc.set_parser(parser)

    # Supply the initial password
    # If there is no password, pass an empty string
    try:
        doc.initialize()
    except PDFEncryptionError:
        return

    # Check that the document allows text extraction; if not, give up
    if not doc.is_extractable:
        return
        # raise PDFTextExtractionNotAllowed
    else:
        # Create the PDF resource manager to handle shared resources
        rsrcmgr = PDFResourceManager()
        # Layout parameters and the page aggregator device
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF page interpreter
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Counters for pages, images, curves, figures and horizontal text boxes
        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0

        # Iterate over the pages, processing one page at a time
        for page in doc.get_pages():  # doc.get_pages() returns the page list
            num_page += 1  # one more page
            pre_sent = {'text': '', 'height': 0, 'left': 0, 'width': 0}
            post_sent = {'text': '', 'height': 0, 'left': 0, 'width': 0}
            pre_flag = False
            post_flag = False
            try:
                interpreter.process_page(page)
            except KeyError:
                continue
            except AssertionError:
                continue
            except OSError:
                continue
            f = open(key[:-4] + '.txt', 'a', encoding='utf-8')
            f.write('\n\n')
            f.close()
            # Receive the LTPage object for this page
            layout = device.get_result()

            text_dic_list = []  # list of dicts; each entry concatenates strings that share the same width
            for x in layout:
                if isinstance(x, LTImage):  # image object
                    num_image += 1
                if isinstance(x, LTCurve):  # curve object
                    num_curve += 1
                if isinstance(x, LTFigure):  # figure object
                    num_figure += 1
                if isinstance(x, LTTextBoxHorizontal):  # text content
                    num_TextBoxHorizontal += 1  # one more horizontal text box
                    # results = x.get_text()
                    results = ""
                    for i in x._objs:
                        for j in i._objs:
                            temch = j._text[0]
                            for w in range(1, len(j._text)):
                                if is_chinese(j._text[w]):
                                    ch = j._text[w]
                                    break
                            results += temch
                    height = x._avg_lineheight
                    for gethei in range(len(x._objs[0]._objs)):
                        if is_chinese(x._objs[0]._objs[gethei]._text[0]):
                            height = x._objs[0]._objs[gethei].height

                    if match_pattern(results):  # check whether the heuristic rules match
                        nresults = spe_pun_drop(results)
                        inserted = False
                        for item in text_dic_list:  # main cleaning step: rejoin sentences split across pdf lines by merging same-width strings on the page, separated by spaces
                            if (abs(item['hide'] - (height)) < eps
                                ) and abs(item['left'] - x.x0) < 5 * height:
                                if pre_flag and abs(
                                        pre_sent['height'] -
                                        height) < eps and pre_sent[
                                            'width'] >= x.width - height * 5:
                                    nresults = pre_sent['text'] + nresults
                                if (item['y0'] -
                                        x.y0) > 4 * height and nresults[
                                            0] != '\n':  # insert a newline between parallel paragraphs
                                    nresults = '\n' + nresults
                                item['text'] += (nresults)
                                if x.x0 > item['left']:
                                    item['left'] = x.x0
                                if x.y0 < item["y0"]:
                                    item["y0"] = x.y0
                                inserted = True
                                break
                        if not inserted:
                            if pre_flag and abs(
                                    pre_sent['height'] -
                                    height) < eps and pre_sent[
                                        'width'] >= x.width - height * 5:
                                nresults = pre_sent['text'] + nresults
                            text_dic_list.append({
                                'hide': height,
                                'left': x.x0,
                                'width': x.width,
                                'text': nresults,
                                "y0": x.y0
                            })
                        pre_flag = False
                        post_flag = True
                    else:
                        fun_flag = False  # handle the case where the last sentence of a pdf paragraph never gets appended
                        if post_flag == True:
                            nresults = spe_pun_drop(results)
                            for item in text_dic_list:
                                if (abs(item['hide'] - (height)) < eps and
                                        abs(item['left'] - x.x0) < height * 5):
                                    item['text'] += (nresults)
                                    fun_flag = True
                                    break
                            if fun_flag == False:  # handle consecutive lines that end without punctuation
                                post_flag = False
                        if not fun_flag:
                            if pre_flag and abs(
                                    pre_sent['height'] - height) < eps and abs(
                                        pre_sent['left'] -
                                        x.x0) < height * 5:  # handle multiple lines with no punctuation between them
                                pre_sent['text'] += spe_pun_drop(results)
                            else:
                                pre_sent['text'] = spe_pun_drop(results)
                                pre_sent['height'] = height
                                pre_sent['left'] = x.x0
                                pre_sent['width'] = x.width
                            pre_flag = True
            for item in text_dic_list:
                f = open(key[:-4] + '.txt', 'a', encoding='utf-8')
                f.write(item['text'] + '\n')
                f.close()

        print('Object counts:\n', 'pages: %s\n' % num_page, 'images: %s\n' % num_image,
              'curves: %s\n' % num_curve, 'horizontal text boxes: %s\n' % num_TextBoxHorizontal)
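The parse() function above also relies on helpers that are not shown (is_chinese, match_pattern, spe_pun_drop). A minimal sketch of what the first of these might look like, inferred only from how it is called; the name and exact rule are assumptions:

# Hypothetical helper assumed by parse(): True when a character falls in the
# CJK Unified Ideographs block.
def is_chinese(ch):
    return '\u4e00' <= ch <= '\u9fff'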
Example #39
def pdf_to_str(pdf_filepath):
    """Returns the contents of pdf as a string."""
    
    # Code is taken and modified from:
    # https://gist.github.com/vinovator/c78c2cb63d62fdd9fb67
    
    # pdfTextMiner.py
    # Python 2.7.6
    # For Python 3.x use pdfminer3k module
    # This link has useful information on components of the program
    # https://euske.github.io/pdfminer/programming.html
    # http://denis.papathanasiou.org/posts/2010.08.04.post.html
    
    ''' This is what we are trying to do:
    1) Open the PDF file
    2) Parse the file using a PDFParser object
    3) Assign the parsed content to a PDFDocument object
    4) Process the information in the PDFDocument object; for this we need a
       PDFPageInterpreter, a PDFDevice and a PDFResourceManager
    5) Finally, process the file page by page
    '''
    
#    my_file = os.path.join("./" + pdf_filepath)
    
    password = ""
    extracted_text = ""
    
    # Open and read the pdf file in binary mode
    fp = open(pdf_filepath, "rb")
    
    # Create parser object to parse the pdf content
    parser = PDFParser(fp)
    
    # Store the parsed content in PDFDocument object
    document = PDFDocument(parser, password)
    
    # Check if document is extractable, if not abort
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create PDFResourceManager object that stores shared resources such as fonts or images
    rsrcmgr = PDFResourceManager()
    
    # set parameters for analysis
    laparams = LAParams()
    
    # Create a PDFDevice object which translates interpreted information into desired format
    # Device needs to be connected to resource manager to store shared resources
    # device = PDFDevice(rsrcmgr)
    # Switch the device to a page aggregator to get LT object elements
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    
    # Create interpreter object to process page content from PDFDocument
    # Interpreter needs to be connected to resource manager for shared resources and device 
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Ok now that we have everything to process a pdf document, lets process it page by page
    
    for page in PDFPage.create_pages(document):
        # As the interpreter processes the page stored in PDFDocument object
        interpreter.process_page(page)
        # The device renders the layout from interpreter
        layout = device.get_result()
        # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                extracted_text += lt_obj.get_text()

    # close the pdf file
    fp.close()
    
    # print (extracted_text.encode("utf-8"))
    return extracted_text
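On recent pdfminer.six releases the same end result can usually be obtained with the high-level helper, which wires up the parser, document, resource manager, device and interpreter internally. A sketch, assuming pdfminer.six (not pdfminer3k) is installed and "sample.pdf" is a placeholder path:

# Equivalent high-level call; not a drop-in replacement if per-object layout
# information is needed.
from pdfminer.high_level import extract_text

text = extract_text("sample.pdf")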
Example #40
    def _extrace_from_words(self, pdf_path):
        # Open in binary read mode
        file = open(pdf_path, 'rb')
        # Create a pdf document parser from the file object
        parser = PDFParser(file)
        # Create a PDF document object that stores the document structure; pass the
        # password for initialisation, or omit the argument if there is none
        doc = PDFDocument(parser, password='')
        # Check whether the file allows text extraction
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create the PDF resource manager to handle shared resources (caching=False disables caching)
        rsrcmgr = PDFResourceManager(caching=False)
        # Create the layout analysis parameters
        laparams = LAParams()
        # Create a PDF page aggregator device
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF page interpreter
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # doc.get_outlines() would return the document outline (table of contents);
        # documents without an outline raise pdfminer.pdfdocument.PDFNoOutlines
        # print(doc.get_outlines())

        # Get the page list
        # print(PDFPage.get_pages(doc))
        # Iterate over the pages, processing one page at a time
        # money stores the tax amount, the amount and the total
        money = []
        # data stores the service names
        data = ""
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # Receive the LTPage object for this page
            layout = device.get_result()
            # layout is an LTPage object holding the objects parsed from this page,
            # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal and so on
            for x in layout:
                # If x is a horizontal text box
                if (isinstance(x, LTTextBoxHorizontal)):
                    # print('kkk')
                    text_info = x.get_text()
                    if "*" in text_info:
                        if x.x0 > (layout.x1 / 2):
                            continue
                        service_info = text_info.replace("\n", " ")
                        index = service_info.index("*")
                        data += service_info[index:]

                    elif "¥" in text_info or "¥" in text_info:
                        money.append(text_info.replace("\n", "").replace(" ", ""))
        info = []
        info.append(data)
        money.sort()
        money_len = len(money)
        money_info = []
        if money_len >= 3:
            money_info = money[:3]
        elif money_len == 2:
            money_info = [0, money[0], money[1]]
        elif money_len == 1:
            money_info = [0, 0, money[0]]
        elif money_len == 0:
            money_info = [0, 0, 0]
        info.extend(money_info)
        return info
Example #41
class PdfExtractor:

    def __init__(self, w, h, file):
        self.w = w
        self.h = h
        self.file = file
        self.__layout_pages()
        self.layout = []

    def __layout_pages(self):
        # Open a PDF file.
        fp = open(self.file, 'rb')
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        document = PDFDocument(parser)
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams(line_margin=0.1)
        # Create a PDF page aggregator object.
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        # Process each page contained in the document.
        self.pages = dict((pageno, page) for (pageno, page) in enumerate(PDFPage.create_pages(document)))

    def layout_page(self, page_number):
        self.interpreter.process_page(self.pages.get(page_number))  # receive the LTPage object for this page
        return self.device.get_result()

    def get_text_box_list(self, page_number):
        text_box = []
        layout = self.layout_page(page_number)
        for obj in layout:
            if isinstance(obj, LTTextBox):
                text_box.append(((obj.x0 / layout.width) * self.w,
                                    (obj.x1 / layout.width) * self.w,
                                    self.h - (obj.y0 / layout.height) * self.h,
                                    self.h - (obj.y1 / layout.height) * self.h))
                print(obj.get_text(), "   ", self.h - (obj.y0 / layout.height) * self.h -
                      (self.h - (obj.y1 / layout.height) * self.h))
        return text_box

    def get_text_box(self, obj):
        if isinstance(obj, LTTextBox):
            text_box =((obj.x0 / self.layout.width) * self.w,
                             (obj.x1 / self.layout.width) * self.w,
                             self.h - (obj.y0 / self.layout.height) * self.h,
                             self.h - (obj.y1 / self.layout.height) * self.h)
            return text_box
        return None

    def get_text_chunks(self, page_number):
        text_chunks = []
        self.layout = self.layout_page(page_number)
        for obj in self.layout:
            if isinstance(obj, LTTextBox):
                text_chunks.append(TextBox(obj.x0, obj.x1, self.h - obj.y0, self.h - obj.y1, obj.get_text().strip("\n")))
        return text_chunks
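A minimal usage sketch for PdfExtractor (the file path and target canvas size are hypothetical):

# Hypothetical driver: map the text boxes of page 0 onto an 800x600 canvas and
# print their scaled (x0, x1, y_top, y_bottom) tuples.
extractor = PdfExtractor(800, 600, 'sample.pdf')
boxes = extractor.get_text_box_list(0)
print(boxes)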
Example #42
    def post(self, request, *args, **kwargs):

        file_serializer = FileSerializer(data=request.data)
        if file_serializer.is_valid():
            file_serializer.save()
            file_path = "D:/file/fileupload" + file_serializer.data["file"]
            FilePointer = open(file_path, "r")
            t = file_path.split(".")
            if (t[1] == "pdf"):

                bt1 = [
                    'define', 'describe', 'draw', 'find', 'identify', 'label',
                    'list', 'locate', 'match', 'memorise', 'name', 'recall',
                    'recite', 'recognize', 'relate', 'reproduce', 'select',
                    'state', 'tell', 'write'
                ]
                bt2 = [
                    'compare', 'convert', 'demonstarte', 'describe', 'discuss',
                    'distinguish', 'explain',
                    'find out more information about', 'generalize',
                    'interpret', 'outline', 'paraphrase', 'predict',
                    'put into your own words', 'relate', 'restate',
                    'summarize', 'translate', 'visualize'
                ]
                bt3 = [
                    'apply', 'calculate', 'change', 'choose', 'complete',
                    'construct', 'examine', 'illustrate', 'interpret', 'make',
                    'manipulate', 'modify', 'produce', 'put into practice',
                    'put together', 'solve', 'show', 'translate', 'use'
                ]
                bt4 = [
                    'advertise', 'analyse', 'categoriase', 'compare',
                    'contrast', 'deduce', 'differenciate', 'distinguish',
                    'examine', 'explain', 'identify', 'investigate',
                    'seperate', 'subdivide', 'take apart'
                ]
                bt5 = [
                    'argue', 'assess', 'choose', 'compose', 'construct',
                    'create', 'criticise', 'critique', 'debate', 'decide',
                    'defend', 'design', 'determine', 'device', 'discuss',
                    'estimate', 'evaluate', 'formulate', 'imagine', 'invent',
                    'judge', 'justify', 'plan', 'predict', 'prioritise',
                    'propose', 'rate', 'recommend', 'select', 'value'
                ]
                bt6 = [
                    'add to', 'argue', 'assess', 'choose', 'combine',
                    'compose', 'construct', 'create', 'debate', 'decide',
                    'design', 'determine', 'devise', 'discuss', 'forcast',
                    'formulate', 'hypothesise', 'imagine', 'invent', 'judge',
                    'justify', 'originate', 'plan', 'predict', 'priortise',
                    'propose', 'rate', 'recommend', 'select', 'verify'
                ]
                bt = {
                    'bt1': bt1,
                    'bt2': bt2,
                    'bt3': bt3,
                    'bt4': bt4,
                    'bt5': bt5,
                    'bt6': bt6
                }
                my_file = os.path.join(file_path)
                log_file = os.path.join("D:/file/fileupload/media/log.txt")

                password = ""
                extracted_text = ""

                # Open and read the pdf file in binary mode
                fp = open(my_file, "rb")

                # Create parser object to parse the pdf content
                parser = PDFParser(fp)

                # Store the parsed content in PDFDocument object
                document = PDFDocument(parser, password)

                # Check if document is extractable, if not abort
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                # Create PDFResourceManager object that stores shared resources such as fonts or images
                rsrcmgr = PDFResourceManager()

                # set parameters for analysis
                laparams = LAParams()

                # Create a PDFDevice object which translates interpreted information into desired format
                # Device needs to be connected to resource manager to store shared resources
                # device = PDFDevice(rsrcmgr)
                # Extract the decive to page aggregator to get LT object elements
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)

                # Create interpreter object to process page content from PDFDocument
                # Interpreter needs to be connected to resource manager for shared resources and device
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # Ok now that we have everything to process a pdf document, lets process it page by page
                for page in PDFPage.create_pages(document):
                    # As the interpreter processes the page stored in PDFDocument object
                    interpreter.process_page(page)
                    # The device renders the layout from interpreter
                    layout = device.get_result()
                    # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
                    for lt_obj in layout:
                        if isinstance(lt_obj, LTTextBox) or isinstance(
                                lt_obj, LTTextLine):
                            extracted_text += lt_obj.get_text()

                # close the pdf file
                fp.close()
                """
          data = extracted_text.encode("utf-8").lower()
          read1 = data.split("q")

          btperq = []
          for i in range(1, len(read1)):
              btlevellist = []
              read1[i] = read1[i].translate(None, digits)
              read1[i] = re.sub('[.,!?]', '', read1[i])
              t = read1[i].split(" ")
              for word in range(len(t)):
                  for values in bt.values():
                      for keywords in values:
                          if (t[word] == keywords):
                              btlevellist.append(bt.keys()[bt.values().index(values)])
              btperq.append(btlevellist)
          senddata = {'question': read1, 'btlevel': btperq, 'list': zip(read1, btperq)}
          return Response(senddata, template_name='file.html')
           """
                return HttpResponse(extracted_text.encode("utf-8"))

                #return Response("it is pdf")
            #response = HttpResponse(FilePointer)
            #response['Content-Disposition'] = 'attachment; filename=NameOfFile'
            #return response
            elif (t[1] == "jpg"):

                image = cv2.imread(file_path)
                gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                # check to see if we should apply thresholding to preprocess the
                # image
                #if args["preprocess"] == "thresh":
                gray = cv2.threshold(gray, 0, 255,
                                     cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

                # make a check to see if median blurring should be done to remove
                # noise
                # elif args["preprocess"] == "blur":
                #gray = cv2.medianBlur(gray, 3)

                # write the grayscale image to disk as a temporary file so we can
                # apply OCR to it
                filename = "{}.png".format(os.getpid())
                cv2.imwrite(filename, gray)

                # load the image as a PIL/Pillow image, apply OCR, and then delete
                # the temporary file
                text = pytesseract.image_to_string(Image.open(filename))
                os.remove(filename)
                return HttpResponse(text)
            #return Response(file_serializer.data, status=status.HTTP_201_CREATED)
            else:
                bt1 = [
                    'define', 'describe', 'draw', 'find', 'identify', 'label',
                    'list', 'locate', 'match', 'memorise', 'name', 'recall',
                    'recite', 'recognize', 'relate', 'reproduce', 'select',
                    'state', 'tell', 'write'
                ]
                bt2 = [
                    'compare', 'convert', 'demonstarte', 'describe', 'discuss',
                    'distinguish', 'explain',
                    'find out more information about', 'generalize',
                    'interpret', 'outline', 'paraphrase', 'predict',
                    'put into your own words', 'relate', 'restate',
                    'summarize', 'translate', 'visualize'
                ]
                bt3 = [
                    'apply', 'calculate', 'change', 'choose', 'complete',
                    'construct', 'examine', 'illustrate', 'interpret', 'make',
                    'manipulate', 'modify', 'produce', 'put into practice',
                    'put together', 'solve', 'show', 'translate', 'use'
                ]
                bt4 = [
                    'advertise', 'analyse', 'categoriase', 'compare',
                    'contrast', 'deduce', 'differenciate', 'distinguish',
                    'examine', 'explain', 'identify', 'investigate',
                    'seperate', 'subdivide', 'take apart'
                ]
                bt5 = [
                    'argue', 'assess', 'choose', 'compose', 'construct',
                    'create', 'criticise', 'critique', 'debate', 'decide',
                    'defend', 'design', 'determine', 'device', 'discuss',
                    'estimate', 'evaluate', 'formulate', 'imagine', 'invent',
                    'judge', 'justify', 'plan', 'predict', 'prioritise',
                    'propose', 'rate', 'recommend', 'select', 'value'
                ]
                bt6 = [
                    'add to', 'argue', 'assess', 'choose', 'combine',
                    'compose', 'construct', 'create', 'debate', 'decide',
                    'design', 'determine', 'devise', 'discuss', 'forcast',
                    'formulate', 'hypothesise', 'imagine', 'invent', 'judge',
                    'justify', 'originate', 'plan', 'predict', 'priortise',
                    'propose', 'rate', 'recommend', 'select', 'verify'
                ]
                bt = {
                    'bt1': bt1,
                    'bt2': bt2,
                    'bt3': bt3,
                    'bt4': bt4,
                    'bt5': bt5,
                    'bt6': bt6
                }
                data = FilePointer.read()
                data = data.lower()
                read1 = data.split("q")
                btperq = []
                for i in range(1, len(read1)):
                    btlevellist = []
                    read1[i] = read1[i].translate(str.maketrans('', '', digits))
                    read1[i] = re.sub('[.,!?]', '', read1[i])
                    t = read1[i].split(" ")
                    for word in range(len(t)):
                        for values in bt.values():
                            for keywords in values:
                                if (t[word] == keywords):
                                    btlevellist.append(
                                        list(bt.keys())[list(bt.values()).index(values)])
                    btperq.append(btlevellist)
                senddata = {
                    'question': read1,
                    'btlevel': btperq,
                    'list': zip(read1, btperq)
                }
                return Response(senddata, template_name='file.html')

        else:
            return Response(file_serializer.errors,
                            status=status.HTTP_400_BAD_REQUEST)
Example #43
def convert_pdf_to_text(pdf_path: Union[object, str], docketnum: str) -> str:
    """ Takes path (or pathlib Path object) to a PDF file, docketnum and
    returns text inside PDF"""

    # SET PATHS
    extracted_text_path = extracted_text_path_gen(dirs["extracted_text"],
                                                  docketnum)

    logging.info(f"Converting pdf to text for docket {docketnum}...")
    password = ""
    extracted_text = ""

    # Open and read the pdf file in binary mode
    fp = open(pdf_path, "rb")

    # Create parser object to parse the pdf content
    parser = PDFParser(fp)

    # Store the parsed content in PDFDocument object
    try:
        document = PDFDocument(parser, password)
    except Exception as e:
        logging.error("Something went wrong during conversion")
        logging.exception(e)
        logging.info(
            "Returning no extracted text for docket {}".format(docketnum))
        return extracted_text

    # Check if document is extractable, if not abort
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create PDFResourceManager object that stores shared resources such as fonts or images
    rsrcmgr = PDFResourceManager()

    # set parameters for analysis
    laparams = LAParams()

    # Create a device object which translates interpreted information into desired format
    # Device needs to be connected to resource manager to store shared resources
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)

    # Create interpreter object to process page content from PDFDocument
    # Interpreter needs to be connected to resource manager for shared resources and device
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Ok now that we have everything to process a pdf document, lets process it page by page
    for page in PDFPage.create_pages(document):
        # As the interpreter processes the page stored in PDFDocument object
        interpreter.process_page(page)
        # The device renders the layout from interpreter
        layout = device.get_result()
        # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                extracted_text += lt_obj.get_text()
    # close the pdf file
    fp.close()

    with open(extracted_text_path, "wb") as fout:
        fout.write(extracted_text.encode("utf-8"))
    logging.info("Text extracted successfully")
    return extracted_text
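convert_pdf_to_text assumes a dirs mapping and an extracted_text_path_gen helper defined elsewhere in the module. A plausible sketch of that helper, purely as an assumption about its behaviour:

# Hypothetical helper: build the output path for a docket's extracted text.
import os

def extracted_text_path_gen(extracted_text_dir, docketnum):
    return os.path.join(extracted_text_dir, "{}.txt".format(docketnum))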
Example #44
def parse(_path):
    fp = open(_path, 'rb')  # rb opens in binary read mode

    # Create a pdf document parser from the file object
    praser_pdf = PDFParser(fp)

    # Create a PDF document
    doc = PDFDocument()

    # Connect the parser and the document object
    praser_pdf.set_document(doc)
    doc.set_parser(praser_pdf)

    # Supply the initial password
    # If there is no password, pass an empty string
    doc.initialize()

    # Check that the document allows text extraction; if not, raise
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create the PDF resource manager to handle shared resources
        rsrcmgr = PDFResourceManager()

        # Create the layout analysis parameters
        laparams = LAParams()

        # Create the page aggregator
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        # Create a PDF page interpreter
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Iterate over the pages, processing one page at a time
        # doc.get_pages() returns the page list
        for page in doc.get_pages():
            # Read the page with the interpreter
            interpreter.process_page(page)

            # Get the content from the aggregator
            layout = device.get_result()

            # layout is an LTPage object holding the objects parsed from this page,
            # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal and so on;
            # the text is available through each object's text attribute
            for out in layout:
                # Check whether the object has a get_text() method (images do not)
                # if hasattr(out,"get_text"):
                if isinstance(out, LTTextBoxHorizontal):

                    results = out.get_text()
                    # print("results: " + results)
                    with open(r'pdf_val.txt', 'a') as f:
                        if "运输完成情况" in results:
                            target_value = results.split("\n")
                            inland_amount = target_value[10]
                            foreign_amount = target_value[12]
                            print("国内货邮运输量:", inland_amount, "国际货邮运输量:",
                                  foreign_amount)
                            f.write("国内货邮运输量:" + inland_amount + ",国际货邮运输量:" +
                                    foreign_amount + "\n")
                            f.close()
                            break
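A minimal usage sketch (the path is hypothetical; parse() appends what it finds to pdf_val.txt):

# Hypothetical call to the parse() function above.
parse('civil_aviation_monthly_report.pdf')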
Example #45
def uploaded_file(filename):
    # Read file
    pdf = pdfquery.PDFQuery('UPLOAD_FOLDER/pdf_temp.pdf')
    pdf.load()

    # Save xml tree
    pdf.tree.write('UPLOAD_FOLDER/test.xml', pretty_print=True)
    pq_items = pdf.pq('LTTextBoxVertical, LTTextLineHorizontal')
    items = pd.DataFrame(
        columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])

    for pq in pq_items:
        page_pq = next(pq.iterancestors('LTPage'))  # Use just the first ancestor
        page_num = page_pq.layout.pageid

        cur_str_item = str(pq.layout)

        tmp_items = pd.DataFrame([[
            get_name(cur_str_item),
            float(get_coordinates(cur_str_item)[0]),
            float(get_coordinates(cur_str_item)[2]),
            float(get_coordinates(cur_str_item)[1]),
            float(get_coordinates(cur_str_item)[3])
        ]],
            columns=['name', 'x0', 'x1', 'y0', 'y1'])

        # tmp_items['height'] = tmp_items['y1'] - tmp_items['y0']
        # tmp_items['width'] = tmp_items['x1'] - tmp_items['x0']

        tmp_items['height'] = get_diff3(tmp_items['y1'], tmp_items['y0'])
        tmp_items['width'] = get_diff3(tmp_items['x1'], tmp_items['x0'])

        tmp_items['page_num'] = page_num

        items = items.append(tmp_items, ignore_index=True)

    # PDF converted to DF
    items = items.sort_values(['page_num', 'x0', 'y1'], ascending=[True, True, False])
    items.reset_index(inplace=True, drop=True)

    # Height distribution
    heights = pd.crosstab(index=items["height"], columns="count")
    heights = heights[heights['count'] > 1]

    cat_h = round3(max(heights[heights['count'] >= min_dish_count].index.values))
    tmp = heights[heights['count'] >= min_dish_count].index.values
    item_h = round3(max(tmp[tmp < cat_h]))

    # Plot all boxes
    pdf_boundary_boxes(
        df=items, path_input='UPLOAD_FOLDER/pdf_temp.pdf', path_output='UPLOAD_FOLDER/temp.pdf', r=50, g=0, b=100)

    ########################      Get categories ####################################

    cat_list = items[items['height'].between(0.99 * cat_h, 1.01 * cat_h)]
    cat_char_w = cat_list.apply(lambda row: mean_char(row['width'], row['name']), axis=1).median()
    cat_char_w_max = cat_list.apply(lambda row: mean_char(row['width'], row['name']), axis=1).max()

    # Collapse rows that belong to the same category line
    cat_list = collapse_rows(cat_list, sense=1.03)
    cat_list = cat_list.sort_values(['page_num', 'y1', 'x0'], ascending=[True, False, True])

    filter = cat_list["name"] != ' '
    cat_list = cat_list[filter]
    cat_list = cat_list.reset_index(drop=True)

    # Draw category boxes
    pdf_boundary_boxes(df=cat_list, path_input='UPLOAD_FOLDER/pdf_temp.pdf', show_height=False,
                       show_number=True, path_output='UPLOAD_FOLDER/temp1.pdf')


    #################### Get items ###############################################

    items_list = items[items['height'].between(0.99 * item_h, 1.01 * item_h)]
    items_list = items_list.reset_index(drop=True)
    items_list = collapse_rows(items_list)

    # Delete empty items
    filter = items_list["name"] != ' '
    items_list = items_list[filter]
    items_list = items_list.reset_index(drop=True)

    # Get dishes
    patternDel = "^[0-9 \. \/]+$"
    filter = items_list['name'].str.contains(patternDel)
    dishes_list = items_list[~filter]
    dishes_list = dishes_list.reset_index(drop=True)

    # Dishes to layout
    pdf_boundary_boxes(
        df=dishes_list,
        path_input="UPLOAD_FOLDER/temp1.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes.pdf",
        show_height=False,
        r=0,
        g=0,
        b=230)

    # Get prices
    prices_list = items_list[~items_list.name.isin(dishes_list.name)]
    prices_list = prices_list.reset_index(drop=True)

    # Prices to layout
    pdf_boundary_boxes(
        df=prices_list,
        path_input="UPLOAD_FOLDER/temp_dishes.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes_prices.pdf",
        show_height=False,
        r=230,
        g=0,
        b=0)

    ################################# Second algo ###################################

    fp = open('UPLOAD_FOLDER/pdf_temp.pdf', 'rb')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pages = PDFPage.get_pages(fp)

    # Show the new structure
    for page in pages:
        print('Processing next page...')
        interpreter.process_page(page)
        layout = device.get_result()
        for lobj in layout:
            if isinstance(lobj, LTTextBox):
                x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()

    # Get categories (only for the first page)
    cat_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
    for lobj in layout:
        if isinstance(lobj, LTTextBox):
            x0, y1, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text().split("\n")[0]
            x1, y0 = lobj.bbox[2], lobj.bbox[1]
            tmp = cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)].copy()
            tmp['name'] = text
            if len(cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)]['name']) > 0:
                if (text != cat_list[cat_list['y1'].between(0.97 * y1, 1.03 * y1)]['name'].values[0]):
                    tmp['x0'] = x0

            cat_n = cat_n.append(tmp, ignore_index=True)

    # Re-draw the layout with the categories

    pdf_boundary_boxes(
        df=cat_n,
        show_height=False,
        show_number=True,
        path_input="UPLOAD_FOLDER/temp_dishes_prices.pdf",
        path_output="UPLOAD_FOLDER/temp_cat_n.pdf",
    )

    #Get prices
    pq_items1 = pdf.pq('LTTextLineVertical')

    items1 = pd.DataFrame(
        columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])

    for pq in pq_items1:
        page_pq = next(pq.iterancestors('LTPage'))  # Use just the first ancestor
        page_num = page_pq.layout.pageid

        cur_str_item = str(pq.layout)

        tmp_items = pd.DataFrame([[
            get_name(cur_str_item),
            float(get_coordinates(cur_str_item)[0]),
            float(get_coordinates(cur_str_item)[2]),
            float(get_coordinates(cur_str_item)[1]),
            float(get_coordinates(cur_str_item)[3])
        ]],
            columns=['name', 'x0', 'x1', 'y0', 'y1'])

        # tmp_items['height'] = tmp_items['y1'] - tmp_items['y0']
        # tmp_items['width'] = tmp_items['x1'] - tmp_items['x0']

        tmp_items['height'] = get_diff3(tmp_items['y1'], tmp_items['y0'])
        tmp_items['width'] = get_diff3(tmp_items['x1'], tmp_items['x0'])

        tmp_items['page_num'] = page_num

        items1 = items1.append(tmp_items, ignore_index=True)

    items1 = items1.sort_values(['page_num', 'x0', 'y1'], ascending=[True, True, False])
    items1.reset_index(inplace=True, drop=True)

    patternDel = '^ *\d[\d ]*$'
    filter = items1['name'].str.contains(patternDel)
    items1 = items1[filter]
    items1 = items1.reset_index(drop=True)

    prices_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1'])

    for i in range(0, len(items1)):
        big_prices = items1.iloc[i]['name']
        height_a = items1.iloc[i]['height'] / len(big_prices)

        tmp_len = len(big_prices)
        for j in range(0, tmp_len):
            # tmp_prices_n = pd.DataFrame(columns=['name', 'x0', 'x1', 'y0', 'y1', 'height', 'width', 'page_num'])
            tmp_name = items1.iloc[i]['name'][j]
            y1_temp = items1.iloc[i]['y1'] - j * height_a
            y0_temp = items1.iloc[i]['y0'] + j * height_a
            x0, x1 = items1.iloc[i]['x0'], items1.iloc[i]['x1']

            tmp_prices_n = pd.DataFrame({
                'name': [tmp_name],
                'x0': x0,
                'x1': x1,
                'y0': y0_temp,
                'y1': y1_temp
            },
                index=[0])

            prices_n = prices_n.append(tmp_prices_n, ignore_index=True)

    prices_n = prices_n.sort_values(['x0', 'y1'], ascending=[True, False])
    prices_n.reset_index(inplace=True, drop=True)

    #Draw new layout

    pdf_boundary_boxes(
        df=prices_n,
        path_input="UPLOAD_FOLDER/temp_cat_n.pdf",
        path_output="UPLOAD_FOLDER/temp_dishes_prices_n.pdf",
        show_height=False,
        r=230,
        g=0,
        b=0)


    #return 'Done'
    return send_from_directory(upload_path, 'temp_dishes_prices_n.pdf')
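This example leans on several helpers that are not shown (get_name, get_coordinates, get_diff3, collapse_rows, pdf_boundary_boxes, mean_char). A rough sketch of what the first three might look like, assuming get_name and get_coordinates parse the string representation of a pdfminer layout object such as "<LTTextLineHorizontal 53.880,697.086,141.433,711.086 'Pizza 12.50\n'>"; the names and behaviour are inferred only from how they are called above:

# Hypothetical helpers assumed by uploaded_file().
import re

def get_coordinates(layout_str):
    # Returns the four bbox values (x0, y0, x1, y1) as strings, in repr order.
    return re.search(r'([-\d.]+),([-\d.]+),([-\d.]+),([-\d.]+)', layout_str).groups()

def get_name(layout_str):
    # Returns the quoted text of the layout object, with escaped newlines removed.
    match = re.search(r"'(.*)'", layout_str, re.S)
    return match.group(1).replace('\\n', ' ').strip() if match else ''

def get_diff3(a, b):
    # Assumed to be the difference rounded to three decimals; works for scalars and Series.
    return round(a - b, 3)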
Example #46
# Output text file
output_txt = open('output.txt', 'w')


def print_and_write(txt):
    print(txt)
    output_txt.write(txt)
    output_txt.write('\n')


with open(sys.argv[1], 'rb') as f:
    # Pass the file object to PDFPage.get_pages() to iterate over PDFPage objects in order.
    # For files that take a long time, pass the keyword argument pagenos with a list of
    # (0-based) page numbers to process.
    for page in PDFPage.get_pages(f):
        print_and_write('\n====== page break ======\n')
        interpreter.process_page(page)  # process the page
        layout = device.get_result()  # get the LTPage object

        # Get the list of text boxes on the page.
        boxes = find_textboxes_recursively(layout)

        # Sort the text boxes by the coordinates of their top-left corner.
        # y1 grows upwards, so its sign is flipped.
        boxes.sort(key=lambda b: (-b.y1, b.x0))

        for box in boxes:
            print_and_write('-' * 10)  # print a separator line for readability
            print_and_write(box.get_text().strip())  # print the text inside the text box

output_txt.close()
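The script above assumes that interpreter, device and find_textboxes_recursively were defined earlier in the file. A minimal sketch of those missing pieces, under the assumption that the usual pdfminer aggregator setup is intended:

# Hypothetical setup assumed by the script above.
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTContainer, LTTextBox
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager

laparams = LAParams(detect_vertical=True)  # also pick up vertical Japanese text
rsrcmgr = PDFResourceManager()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

def find_textboxes_recursively(layout_obj):
    # Collect LTTextBox objects, descending into nested containers.
    if isinstance(layout_obj, LTTextBox):
        return [layout_obj]
    if isinstance(layout_obj, LTContainer):
        boxes = []
        for child in layout_obj:
            boxes.extend(find_textboxes_recursively(child))
        return boxes
    return []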
Example #47
def parse_pdf(file_path, method='tika'):
    """
    Given a PDF file complete path, the function parses the file, counts the number of pages and checks if
    it is text-extractable.

    Parameters
    ----------
    file_path: string
        Complete path to the PDF file to parse.
    method: string
        Method used to extract the text: 'pdfminer', 'pypdf', 'tika'.

    Return
    ------
    extracted_text: string
        Text extracted from the document.
    number_of_pages: int
        Number of pages of the document.
    """

    if method == 'pdfminer':
        with open(file_path, "rb") as fp:
            # Create parser object to parse the pdf content
            pdf_parser = PDFParser(fp)

            # Store the parsed content in PDFDocument object
            document = PDFDocument(pdf_parser)

            # Check if document is text-extractable or not
            is_extractable = document.is_extractable

            # Check if document is extractable, if not abort
            if not is_extractable:
                raise PDFTextExtractionNotAllowed

            # Create PDFResourceManager object that stores shared resources such as fonts or images
            rsrcmgr = PDFResourceManager()

            # Set parameters for analysis
            laparams = LAParams()

            # Switch the device to a page aggregator to get LT object elements
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)

            # Create interpreter object to process page content from PDFDocument
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            extracted_text = ""
            number_of_pages = 0

            # Process PDF document page by page
            for page in PDFPage.create_pages(document):
                number_of_pages = number_of_pages + 1
                extracted_text += f"[Page {number_of_pages}]\n"

                # As the interpreter processes the page stored in PDFDocument object
                interpreter.process_page(page)

                # The device renders the layout from interpreter
                layout = device.get_result()

                # Out of the many LT objects within layout, we are interested in LTTextBox and LTTextLine
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(
                            lt_obj, LTTextLine):
                        extracted_text += lt_obj.get_text()

    elif method == 'pypdf':
        with open(file_path, 'rb') as f:
            pdf = PdfFileReader(f)
            number_of_pages = pdf.getNumPages()
            extracted_text = ''.join([
                f'[Page {i}]\n' + pdf.getPage(i).extractText()
                for i in range(number_of_pages)
            ])

    elif method == 'tika':
        raw = parser.from_file(file_path)
        extracted_text = raw['content']
        number_of_pages = int(raw['metadata']['xmpTPg:NPages'])

    else:
        logging.error(f'Text extractor method {method} not found')
        extracted_text, number_of_pages = '', 0

    return extracted_text, number_of_pages
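A minimal usage sketch (the path is hypothetical):

# Hypothetical call comparing two of the extraction back-ends.
text_a, pages_a = parse_pdf('sample.pdf', method='pdfminer')
text_b, pages_b = parse_pdf('sample.pdf', method='tika')
print(pages_a, pages_b, len(text_a), len(text_b))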
Example #48
    def load_file_text(self):
        """ Import from file types of odt, docx, pdf, epub, txt, html, htm.
        Implement character detection for txt imports.
        Do not link the new text, load it instead.
        Delete the old project folder file, insert the new file into the project folder.
        Update the database entry and keep the same id.

        param:
            import_file: filepath of file to be imported, String
            link_path:  filepath of file to be linked, String
        """

        text = ""
        # Import from odt
        if self.new_file_path[-4:].lower() == ".odt":
            text = self.convert_odt_to_text(self.new_file_path)
            text = text.replace(
                "\n",
                "\n\n")  # add line to paragraph spacing for visual format
        # Import from docx
        if self.new_file_path[-5:].lower() == ".docx":
            document = opendocx(self.new_file_path)
            list_ = getdocumenttext(document)
            text = "\n\n".join(
                list_)  # add line to paragraph spacing for visual format
        # Import from epub
        if self.new_file_path[-5:].lower() == ".epub":
            book = epub.read_epub(self.new_file_path)
            for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                try:
                    bytes_ = d.get_body_content()
                    string = bytes_.decode('utf-8')
                    text += html_to_text(
                        string
                    ) + "\n\n"  # add line to paragraph spacing for visual format
                except TypeError as e:
                    logger.debug("ebooklib get_body_content error " + str(e))
        # Import PDF
        if self.new_file_path[-4:].lower() == '.pdf':
            fp = open(self.new_file_path, 'rb')  # read binary mode
            parser = PDFParser(fp)
            doc = PDFDocument(parser=parser)
            parser.set_document(doc)
            # Potential error with encrypted PDF
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(
                            lt_obj, LTTextLine):
                        text += lt_obj.get_text(
                        ) + "\n"  # add line to paragraph spacing for visual format
            fp.close()
            # Remove excess line endings, including lines that contain only a single blank space
            text = text.replace('\n \n', '\n')
            text = text.replace('\n\n\n', '\n\n')
        # Import from html
        if self.new_file_path[-5:].lower(
        ) == ".html" or self.new_file_path[-4:].lower() == ".htm":
            with open(self.new_file_path, "r") as sourcefile:
                file_text = ""
                while 1:
                    line_ = sourcefile.readline()
                    if not line_:
                        break
                    file_text += line_
                text = html_to_text(file_text)
        # Try importing as a plain text file.
        if text == "":
            import_errors = 0
            try:
                # Can get UnicodeDecode Error on Windows so using error handler
                with open(self.new_file_path,
                          "r",
                          encoding="utf-8",
                          errors="backslashreplace") as sourcefile:
                    while 1:
                        line = sourcefile.readline()
                        if not line:
                            break
                        try:
                            text += line
                        except Exception:
                            import_errors += 1
                    # Associated with notepad files
                    if text[0:1] == "\ufeff":
                        text = text[1:]
            except Exception as e:
                msg = _("Cannot import") + str(
                    self.new_file_path) + "\n" + str(e)
                Message(self.app, _("Warning"), msg, "warning").exec_()
                return
            if import_errors > 0:
                Message(self.app, _("Warning"),
                        str(import_errors) + _(" lines not imported"),
                        "warning").exec_()
                logger.warning(self.new_file_path + ": " + str(import_errors) +
                               _(" lines not imported"))
        # Import of text file did not work
        if text == "":
            msg = str(
                self.new_file_path) + _("\nPlease check if the file is empty.")
            Message(self.app, _("Warning"),
                    _("Cannot import ") + msg, "warning").exec_()
            return

        name_split = self.new_file_path.split("/")
        filename = name_split[-1]

        cur = self.app.conn.cursor()
        # Remove old file from project folder
        cur.execute("select mediapath from source where id=?",
                    [self.old_file['id']])
        res = cur.fetchone()
        if res[0] is None:  # Internal file
            old_filepath = self.app.project_path + "/documents/" + self.old_file[
                'name']
            try:
                os.remove(old_filepath)
            except FileNotFoundError as e:
                logger.warning(_("Deleting file error: ") + str(e))
        # Insert new file into project folder
        copyfile(self.new_file_path,
                 self.app.project_path + "/documents/" + filename)
        # Update old file entry to new file
        mediapath = None
        '''if link_path != "":
            mediapath = link_path'''
        self.new_file = {
            'name': filename,
            'id': self.old_file['id'],
            'fulltext': text,
            'mediapath': mediapath,
            'memo': self.old_file['memo'],
            'owner': self.app.settings['codername'],
            'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }
        cur.execute(
            "update source set name=?,fulltext=?,mediapath=?,owner=?,date=? where id=?",
            (self.new_file['name'], self.new_file['fulltext'],
             self.new_file['mediapath'], self.new_file['owner'],
             self.new_file['date'], self.old_file['id']))
        self.app.conn.commit()
Example #49
0
filepath = 'C:/Users/lenovo/Desktop/ACL2020'
list1 = os.listdir(filepath)
list_words = []
for i in range(len(list1)):
    outs = ""
    fp = open(filepath + '/' + list1[i], 'rb')
    parser = PDFParser(fp)
    doc = PDFDocument(parser=parser)
    parser.set_document(doc=doc)
    resource = PDFResourceManager()
    laparam = LAParams()
    device = PDFPageAggregator(resource, laparams=laparam)
    interpreter = PDFPageInterpreter(resource, device)
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
        layout = device.get_result()
        for out in layout:
            if hasattr(out, 'get_text'):
                outs = out.get_text() + outs
    outs = outs.lower().replace('\n', '')
    english_pu = ['’', '“', '”']
    punctuation_map = dict((ord(char), None) for char in string.punctuation)
    without_punctuation = outs.translate(punctuation_map)  # strip punctuation from the text
    raw_words = nltk.word_tokenize(
        without_punctuation)  # tokenize the text into a list of words
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]
    # Remove stopwords, quotation marks, very short tokens, and pure digits
    filtered_words = [
        word for word in words if word not in stopwords.words('english')
        and word not in english_pu and len(word) > 2 and not word.isdigit()
    ]
Example #50
0
def pdf2csv(fp):
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    print(doc)
    # Connect the parser and document objects (only needed with the old pdfminer API;
    # the newer API wires them together in the PDFDocument constructor above).
    # parser.set_document(doc)
    # doc.set_parser(parser)
    # Supply the password for initialization (old API only; the newer API takes the
    # password as the second argument to PDFDocument).
    # doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    # if not doc.is_extractable:
    #     raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(PDFPage.create_pages(doc)):  # newer API: pages come from PDFPage
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        #import code; code.interact(local=locals());
        hlines = []
        vlines = []
        for i in layout:
            if not type(i) in (LTRect, LTLine): continue
            hlines.append(int(i.x0))
            hlines.append(int(i.x1))
            vlines.append(int(layout.height - i.y0))
            vlines.append(int(layout.height - i.y1))
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        print(layout.width, layout.height)
        i = 0
        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        while (i < len(vlines) - 1):
            if not vlines[i + 1] - vlines[i] > 5:
                i = i + 1
                continue
            j = 0
            while (j < len(hlines) - 1):
                if not hlines[j + 1] - hlines[j] > 5:
                    j = j + 1
                    continue
                draw.rectangle([(int(hlines[j]), int(vlines[i])),
                                (int(hlines[j + 1]), int(vlines[i + 1]))],
                               outline=1)
                j = j + 1
            i = i + 1
        del draw
        fp = open("out%s.png" % pageno, 'wb')
        im.save(fp, "PNG")
        fp.close()
    def getText(self):
        """ Algorithm:
        1) Transfer information from the PDF file to a PDF document object using the parser
        2) Open the PDF file
        3) Parse the file using a PDFParser object
        4) Assign the parsed content to a PDFDocument object
        5) Now the information in this PDFDocument object has to be processed.
        For this we need PDFPageInterpreter, PDFDevice and PDFResourceManager
        6) Finally process the file page by page
        """

        # Open and read the pdf file in binary mode
        with open(self.pdf_file_path, "rb") as fp:

            # Create parser object to parse the pdf content
            parser = PDFParser(fp)

            # Store the parsed content in PDFDocument object
            document = PDFDocument(parser, self.password)

            # Check if document is extractable, if not abort
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Create PDFResourceManager object that stores shared resources
            # such as fonts or images
            rsrcmgr = PDFResourceManager()

            # set parameters for analysis
            laparams = LAParams()

            # Create a PDFDevice object which translates interpreted
            # information into desired format
            # Device to connect to resource manager to store shared resources
            # device = PDFDevice(rsrcmgr)
            # Use a PDFPageAggregator device to get the LT object elements
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)

            # Create interpreter object to process content from PDFDocument
            # Interpreter needs to be connected to resource manager for shared
            # resources and device
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Initialize the text
            extracted_text = ""

            # Ok now that we have everything to process a pdf document,
            # lets process it page by page
            for page in PDFPage.create_pages(document):
                # As the interpreter processes the page stored in PDFDocument
                # object
                interpreter.process_page(page)
                # The device renders the layout from interpreter
                layout = device.get_result()
                # Out of the many LT objects within layout, we are interested
                # in LTTextBox and LTTextLine
                for lt_obj in layout:
                    if (isinstance(lt_obj, LTTextBox)
                            or isinstance(lt_obj, LTTextLine)):
                        extracted_text += lt_obj.get_text()

        return extracted_text.encode("utf-8")
Example #52
0
def get_pdf_rows(data, miner_layout=True):
    """
    Takes PDF file content as string and yield table row data for each page.

    For each page in the PDF, the function yields a list of rows.
    Each row is a list of cells. Each cell is a list of strings present in the cell.
    Note that the rows may belong to different tables.

    There are no logical tables in the PDF format, so this parses the PDF drawing
    instructions, tries to find rectangles and arrange them into rows, and then
    places the text into those rectangles.

    External dependencies:
    PDFMiner (http://www.unixuser.org/~euske/python/pdfminer/index.html).
    """

    try:
        from pdfminer.pdfparser import PDFParser, PDFSyntaxError
    except ImportError:
        raise ImportError('Please install python-pdfminer')

    try:
        from pdfminer.pdfdocument import PDFDocument
        from pdfminer.pdfpage import PDFPage
        newapi = True
    except ImportError:
        from pdfminer.pdfparser import PDFDocument
        newapi = False
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.layout import LAParams, LTRect, LTTextBox, LTTextLine, LTLine, LTChar

    parser = PDFParser(BytesIO(data))
    try:
        if newapi:
            doc = PDFDocument(parser)
        else:
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
    except PDFSyntaxError:
        return

    rsrcmgr = PDFResourceManager()
    if miner_layout:
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    else:
        device = PDFPageAggregator(rsrcmgr)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    if newapi:
        pages = PDFPage.get_pages(BytesIO(data), check_extractable=True)
    else:
        doc.initialize()
        pages = doc.get_pages()

    for npage, page in enumerate(pages):
        interpreter.process_page(page)
        page_layout = device.get_result()

        texts = sum([
            list(lttext_to_multilines(obj, page_layout))
            for obj in page_layout._objs
            if isinstance(obj, (LTTextBox, LTTextLine, LTChar))
        ], [])
        if not miner_layout:
            texts.sort(key=lambda t: (t.y0, t.x0))

        lines = list(
            uniq_lines(
                lt_to_coords(obj, page_layout) for obj in page_layout._objs
                if isinstance(obj, (LTRect, LTLine))))

        boxes = build_rows(lines)
        textrows = arrange_texts_in_rows(boxes, texts)

        yield textrows
    device.close()
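
A rough usage sketch for get_pdf_rows, assuming a local PDF file as input; the file name is a placeholder and the helpers it relies on (lt_to_coords, build_rows, and so on) are defined elsewhere in this module.

with open("statement.pdf", "rb") as f:  # placeholder file name
    data = f.read()
for page_number, rows in enumerate(get_pdf_rows(data), start=1):
    for row in rows:
        # each row is a list of cells; each cell is a list of strings
        print(page_number, [" ".join(cell) for cell in row])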
Example #53
0
def split_pdf(filename):
    # Splits a race form into individual horses and headers.
    # extract the track abbreviation and date using regular expression
    prog = re.compile(r'([A-Z]+)--(\d+)-(\d+)-(\d+) ?\(?(\d*)\)?\.pdf')
    m = prog.match(filename)
    race_date = m.group(1) + m.group(4)[-2:] + m.group(2) + m.group(3) + '_'
    # open pdf for reading
    fp = open(os.path.join(folder, filename), 'rb')
    # set up the pdfminer parser to be used to extract the location of the y coordinates
    parser = PDFParser(fp)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # set up loop variables
    global end_index
    global new_race
    global race_num
    global horse_num
    page_num = 0
    end_index = None
    new_race = False
    # set up PyPDF2 reader to split the race forms
    reader = PyPDF2.PdfFileReader(fp)
    for page in doc.get_pages():
        # split each page in the race form
        print(page_num)
        # get the y coordinates
        global ycoords
        ycoords = []
        interpreter.process_page(page)
        layout = device.get_result()
        find_lines(layout)
        page = reader.getPage(page_num)
        if new_race:
            # Extract header if there is one
            new_race = False
            race_num = None
            writer = PyPDF2.PdfFileWriter()
            page.mediaBox.upperRight = (page.mediaBox[2], ycoords.pop(0))
            page.mediaBox.lowerLeft = (page.mediaBox[0], ycoords.pop(0))
            print(page.mediaBox)
            get_race_num(layout, page.mediaBox[1], page.mediaBox[3])
            out_file = race_date + race_num + '_header.pdf'
            out_file = os.path.join(new_folder, out_file)
            writer.addPage(page)
            with open(out_file, 'wb') as outfp:
                writer.write(outfp)
        while len(ycoords) > 1:
            # get each horse from the page
            writer = PyPDF2.PdfFileWriter()
            top = ycoords.pop(0)
            page.mediaBox.upperRight = (page.mediaBox[2], top)
            page.mediaBox.lowerLeft = (page.mediaBox[0], ycoords[0])
            horse_num = None
            get_horse_num(layout, page.mediaBox[1], page.mediaBox[3])
            writer.addPage(page)
            out_file = os.path.join(new_folder, race_date + race_num + '_' + horse_num + '.pdf')
            with open(out_file, 'wb') as outfp:
                writer.write(outfp)
        # get the last horse of the page
        top = ycoords.pop(0)
        find_endofpage(layout, top)
        writer = PyPDF2.PdfFileWriter()
        page.mediaBox.upperRight = (page.mediaBox[2], top)
        page.mediaBox.lowerLeft = (page.mediaBox[0], end_index - 10)
        get_horse_num(layout, page.mediaBox[1], page.mediaBox[3])
        writer.addPage(page)
        out_file = os.path.join(new_folder, race_date + race_num + '_' + horse_num + '.pdf')
        with open(out_file, 'wb') as outfp:
            writer.write(outfp)
        page_num += 1
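
A hedged usage sketch for split_pdf; the file name below is invented purely to match the regular expression at the top of the function, and the module-level folder and new_folder globals are assumed to point at existing directories.

split_pdf("AQU--1-25-2020.pdf")  # hypothetical 'TRACK--M-D-YYYY.pdf' style name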
Example #54
0
def find_kw(link, keyword, file_type='excel', parse=False) -> tuple:
    """
    Judge the file type, then inspect the reporting year.
    :param link: file URL, or an already-opened PDF file object when parse=True
    :param keyword: keyword to search for
    :param file_type: csv, excel, ods, pdf (or stream)
    :param parse: if True, treat `link` as an already-opened PDF file object
        instead of downloading it
    :return: tuple of (matched: bool, matched text: str)
    """
    text = ''
    if file_type == 'ods':
        calc = pyexcel_ods.get_data(io.BytesIO(req.get(link).content))
        for sheet in list(calc.values()):
            for row in sheet:
                for cell in row:
                    cell_text = str(cell).strip().replace(' ', '')
                    if any(
                            cell_text.find(i) != -1
                            for i in ['時期', '月份', '民國']):
                        text = cell_text
                        if keyword in text:
                            return True, text
        return False, text

    elif file_type == 'csv':
        rows = csv.reader(
            io.StringIO(req.get(link).content.decode('big5')))
        for i in rows:
            for j in i:
                s = str(j).strip().replace('\u3000', '').replace(' ', '')
                if any(s.find(i) != -1 for i in ['中華民國']):
                    text = s
                    if keyword in text:
                        return True, text
                    else:
                        return False, text

    elif file_type == 'pdf':
        if parse:
            parser = PDFParser(link)
        else:
            parser = PDFParser(io.BytesIO(req.get(link).content))

        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        text = ''
        for index, page in enumerate(list(doc.get_pages())):
            if keyword.find('月底') != -1 and index == 1:
                break
            interpreter.process_page(page)
            layout = device.get_result()
            for o in layout:
                if isinstance(o, LTTextBoxHorizontal):
                    pdf_text = o.get_text().strip()
                    if any(pdf_text.find(i) != -1 for i in ['時期', '年', '年期']):
                        text = pdf_text.split(' ')[-1]
                    if text.find(keyword) != -1:
                        return True, text
        return False, text

    elif file_type == 'stream':
        reader = codecs.getdecoder('utf8')(bytes(req.get(link).content))[0]
        text = reader[reader.index('('):reader.index(')')].strip('()')
        if text.find(keyword) != -1:
            return True, text
        else:
            return False, text

    wb = xlrd.open_workbook(file_contents=req.get(link).content)
    sheet = wb.sheet_by_index(0)
    for i in range(sheet.nrows):
        for j in range(sheet.ncols):
            value = str(sheet.row_values(i)[j]).strip().replace(' ', '')
            if value.find('中華民國') != -1:
                text = value
                if keyword in value:
                    return True, text
    return False, text
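
A hedged usage sketch for find_kw; the URL and keyword are placeholders, and network access is assumed to be available.

matched, found_text = find_kw("https://example.org/report.pdf",  # placeholder URL
                              "109", file_type="pdf")
print(matched, found_text)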
Example #55
0
    def load_file_text(self, import_file):
        """ Import from file types of odt, docx pdf, epub, txt, html, htm.
        """

        text = ""

        # Import from odt
        if import_file[-4:].lower() == ".odt":
            text = self.convert_odt_to_text(import_file)
        # Import from docx
        if import_file[-5:].lower() == ".docx":
            #text = convert(importFile)  # uses docx_to_html
            document = opendocx(import_file)
            list_ = getdocumenttext(document)
            text = "\n".join(list_)
        # Import from epub
        if import_file[-5:].lower() == ".epub":
            book = epub.read_epub(import_file)
            for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
                #print(d.get_content())
                bytes_ = d.get_body_content()
                string = bytes_.decode('utf-8')
                text += html_to_text(string) + "\n"
        # import PDF
        if import_file[-4:].lower() == '.pdf':
            fp = open(import_file, 'rb')  # read binary mode
            parser = PDFParser(fp)
            doc = PDFDocument(parser=parser)
            parser.set_document(doc)
            # potential error with encrypted PDF
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 1.0
            laparams.word_margin = 1.0
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(doc):
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                        text += lt_obj.get_text()
        # import from html
        if import_file[-5:].lower() == ".html" or import_file[-4:].lower() == ".htm":
            importErrors = 0
            with open(import_file, "r") as sourcefile:
                fileText = ""
                while 1:
                    line = sourcefile.readline()
                    if not line:
                        break
                    fileText += line
                text = html_to_text(fileText)
                QtWidgets.QMessageBox.warning(None, _('Warning'), str(importErrors) + _(" lines not imported"))
        # Try importing as a plain text file.
        if text == "":
            import_errors = 0
            try:
                with open(import_file, "r") as sourcefile:
                    while 1:
                        line = sourcefile.readline()
                        if not line:
                            break
                        try:
                            text += line
                        except Exception as e:
                            #logger.debug("Importing plain text file, line ignored: " + str(e))
                            import_errors += 1
                    if text[0:1] == "\ufeff":  # strip the BOM associated with notepad files
                        text = text[1:]
            except Exception as e:
                QtWidgets.QMessageBox.warning(None, _('Warning'),
                    _("Cannot import ") + str(import_file) + "\n" + str(e))
                return
            if import_errors > 0:
                QtWidgets.QMessageBox.warning(None, _('Warning'),
                    str(import_errors) + _(" lines not imported"))
                logger.warning(import_file + ": " + str(import_errors) + _(" lines not imported"))
        # import of text file did not work
        if text == "":
            QtWidgets.QMessageBox.warning(None, _('Warning'),
                _("Cannot import ") + str(import_file) + _("\nPlease check if the file is empty."))
            return
        # Final checks: check for duplicated filename and update model, widget and database
        nameSplit = import_file.split("/")
        filename = nameSplit[-1]
        if any(d['name'] == filename for d in self.source):
            QtWidgets.QMessageBox.warning(None, _('Duplicate file'),
                _("Duplicate filename.\nFile not imported"))
            return
        entry = {'name': filename, 'id': -1, 'fulltext': text, 'mediapath': None, 'memo': "",
        'owner': self.settings['codername'], 'date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
        cur = self.settings['conn'].cursor()
        #logger.debug("type fulltext: " + str(type(entry['fulltext'])))
        cur.execute("insert into source(name,fulltext,mediapath,memo,owner,date) values(?,?,?,?,?,?)",
            (entry['name'],  entry['fulltext'], entry['mediapath'], entry['memo'], entry['owner'], entry['date']))
        self.settings['conn'].commit()
        cur.execute("select last_insert_rowid()")
        id_ = cur.fetchone()[0]
        entry['id'] = id_
        self.parent_textEdit.append(entry['name'] + _(" imported."))
        self.source.append(entry)
Example #56
0
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
# from pdfminer.pdfdevice import PDFDevice

# Open the PDF document to be read
pdfFile = open("一种折叠屏上使用的付款码使用方式.pdf", "rb")
# Create the PDF parser
pdfParser = PDFParser(pdfFile)
# Create the PDFDocument object
pdfDoc = PDFDocument()
# Connect the parser and the PDFDocument object
pdfParser.set_document(pdfDoc)
pdfDoc.set_parser(pdfParser)
# Initialize the document; the argument passed here is the PDF password (empty string if none)
pdfDoc.initialize("")
# Create the PDF resource manager and the layout parameter analyzer
pdfRes = PDFResourceManager()
pdfLaparams = LAParams()
# Create the page aggregator and the page interpreter
pdfDevice = PDFPageAggregator(pdfRes, laparams=pdfLaparams)
pdfInte = PDFPageInterpreter(pdfRes, pdfDevice)

for page in pdfDoc.get_pages():
    pdfInte.process_page(page)
    for out in pdfDevice.get_result():
        if hasattr(out,"get_text"):
            print(out.get_text())
#
# for (level,title,dest,a,se) in pdfDoc.get_outlines():
#     print(level,title,dest,a,se)

Example #57
0
class SvSpecParser():
    FONT_TRANSLATION = {  # "HEFBHG+TimesNewRomanPS-ItalicMT": "it",
        # "HEFBAE+TimesNewRomanPS-BoldMT": "b",
        'BVXWSQ+CourierNew,Bold': 'b',
        'BHDFJL+TimesNewRomanPSMT': None,
        'WTCCEL+TimesNewRoman,Italic': None,
        None: None
    }

    def __init__(self, ofile):
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        self.last_font = None
        self.in_rule = False
        self.font_print_pending = False
        self.ofile = ofile
        self.first_rule = True

    def parse_page(self, page):
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        self.parse_obj(layout._objs)

    def collect_lines(self, o):
        if isinstance(o, LTTextBox):
            lines = [_o for _o in o._objs if isinstance(_o, LTTextLine)]
            yield from lines
        elif isinstance(o, LTFigure):
            for _o in o._objs:
                yield from self.collect_lines(_o)

        return

    def parse_obj(self, objs):
        font_translation = self.FONT_TRANSLATION

        f = None
        tmp_lines = []
        for o in objs:
            tmp_lines.extend(self.collect_lines(o))

        tmp_lines.sort(key=lambda o: o.y0, reverse=True)
        tmp_lines = tmp_lines[2:-3]  # cut off header and footer

        for o in tmp_lines:
            text = o.get_text()
            # print(text)

            is_rule_header = "::=" in text
            if is_rule_header or self.in_rule:
                if is_rule_header:
                    if not self.first_rule:
                        self.ofile.write("\n</br>\n")
                    else:
                        self.first_rule = False

                # if is_rule_header and text.startswith("unary_module_path_operator"):
                #     print("----------")
                self.in_rule = True
                if not is_rule_header:
                    if text and o.x0 < 85:
                        self.in_rule = False
                        continue
                    self.ofile.write("    ")

                if text.strip():
                    for c in o._objs:
                        is_char = isinstance(c, LTChar)
                        if is_char:
                            if c.fontname == 'BHDEOM+Arial-BoldMT':
                                self.in_rule = False
                                # title
                                break
                        if (is_char and
                                c.matrix[-1] - o._objs[0].matrix[-1] > 3.5):
                            # sys.stderr.write(c.get_text())
                            # skipping hrefs, which are upper indexes
                            continue
                        if is_char and self.last_font != c.fontname:
                            # this character has different font need to propagate it to output
                            self.font_print_pending = True

                        if c.get_text().isspace() and font_translation[
                                self.last_font] is not None:
                            # print the font enclosing string directly after this word (ignore whitespaces behind)
                            self.font_print_pending = True
                            self.ofile.write("</%s>" % f)
                            self.last_font = None

                        if self.font_print_pending and not (
                                c.get_text().isspace()):
                            self.font_print_pending = False
                            f = font_translation[self.last_font]
                            if f:
                                self.ofile.write("</%s>" % f)
                            f = font_translation[c.fontname]
                            if f:
                                self.ofile.write("<%s>" % f)

                            self.last_font = c.fontname
                        # if text.startswith("list_of_port_declarations") and c.get_text() == "s":
                        #    print("----------")

                        self.ofile.write(c.get_text())
Example #58
0
def pdf_to_txt_miner(folder, password):
    # Get all files in the given directory
    files = os.listdir(folder)
    pdfFiles = [f for f in files if f.endswith('.pdf')]

    # Collect the PDF files into a list and process them one by one
    for pdfFile in pdfFiles:
        print(pdfFile)
        # Join the directory and file name into a path, e.g. os.path.join('root', 'test', 'runoob.txt') -> root/test/runoob.txt
        pdfPath = os.path.join(folder, pdfFile)
        # Build the path where the converted text file will be stored
        wdPath = os.path.join(txtpath, pdfFile)
        # Make sure the output path ends with a .txt extension
        if wdPath[-4:] != '.txt':
            wdPath = wdPath + '.txt'
        fn = open(pdfPath, 'rb')  # open the PDF path built above
        # Create a PDF parser: PDFParser
        parser = PDFParser(fn)
        # Create a PDF document: PDFDocument
        doc = PDFDocument()
        # Connect the parser and the document
        parser.set_document(doc)
        doc.set_parser(parser)
        # Provide the initialization password; pass an empty string if there is none
        doc.initialize('')
        # Check whether the document allows text extraction; skip it if not
        if not doc.is_extractable:
            print('PDFTextExtractionNotAllowed')
        else:
            # Create the PDF resource manager: PDFResourceManager
            resource = PDFResourceManager()
            # Create the layout analysis parameters: LAParams
            laparams = LAParams()
            # Create the aggregator used to read the document objects: PDFPageAggregator
            device = PDFPageAggregator(resource, laparams=laparams)
            # Create the interpreter that decodes page content into objects Python can work with: PDFPageInterpreter
            interpreter = PDFPageInterpreter(resource, device)
            # doc.get_pages() returns the list of pages
            num_page, num_image, num_Text = 0, 0, 0
            num = 0
            for page in doc.get_pages():

                # num+=1;
                # if (num!=2):
                #     continue;
                pdf_str = ''
                # Use the interpreter's process_page() method to parse a single page
                interpreter.process_page(page)
                layout = device.get_result()
                for out in layout:
                    if isinstance(out, LTTextBoxHorizontal):
                        num_Text += 1
                        # print(type(out.get_text()))
                        # if out.get_text().strip().replace(" ","")=="摘要":
                        #     print("found the abstract position")
                        pdf_str += out.get_text().strip()
                        f = open(wdPath, 'a', encoding='utf-8')
                        f.write(out.get_text() + '\n')
                    if isinstance(out, LTImage):
                        # LTImage objects carry no text, so only count them here
                        num_image += 1
Example #59
0
def cas_pdf_to_text(filename: Union[str, io.IOBase],
                    password) -> PartialCASData:
    """
    Parse CAS pdf and returns line data.

    :param filename: CAS pdf file (CAMS or Kfintech)
    :param password: CAS pdf password
    :return: PartialCASData containing the file type, investor info, and text lines from the CAS.
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif hasattr(filename, "read") and hasattr(filename,
                                               "close"):  # file-like object
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect:
            raise CASParseError("Incorrect PDF password!")
        except PDFSyntaxError:
            raise CASParseError("Unhandled error while opening file")

        line_margin = {
            FileType.KFINTECH: 0.1,
            FileType.CAMS: 0.2
        }.get(detect_pdf_source(document), 0.2)

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []

        investor_info = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                for el in filter(lambda x: isinstance(x, LTTextBoxVertical),
                                 layout):
                    if re.search("CAMSCASWS", el.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", el.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)
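
A hedged usage sketch for cas_pdf_to_text; the file name and password are placeholders, and PartialCASData is assumed to expose the file_type, investor_info, and lines fields it is constructed with.

cas_data = cas_pdf_to_text("cas_statement.pdf", "PAN1234X")  # placeholder inputs
print(cas_data.file_type, len(cas_data.lines))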
Example #60
0
def BatchPdfToTxt(filepath, newdir=''):
    # Split the file path into its parent directory and file name
    prapath, filename = os.path.split(filepath)
    if newdir == '':
        newdir = prapath
    new_txt_name = TraversalFun.TranType(filename, "pdf2txt")
    if new_txt_name is None:
        return
    TraversalFun.mkdir(newdir)  # create the output directory

    try:
        newpath = os.path.join(newdir, new_txt_name)
        print("Output path after conversion: " + newpath)

        # Process the document text
        fp = open(filepath, 'rb')  # open in binary read mode
        praser = PDFParser(fp)  # create a PDF parser from the file object
        doc = PDFDocument()  # create a PDF document
        praser.set_document(doc)  # connect the parser and the document object
        doc.set_parser(praser)
        doc.initialize()  # provide the initialization password; defaults to an empty string

        # Check whether the document allows text extraction; skip it if not
        if not doc.is_extractable:
            Global.error_file_list.append(filepath)
            return

        rsrcmgr = PDFResourceManager()  # create the PDF resource manager for shared resources
        laparams = LAParams()  # create the layout analysis parameters
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)  # create a PDF page interpreter
        pdfStr = ""  # string that accumulates the extracted text
        # Iterate over the list of pages, processing one page at a time
        for page in doc.get_pages():  # doc.get_pages() returns the page list
            interpreter.process_page(page)
            layout = device.get_result()  # receive the LTPage object for this page
            # layout is an LTPage object holding the objects parsed from this page,
            # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.;
            # to get the text, read the object's text attribute
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    pdfStr = pdfStr + x.get_text()
        # Save the extracted content
        with open(newpath, 'wb') as f:
            f.write(pdfStr.encode())

        # Record files below the size limit (default: under 5 KB) and delete them
        filesize = os.path.getsize(newpath)
        # print(filesize)
        # print(Global.limit_file_size)
        if filesize < Global.limit_file_size:
            Global.limit_file_list.append(
                newpath + "\t\t" +
                str(Decimal(filesize / 1024).quantize(Decimal('0.00'))) + "KB")
            os.remove(newpath)
        else:
            Global.all_FileNum += 1
    except Exception as e:
        Global.error_file_list.append(filepath)
        return
    finally:
        pass
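
A hedged usage sketch for BatchPdfToTxt; the paths are placeholders, and the TraversalFun and Global helpers are assumed to be defined elsewhere in this module.

BatchPdfToTxt("C:/Users/lenovo/Desktop/ACL2020/sample.pdf",
              newdir="C:/Users/lenovo/Desktop/ACL2020_txt")  # placeholder paths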