def getPageLayouts(fpath, password=''):
    '''Takes a path to a PDF file, extracts the text-like objects, and returns
    a list of LTPage layout objects, one per page.'''
    page_layouts = []
    try:
        # The parser and doc pair act as a "pipe" of sorts.
        with open(fpath, 'rb') as f1:
            parser = PDFParser(f1)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize(password)

            # can we extract text?
            if doc.is_extractable:
                rsrcmgr = PDFResourceManager()
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                for page in doc.get_pages():
                    # get_pages() is lazy: pages are parsed from the file on
                    # demand rather than loading the whole PDF at once.
                    interpreter.process_page(page)
                    # receive the LTPage object for the page
                    page_layouts.append(device.get_result())
    except IOError:
        raise IOError("issue with loading file, please try again")
    return page_layouts
Example #2
def pdfconvert(infullpath, file, infolder, pages=None):         #Handle PDF
    if not pages:
        pagenums = set()
    else:
        pagenums = set(pages)
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    pdffile = open(infullpath, 'rb')
   # print "pdffile=", pdffile
    for page in PDFPage.get_pages(pdffile, pagenums):
        interpreter.process_page(page)
    pdffile.close()
    converter.close()
    txtfilename = file
    jpgfile = infolder + str(txtfilename) + '.jpg'
    txtfile = corpuspath + corpusfolder + '/' + txtfilename + '.txt'

    text = output.getvalue()
    output.close()
    temp = open(txtfile, 'w')
    temp.write (text)
    temp.close()

    imagemagick_string = 'convert ' + '"' + infullpath + '" "' + jpgfile + '"'
    os.system(imagemagick_string)

    return jpgfile
Example #3
 def ParseAllPages(self, filepath):
     # Open a PDF file.
     self.filepath = filepath
     fp = open(filepath, 'rb')
     # Create a PDF parser object associated with the file object.
     parser = PDFParser(fp)
     # Create a PDF document object that stores the document structure.
     doc = PDFDocument()
     # Connect the parser and document objects.
     parser.set_document(doc)
     doc.set_parser(parser)
     # Supply the password for initialization.
     # (If no password is set, give an empty string.)
     password = ""
     doc.initialize(password)
     # Check if the document allows text extraction. If not, abort.
     if not doc.is_extractable:
         raise PDFTextExtractionNotAllowed
     # Create a PDF resource manager object that stores shared resources.
     rsrcmgr = PDFResourceManager()
     # Create a PDF device object.
     device = PDFDevice(rsrcmgr)
     # Create a PDF interpreter object.
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     # Process each page contained in the document.
     for page in doc.get_pages():
         interpreter.process_page(page)
Example #4
 def fix_text(self, filename):
     # Open a PDF file.
     pdfText = StringIO()
     fp = open(filename, 'rb')
     # Create a PDF parser object associated with the file object.
     parser = PDFParser(fp)
     # Create a PDF document object that stores the document structure.
     # Supply the password for initialization.
     if not self.password:
         document = PDFDocument(parser)
     else:
         document = PDFDocument(parser, self.password)
     # Check if the document allows text extraction. If not, abort.
     if not document.is_extractable:
         raise PDFTextExtractionNotAllowed
     # Create a PDF resource manager object that stores shared resources.
     rsrcmgr = PDFResourceManager()
     # Create a PDF device object.
     device = TextConverter(rsrcmgr, pdfText, codec=self.codec
             , laparams=LAParams(), imagewriter=None
             )
     # Create a PDF interpreter object.
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     # Process each page contained in the document.
     for page in PDFPage.create_pages(document):
         interpreter.process_page(page)
     txt = pdfText.getvalue()
     return txt
Example #5
def convert_pdf_to_txt(path):

    temp = os.path.splitext(path)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(
        fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True
    ):
        interpreter.process_page(page)

    text = retstr.getvalue()

    fp.close()
    device.close()
    retstr.close()

    outputFile = temp[0] + ".txt"
    print outputFile

    ff = open(outputFile, "w")
    ff.write(text)
    ff.close()
Example #6
def convert(url, pages=None):
    assert isinstance(url, basestring)
    assert pages == None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    web_page = urllib2.urlopen(urllib2.Request(url))
    fp = StringIO(web_page.read())
    interpreter = PDFPageInterpreter(rscmng, device)

    pdf_pages = PDFPage.get_pages(
        fp,
        set(pages if pages != None else []),
        maxpages=0,
        password='',
        caching=True,
        check_extractable=True
    )

    for page in pdf_pages:
        interpreter.process_page(page)

    result = retstr.getvalue()

    fp.close()
    web_page.close()
    device.close()
    retstr.close()

    return result
Example #7
def pdf_to_text(page_object):
    parser = PDFParser(page_object)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    # Note: with the PDFDocument(parser) API there is no initialize();
    # pass the password to the constructor instead if the PDF is encrypted.
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF page aggregator object
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    text_content = []
    # i is the zero-based page number; create_pages() yields one page per iteration
    for i, page in enumerate(PDFPage.create_pages(doc)):
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        for object in layout:
            if isinstance(object, LTTextBox) or isinstance(object, LTTextLine):
                trial = []
                trial.append(object.get_text())
                for word in trial:
                    text_content.append(word)                    
    return text_content
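A short usage sketch for pdf_to_text above: it expects an already-open binary file object, and the returned list of text fragments can simply be joined. The path is a placeholder.

# Hypothetical usage; 'document.pdf' is a placeholder path.
with open('document.pdf', 'rb') as fh:
    fragments = pdf_to_text(fh)
    print(''.join(fragments))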
Example #8
def get_pdf_text(path):
    """ Reads a pdf file and returns a dict of the text where the
        index represents the page number.
        http://stackoverflow.com/a/20905381
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # change to utf-8 if the text comes out garbled
    codec = 'ascii'
    #codec = 'utf-8'
    laparams = LAParams()
    pages = {}
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams, showpageno=True, pages=pages)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    retstr.close()
    return pages
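Assuming the TextConverter used above is a customized one that really fills the pages dict keyed by page number (the stock pdfminer TextConverter takes no pages argument), calling it would look roughly like this; the path is a placeholder:

# Hypothetical usage sketch; 'report.pdf' is a placeholder path.
pages = get_pdf_text('report.pdf')
for page_number in sorted(pages):
    print(page_number, pages[page_number])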
Example #9
    def parse(self, path):
        out = StringIO.StringIO()
        fp = None
        # Directory
        if os.path.isdir(path):
            raise NotImplementedError()
        # File
        else:
            fp = file(path)
        rsrc = PDFResourceManager()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec=codec, laparams=laparams)
        doc = PDFDocument()
        parser = PDFParser(fp)
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize()
        interpreter = PDFPageInterpreter(rsrc, device)
        for page in doc.get_pages():
            interpreter.process_page(page)
        device.close()
        sample = Sample(path, None, out.getvalue())
        out.close()
        return sample
Example #10
    def parse_pdf_pdfminer(self, f, fpath):
        try:
            laparams = LAParams()
            laparams.all_texts = True  
            rsrcmgr = PDFResourceManager()
            pagenos = set()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            page_num = 0
            for page in PDFPage.get_pages(f, pagenos, check_extractable=True):
                page_num += 1

                retstr = StringIO()
                device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                data = retstr.getvalue()
                retstr.close()

                self.parse_page(fpath, data, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
Example #11
def convert_pdf_to_txt(path, output):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()

    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos=set()

    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page)

    text = retstr.getvalue()
    fp.close()
    device.close()
    retstr.close()

    f = open(output, 'wb')
    f.write(text)
    f.close()
    return text
Example #12
    def load( self, open_file ):
        self.fields = {}
        self.text= {}

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(open_file)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        doc.initialize('')
        # Check if the document allows text extraction. If not, abort.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        # Create a PDF page aggregator object.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Process each page contained in the document.
        for pgnum, page in enumerate( doc.get_pages() ):
            interpreter.process_page(page)
            if page.annots:
                self._build_annotations( page )
            txt= self._get_text( device )
            self.text[pgnum+1]= txt
Example #13
    def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'
            laparams = LAParams()
            device = TextConverter(
                rsrcmgr, retstr, codec=codec, laparams=laparams)
            fp = file(path, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            password = ""
            maxpages = 0
            caching = True
            pagenos = set()
            for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
            fp.close()
            device.close()
            retstr.close()
            return text
        except Exception as e:
            self.logger.error(
                "Failed to convert PDF to text: " + str(e))
            text = ""
            return text
Example #14
def pdf_to_txt(in_file):
	""" turn a PDF file to a TXT file (roughly processed)
	"""
	# Open a PDF file.
	fp = open(in_file, 'rb')
	# Create a PDF parser object associated with the file object.
	parser = PDFParser(fp)
	# Create a PDF document object that stores the document structure.
	document = PDFDocument(parser)
	# Check if the document allows text extraction. If not, abort.
	if not document.is_extractable:
		raise PDFTextExtractionNotAllowed
	# Set parameters for analysis.
	laparams = LAParams()
	# Create a PDF resource manager object that stores shared resources.
	rsrcmgr = PDFResourceManager()
	# Create a PDF page aggregator object.
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	# Create a PDF interpreter object.
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	for page in PDFPage.create_pages(document):
		interpreter.process_page(page)
		# Receive the LTPage object for the page.
		layout = device.get_result()
		for klass in layout:
			if isinstance(klass, LTTextBoxHorizontal):
				out_file = in_file[:-3] + 'txt'
				with open(out_file, 'a') as dst_file:
					text = klass.get_text().encode('utf-8')
					dst_file.write(text + '\n')
	return None
Example #15
def convert_pdf_to_txt(path): 
	## TAKEN FROM STACK OVERFLOW
	## see... http://www.unixuser.org/~euske/python/pdfminer/programming.html for tutorial
	## Also see... https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
	rsrcmgr = PDFResourceManager()
	retstr = StringIO()
	codec = 'utf-8'
	laparams = LAParams()

	fp = file(path, 'rb')
	password = ""
	maxpages = 0
	caching = True
	pagenos=set()

	# Read text from pages
	device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)	
	interpreter = PDFPageInterpreter(rsrcmgr, device)	
	for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
		interpreter.process_page(page)
	str = retstr.getvalue()

	fp.close()
	device.close()
	retstr.close()

	return str
Example #16
def convert_pdf_to_txt(path):
	rsrcmgr = PDFResourceManager()
	retstr = StringIO()
	codec = 'utf-8'
	laparams = LAParams()

	device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
	fp = file(path, 'rb')
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	password = ""
	maxpages = 120
	caching = True
	pagenos=set()
	# print "two"

	for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
		interpreter.process_page(page)
	# print "one"

	try:
		fp.close()
		device.close()
		str = retstr.getvalue()
		retstr.close()
	except:
		str = retstr.getvalue()

	return str
Example #17
def get_layout(path):
	'''returns a list of per-page layouts (LTPage objects), from which every character in the document and its location can be read'''

	rsrcmgr = PDFResourceManager()
	retstr = StringIO()
	codec = 'utf-8'
	laparams = LAParams()

	fp = file(path, 'rb')
	password = ""
	maxpages = 0
	caching = True
	pagenos=set()

	layout = []
	device = PDFPageAggregator(rsrcmgr, laparams=laparams)
	interpreter = PDFPageInterpreter(rsrcmgr, device)
	for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
		interpreter.process_page(page)
		layout.append(  device.get_result()  )
	fp.close()
	device.close()
	retstr.close()

	return layout
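A small sketch of how the per-page layouts returned by get_layout can be walked down to individual characters and their bounding boxes; the LTTextBox/LTChar imports and the path are assumptions:

# Hypothetical usage; 'sample.pdf' is a placeholder path.
from pdfminer.layout import LTTextBox, LTChar

for page_layout in get_layout('sample.pdf'):
    for box in page_layout:
        if isinstance(box, LTTextBox):
            for line in box:
                for ch in line:
                    if isinstance(ch, LTChar):
                        print(ch.get_text(), ch.bbox)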
Example #18
def parsePDF(pdf_file):

    pdf_file = open(pdf_file, "rb").read()

    # Cast to StringIO object
    from StringIO import StringIO

    memory_file = StringIO(pdf_file)

    # Create a PDF parser object associated with the StringIO object
    parser = PDFParser(memory_file)

    # Create a PDF document object that stores the document structure
    document = PDFDocument(parser)

    # Define parameters for the PDF device object
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    codec = "utf-8"

    # Create a PDF device object
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        data = retstr.getvalue()
        print data
        break
Example #19
    def run(path):
        print "Calling parser :%s" % path

        t0 = time.clock()

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        fp = file(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()
        book = Book()
        i = 0
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password, caching=caching,
                                      check_extractable=True):
            page_tmp = Page()
            begin_page = len(retstr.getvalue())
            interpreter.process_page(page)
            page_tmp.text = retstr.getvalue()[begin_page:-1]
            book.pages.append(page_tmp)
        fp.close()
        device.close()
        retstr.close()
        print "Parsing in:", time.clock() - t0
        return book
Example #20
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = "utf-8"
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    file_handle = file(path, "rb")
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()

    for page in PDFPage.get_pages(
        file_handle, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True
    ):
        interpreter.process_page(page)

    text = retstr.getvalue()

    file_handle.close()
    device.close()
    retstr.close()
    return text
Example #21
def pdf2xml(infile):
    '''
    Return a string of XML representation for given PDF file handle.
    Uses pdfminer to do the conversion and does some final post-processing.
    '''

    outfile = StringIO()

    # Empirically determined...
    laparams = LAParams()
    laparams.char_margin = 0.4

    # See pdf2txt.py
    rsrcmgr = PDFResourceManager(caching=False)
    device = XMLConverter(rsrcmgr, outfile, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    if page_api:
        for page in PDFPage.get_pages(infile, set()):
            interpreter.process_page(page)
    else:
        process_pdf(rsrcmgr, device, infile, set())

    infile.close()
    return outfile.getvalue().replace("\n", "")
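pdf2xml above expects an already-open binary file handle (and closes it itself), so a minimal call, with a placeholder path, might look like:

# Hypothetical usage; 'paper.pdf' is a placeholder path.
infile = open('paper.pdf', 'rb')
xml_string = pdf2xml(infile)  # pdf2xml closes the handle itself
print(xml_string[:200])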
Example #22
def pdf_to_text(pdfname):
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    # codec = 'utf-8'
    codec = 'ascii'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text
    fp = file(pdfname, 'rb')
    for page in PDFPage.get_pages(fp):
        interpreter.process_page(page)
    fp.close()

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text
Example #23
    def Parse(self):
        # First check whether a cache exists and whether it is newer than the PDF
        if not os.path.exists(parseCacheDir):
            os.makedirs(parseCacheDir)
        cacheFile = os.path.join(parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
        foundCache = (os.path.isfile(cacheFile) and \
                      os.path.getsize(cacheFile) > 0 and \
                      os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
        if (foundCache):
            fp = open(cacheFile, 'rb')
            self.RawData = pickle.load(fp)
            fp.close()
        else:
            fp = open(self.pdfFileName, 'rb')
            for page in PDFPage.get_pages(fp, None, maxpages=1):
                rsrcmgr = PDFResourceManager()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                layout = device.get_result()
                self.__readobj(layout._objs)
                for category in self.RawData.values():
                    self.__reverseYaxis(category, layout.bbox[3])
                cacheFp = open(cacheFile, 'wb')
                pickle.dump(self.RawData, cacheFp)
                cacheFp.close()
            fp.close()

        self.__calculateBoundary()
        self.__assignCharsAndLinesToCell()
        self.__processCells()
        return (self.effectiveFrom, self.__getResult())
Example #24
def process_pdf(rsrcmgr, device, fp, pagenums=None, maxpages=100, password=''):
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the document password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize(password)
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    pages = dict(enumerate(doc.get_pages()))
    for num, page in pages.iteritems():
        if pagenums and (num not in pagenums):
            continue
        interpreter.process_page(page)
        if maxpages and maxpages <= num + 1:
            break
    return pages
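Because process_pdf above takes an externally created resource manager and device, the caller wires those up first. A hedged sketch using a TextConverter writing into a StringIO buffer (the path is a placeholder, and the imports are assumed to be the same ones the other examples use):

# Hypothetical usage; 'input.pdf' is a placeholder path.
rsrcmgr = PDFResourceManager()
outbuf = StringIO()
device = TextConverter(rsrcmgr, outbuf, codec='utf-8', laparams=LAParams())
with open('input.pdf', 'rb') as fp:
    process_pdf(rsrcmgr, device, fp, pagenums=[0, 1], maxpages=10)
device.close()
print(outbuf.getvalue())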
Example #25
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    (PDFMiner is more low-level than most alternatives, but a pretty good tool for reading PDFs.)

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf

    """
    # initializing objects
    res_manager = PDFResourceManager()
    strio = StringIO()
    lps = LAParams()
    device = TextConverter(res_manager, strio, codec='utf-8', laparams=lps)
    interpreter = PDFPageInterpreter(res_manager, device)
    # opening a pdf file with 'rb' mode for reading binary files
    pdf_file = file(pdf, 'rb')
    for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                  caching=True, check_extractable=True):
        interpreter.process_page(page)
    # finishing up
    pdf_file.close()
    device.close()
    text = strio.getvalue()
    strio.close()
    return text
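pdf_read above is self-contained, so calling it is a single line; the path is a placeholder:

# Hypothetical usage; 'invoice.pdf' is a placeholder path.
text = pdf_read('invoice.pdf')
print(len(text.split()), 'words extracted')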
Example #26
def extract_pdf(path, languages=None):
    """ Extract content from a PDF file. This will attempt to use PyPDF2
    to extract textual content first. If none is found, it'll send the file
    through OCR. """
    with open(path, 'rb') as fh:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        parser = PDFParser(fh)
        doc = PDFDocument(parser, '')
        result = {'pages': []}
        if len(doc.info):
            for k, v in doc.info[-1].items():
                k = k.lower().strip()
                if k != 'pages':
                    result[k] = safe_text(v)

        if not doc.is_extractable:
            log.warning("PDF not extractable: %s", path)
            return result

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            text = _convert_page(layout, languages)
            result['pages'].append(text)
        device.close()
        return result
Example #27
    def extract_text(self):
        pdf_data = file(self.local_file, 'rb').read()
        pdf_stream = io.BytesIO(pdf_data)
        laparams = LAParams()
        resource_manager = PDFResourceManager(caching=True)
        output_type = 'text'
        codec = 'utf-8'
        output_stream = io.BytesIO()
        pagenos = set()

        device = TextConverter(
            resource_manager,
            output_stream,
            codec=codec,
            laparams=laparams,
        )

        interpreter = PDFPageInterpreter(
            resource_manager,
            device,
        )

        pages = PDFPage.get_pages(
            pdf_stream,
            pagenos,
            maxpages=0,
            caching=True,
            check_extractable=True,
        )

        for page in pages:
            interpreter.process_page(page)

        self.text = output_stream.getvalue().decode('utf8')
Example #28
def parse_pdf(pdf_url):

    remote_file = urllib.request.urlopen(pdf_url).read()
    memory_file = io.BytesIO(remote_file)
    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # Warning: this sometimes fails here, possibly due to a malformed PDF
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for pageIdx, page in enumerate(doc.get_pages()):
        ret.append([])
        interpreter.process_page(page)
        layout = device.get_result()
        for idx, lt_obj in enumerate(layout):
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                if len(lt_obj.get_text().strip()) > 0:
                    ret[pageIdx].append((lt_obj.get_text().splitlines()))
    return ret
Example #29
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """

    resource_manager = PDFResourceManager()
    return_string = StringIO()
    la_params = LAParams()
    device = TextConverter(resource_manager, return_string, codec='utf-8', laparams=la_params)
    fp = file(pdf_filename, 'rb')
    interpreter = PDFPageInterpreter(resource_manager, device)
    page_nos = set()

    for page in PDFPage.get_pages(fp, page_nos):
        interpreter.process_page(page)
    fp.close()

    device.close()
    extracted_text = return_string.getvalue()
    return_string.close()

    return extracted_text
Example #30
def pdf_from_url_to_txt(url, maxpages=0):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    # Open the url provided as an argument to the function and read the content
    f = urllib2.urlopen(urllib2.Request(url)).read()
    # Cast to StringIO object
    fp = StringIO(f)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    caching = True
    pagenos = set()
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        interpreter.process_page(page)
    fp.close()
    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string
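Usage of the URL-based converter above is a single call; the URL is a placeholder, and maxpages limits how many pages are parsed:

# Hypothetical usage; the URL is a placeholder.
text = pdf_from_url_to_txt('http://example.com/sample.pdf', maxpages=2)
print(text)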
Example #31
def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    :param pdf_path: path to PDF file to be extracted (remote or local)
    :return: iterator of string of extracted text
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            try:
                for page in PDFPage.get_pages(
                        fh,
                        caching=True,
                        check_extractable=True
                ):
                    resource_manager = PDFResourceManager()
                    fake_file_handle = io.StringIO()
                    converter = TextConverter(
                        resource_manager,
                        fake_file_handle,
                        codec='utf-8',
                        laparams=LAParams()
                    )
                    page_interpreter = PDFPageInterpreter(
                        resource_manager,
                        converter
                    )
                    page_interpreter.process_page(page)

                    text = fake_file_handle.getvalue()
                    yield text

                    # close open handles
                    converter.close()
                    fake_file_handle.close()
            except PDFSyntaxError:
                return
    else:
        # extract text from remote pdf file
        try:
            for page in PDFPage.get_pages(
                    pdf_path,
                    caching=True,
                    check_extractable=True
            ):
                resource_manager = PDFResourceManager()
                fake_file_handle = io.StringIO()
                converter = TextConverter(
                    resource_manager,
                    fake_file_handle,
                    codec='utf-8',
                    laparams=LAParams()
                )
                page_interpreter = PDFPageInterpreter(
                    resource_manager,
                    converter
                )
                page_interpreter.process_page(page)

                text = fake_file_handle.getvalue()
                yield text

                # close open handles
                converter.close()
                fake_file_handle.close()
        except PDFSyntaxError:
            return
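Since extract_text_from_pdf above is a generator that yields one string per page, callers iterate over it (or join the pieces); the path is a placeholder:

# Hypothetical usage; 'thesis.pdf' is a placeholder path.
for page_number, page_text in enumerate(extract_text_from_pdf('thesis.pdf'), start=1):
    print('--- page', page_number, '---')
    print(page_text)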
Example #32
def mine_area(filename):
    """
    use pdfminer to get the valid area of each page.
    all results are relative position!
    """

    pageboxlist = []

    # Open the PDF file
    with open(filename, 'rb') as fp:
        # Create a PDF parser object for the document
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure
        # Supply a password for initialization; omit the argument if there is none
        #document = PDFDocument(parser, password)
        document = PDFDocument(parser)
        # Check whether the document allows text extraction
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object to store shared resources
        # caching=False disables caching
        rsrcmgr = PDFResourceManager(caching=False)
        # Set the layout analysis parameters
        laparams = LAParams()
        # Create a PDF page aggregator object
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF page interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page in the document

        # doc.get_pages() gets the page list (old API)
        # for i, page in enumerate(document.get_pages()):
        # PDFPage.create_pages(document) is another way to get the page list
        # Loop over the pages, processing one page at a time
        count = 0
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            # Receive the LTPage object for the page
            layout = device.get_result()
            # layout is an LTPage object holding the objects parsed from this page,
            # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.
            boxlist = []
            for item in layout:
                if count >= 3:
                    break
                box = item.bbox
                boxlist.append(box)

                if isinstance(item, LTTextBox) or isinstance(item, LTTextLine):
                    print('text:{}'.format(item))
                    print(item.height)
                    print(item.get_text())
                    count += 1
                elif isinstance(item, LTImage):
                    print('image:{}'.format(item))
                elif isinstance(item, LTFigure):
                    print('figure:{}'.format(item))
                elif isinstance(item, LTAnno):
                    print('anno:{}'.format(item))
                elif isinstance(item, LTChar):
                    print('char:{}'.format(item))
                elif isinstance(item, LTLine):
                    print('line:{}'.format(item))
                elif isinstance(item, LTRect):
                    print('rect:{}'.format(item))
                elif isinstance(item, LTCurve):
                    print('curve:{}'.format(item))

            pageboxlist.append(boxlist)
            # for x in layout:
            #     # if x is a horizontal text object
            #     if (isinstance(x, LTTextBoxHorizontal)):
            #         # text=re.sub(replace,'',x.get_text())
            #         text = x.get_text()
            #         if len(text) != 0:
            #             print text

            break

    res = []
    for boxlist in pageboxlist:
        tmp = get_max_box(boxlist)
        res.append(tmp)
    return res
Example #33
class PdfParser(object):
    ''' basic CLI tool to extract info from a pdf,
        based on PDFMiner
        https://github.com/pdfminer/pdfminer.six '''
    ''' instantiate for given page or default for all page layouts '''
    def __init__(self, fp, pagenr=None):
        parser = PDFParser(fp)
        self.doc = PDFDocument(parser)
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        pages = PDFPage.create_pages(self.doc)
        self.nrpages = 0
        self.pagelayouts = []
        # count all pages, but only store requested pagelayout (or all if pagenr==None)
        for p in pages:
            self.nrpages += 1
            # pagenr starts at 1, index in islice at 0
            if not pagenr or (pagenr and (pagenr == self.nrpages)):
                self.interpreter.process_page(p)
                layout = self.device.get_result()
                self.pagelayouts.append(layout)

    def nrofpages(self):
        return self.nrpages

    def getdocinfo(self):
        return self.doc.info[0]

    ''' GENERATOR for all LTTextLineHorizontal objects in all pagelayouts '''

    def txtlinegenerator(self):
        for pl in self.pagelayouts:
            for o in self.__txtlinegenerator_recursive(pl):
                yield o

    ''' actual recursive generator behind txtlinegenerator '''

    def __txtlinegenerator_recursive(self, obj):
        for o in obj:
            if isinstance(o, LTTextLineHorizontal):
                yield o
            else:
                try:
                    iterator = iter(o)
                except TypeError:
                    # not iterable
                    pass
                else:
                    yield from self.__txtlinegenerator_recursive(o)
        return

    ''' return all text objects where given search string is found '''

    def searchstr(self, searchstring):
        searchresult = []
        gen = self.txtlinegenerator()
        for txtboxobject in gen:
            if searchstring in txtboxobject.get_text():
                searchresult.append(txtboxobject)
        return searchresult

    ''' search all text objects within y0 in maxerr range from given yval '''

    def searchy(self, yval, maxerr):
        miny = yval - maxerr
        maxy = yval + maxerr
        searchresult = []
        gen = self.txtlinegenerator()
        for txtboxobject in gen:
            object_y0 = txtboxobject.y0
            if object_y0 > miny and object_y0 < maxy:
                searchresult.append(txtboxobject)
        return searchresult
Example #34
for bl in blacklisted:
    myRoutes[u"excluded_lines"].append(bl)

for i in getLines():
    pdf = download_pdf(i)
    if pdf == None:
        continue

    # Start pdfminer
    parser = PDFParser(io.BytesIO(pdf))
    document = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
        layout = device.get_result()
        fieldNr = 0
        ref = u""
        name = u""
        origin = u""
        destination = u""
        wd_ida = []
        wd_volta = []
        sa_ida = []
        sa_volta = []
        su_ida = []
        su_volta = []
        for object in layout:
Example #35
def get_contents(filename):
  # found this online
  # outputs a single list of strings from a pdf
  rsrcmgr = PDFResourceManager()
  laparams = LAParams()
  device = PDFPageAggregator(rsrcmgr, laparams=laparams)
  interpreter = PDFPageInterpreter(rsrcmgr, device)
  pdf_file_instance = open(filename, 'rb')
  total_text = []
  
  # Boxes to look for info
  #xmin,xmax, ymin,ymax
  xy = [
    (325,335, 750,805), # commande_n
    (190,210, 550,564), # reference_n
    (31,32, 300,506), # tasks
    (500,515, 300,506), # prices
    (500,550, 145,216) # total ht, total ht net, total tva, net a payer
   ]
  #xy = [
  #  (325,335, 800,805), # commande_n
  #  (190,210, 560,564), # reference_n
  #  (31,32, 300,506), # tasks
  #  (509,515, 300,506), # prices
  #  (523,535, 145,216) # total ht, total ht net, total tva, net a payer
  # ]
   
  res = [' ']*len(xy)

  for page in PDFPage.get_pages(pdf_file_instance, maxpages=1):
      interpreter.process_page(page)
      layout = device.get_result()
      for lobj in layout:
        if isinstance(lobj, LTTextBox):
          x, y, text = lobj.bbox[0], lobj.bbox[3], lobj.get_text()
          #print('At %r is text: %s' % ((x, y), text))  
          if any(a <= x <= b and c <=y <=d for a,b,c,d in xy):
                    
            for idx, (a,b,c,d) in enumerate(xy):
              if a <= x <= b and c <=y <=d:
                res[idx] = '\n'.join([res[idx],text]).strip()
              #print('%r text: %s' % ((x, y), text))
                
            #print('At %r is text: %s' % ((x, y), text))
  pdf_file_instance.close()
           
  try:
    commande_n = res[0].split('\n')[-1]
  except:
    commande_n = ''
  try:
    reference_n = res[1]
  except:
    reference_n = ''
  try:
    tasks = res[2]
  except:
    tasks = ''
  try:
    prices = res[3]
  except:
    prices = ''
  try:
    total_ht,_,total_tva,total_ttc = res[4].split('\n')
    #note need to deal with total_ht.replace(u'\xa0', u'').
    #this should be done in the update_facture function

  except:
    total_ht,total_tva,total_ttc = '','',''

  
  return(commande_n, reference_n, total_ht, total_tva, total_ttc, tasks, prices)
Example #36
def parse(page):
    rsrcmgr = PDFResourceManager()
    laparams = LAParams(char_margin=4
                       ,word_margin=6
                       ,boxes_flow=1.5
                       ,line_margin=0.4)
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    def parseit(obj):
        res = [""]
        if isinstance(obj, LTChar):
            if isinstance(res[-1], str):
                res[-1] += obj.get_text()
            else:
                res.append(obj.get_text())
        elif isinstance(obj, LTTextBox) or\
                isinstance(obj, LTTextLine):
            res.append(obj.get_text())

        elif isinstance(obj, LTFigure) or isinstance(obj, LTPage):
            for subobj in obj:
                subpar = parseit(subobj)
                if isinstance(res[-1], str) and isinstance(subpar[0], str):
                    res = res[:-1] + [res[-1]+subpar[0]] + subpar[1:]
                else:
                    res = res + subpar
        elif isinstance(obj, LTImage):
            rawdata = obj.stream.get_rawdata()
            res += [("Image", obj.bbox, rawdata)]
        return res

    interpreter.process_page(page)
    layout = device.get_result()
    parsed = parseit(layout)
    parsed = list(filter(None, parsed))

    #return parsed
    itemcounter = 2

    res = {"items": list()
         , "nonitems": list()}
    images = list(filter(lambda item: item[0] == "Image", parsed))


    # treat first image as non-item
    if len(images) > 0:
        for img in images[0:1]:
            res["nonitems"].append({"image": img[2]
                                  , "name": itemcounter
                                  , "description": str(img[1])})
        itemcounter += 1

    # treat others as item-images
    if len(images) > 1:
        for img in images[1:]:
            res["items"].append({"image": img[2]
                               , "itemonpage": itemcounter
                               , "other": str(img[1])})
            itemcounter += 1


    res["items"].append({"itemonpage": 1, "other": "something"})
    return res
Example #37
def read_pdf(file,
             pages=[],
             laycntrl={},
             codec='utf-8',
             strip_control=False,
             password='',
             caching=True,
             maxpages=0,
             rotation=0,
             image_dir=''):
    """ Reads a file in pdf format.

    Use **pdfminer** to read a pdf-file into **Python**.

    Args:
        file (str): A string providing the location of the file.
        pages (list[int]): A list giving the numbers of the
            pages to be extracted, by default (default is `[]`) all
            pages are extracted.
        codec (str): A string giving the codec (default is 'utf-8').
        strip_control (bool): (default is `False`) not used in XML2Converter.
        password (str): A string giving the password (default is '').
        caching (bool): (default is `True`)
        maxpages (int): (default is `0`)
        rotation (int): (default is `0`)
        image_dir (str): (default is `''`)

    Returns:
        PdfDoc: An object of type `PdfDoc`.

    """
    if not (os.path.splitext(file)[1] == ".pdf"):
        raise IOError("PDF-file expected got '%s'!" %
                      (os.path.splitext(file)[1], ))

    if not os.path.exists(file):
        raise IOError("Could not find PDF-file '%s'!" % (file, ))

    if len(image_dir) == 0:
        imagewriter = None
    else:
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        imagewriter = ImageWriter(image_dir)

    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams(**laycntrl)

    device = XML2Converter(rsrcmgr,
                           codec=codec,
                           laparams=laparams,
                           imagewriter=imagewriter,
                           stripcontrol=strip_control)

    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(file, 'rb') as con:
        if (pages is None) or (len(pages) == 0):
            pages = [i[0] for i in enumerate(PDFPage.get_pages(con))]

        for page in PDFPage.get_pages(con,
                                      pages,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)

    return PdfDoc(device.doc)
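A hedged sketch of calling read_pdf above; XML2Converter and PdfDoc come from the surrounding module, so only the file path and the laycntrl values below are placeholders:

# Hypothetical usage; 'manual.pdf' and the laycntrl values are placeholders.
doc = read_pdf('manual.pdf',
               pages=[0, 1],                    # first two pages only
               laycntrl={'char_margin': 1.0},   # forwarded to LAParams(**laycntrl)
               image_dir='extracted_images')    # images written via ImageWriter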
Example #38
def main(argv):
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return usage()
    for fname in args:
        fp = file(fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()
    return
Example #39
def csa_pdf(document):
    chaines = {  2752:'FRANCE INTER',
                    2746:'FRANCE INTER',
                    3054:'FRANCE INTER',
                    2845:'FRANCE INFO',
                    2314:'FRANCE CULTURE',
                    3116:'RADIO CLASSIQUE',
                    2152:'BFM',
                    3249:'RMC',
                    2481:'EUROPE 1',
                    2897:'RTL',
                    1575:'TF1',
                    6084:'FRANCE 2',
                    5694:'FRANCE 3',
                    1563:'FRANCE 3',
                    8237:'CANAL+',
                    18914:'FRANCE 5',
                    7541:'M6',
                    1857:'C8',
                    1818:'C8',
                    1931:'TMC',
                    2301:'BFMTV',
                    1872:'CNEWS',
                    1833:'LCI',
                    2329:'FRANCEINFO',
                    5937:'FRANCEINFO',
                    2460:'FRANCEINFO'}
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.pdfpage import PDFPage
    from pdfminer.layout import LTTextBoxHorizontal,LTFigure,LTImage
    itvs = []

    #Create resource manager
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    chaine = ""
    info = False
    for page in PDFPage.get_pages(document):
        #print "page-----------"
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        elts = dict(noms=[],orgs=[],durees=[])
        go = False
        col =""
        for element in layout:
            #print element
            if isinstance(element, LTFigure):
                for e in element:
                    if isinstance(e, LTImage):
                        if e.stream:
                            data = e.stream.get_rawdata()
                            chaine = chaines[len(data)]
                            #print len(data)
            if isinstance(element, LTTextBoxHorizontal):
                x = element.x1
                txt = element.get_text().strip()
                if 'TELEVISION' in txt or 'INTERVENTION' in txt or 'PROGRAMMES' in txt:
                    go = False
                if 'TELEVISIONS (AUTRES' in txt:
                    info = False
                if 'TELEVISIONS -' in txt:
                    info = True
                #print txt
                if txt[0:3]=='Du ':
                    date = (int(txt[9:13]),int(txt[6:8]))
                if txt in ['MAG','JT','PROG']:
                    typ = txt
                if go:
                    if x<seuils[0]:
                        col = 'noms'
                    elif x<seuils[1]:
                        col = 'orgs'
                    else:
                        col = 'durees'
                        txt = int(txt[0:2])*3600+int(txt[3:5])*60+int(txt[6:8])
                        #print "-->",txt
                    #print x,txt,col
                    elts[col].append((txt,date,typ))
                if txt==u'Dur\xe9e':
                    seuils = [250 if info else 220,500]
                    go = True
                elif txt==u'DUREE':
                    seuils = [260,480]
                    go = True

        if (len(elts['noms'])==len(elts['orgs']) or len(elts['noms'])==len(elts['durees'])):
            for i in range(len(elts['noms'])):
                itvs.append(dict(chaine=chaine,nom=elts['noms'][i][0],org=elts['orgs'][i][0],duree=elts['durees'][i][0],
                                 date=elts['durees'][i][1],
                                 type=elts['durees'][i][2]
                                 )
                            )
        else:
            print len(elts['noms']), len(elts['orgs']), len(elts['durees'])
            return "boom"
    return itvs
Example #40
def read_IDMP(fpath, **kwargs):
    '''
    This function will read an IDMP and return the DIDs that are expected to be
    obtained for the event.  There is no known way to obtain which TC these belong
    to; however, that will be addressed manually by some user.

    Input:
        fpath - The absolute file path to the IDMP pdf

    Kwargs:
        get_all - True: Returns all the DIDs
                  False:  Returns the DIDs that follow our criteria - no "pre" no "ee"
                  TODO-UPDATE THESE BECAUSE I CANNOT RECALL ALL OF THEM
        get_mapping - True: Returns the **** mapping ONLY
                      False: Returns the list of DIDs ONLY
    Returns:
        Returns a list of DIDs expected for the event
    '''

    get_all = kwargs.get('get_all', True)
    get_mapping = kwargs.get('get_mapping', False)
    lines = []
    DIDs = []
    mapdict = {'OSF': {}, 'TPY': {}, 'THAAD': {}}
    ladd = lines.append
    if os.path.isfile(fpath) and os.path.splitext(
            fpath)[-1] == '.pdf' and 'IDMP' in fpath:
        file_content = open(fpath, 'rb')

        parser = PDFParser(file_content)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        #I changed the following 2 parameters to get rid of white spaces inside words:
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        first_data_items = 0
        last_data_items = 0
        # Process each page contained in the document.
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if isinstance(lt_obj, LTTextBox) or isinstance(
                        lt_obj, LTTextLine):
                    newlines = lt_obj.get_text().splitlines()
                    for i, l in enumerate(newlines):
                        if l.lower().startswith('(u) index of data items'
                                                ) and first_data_items == 0:
                            first_data_items = i + len(lines) - 1
                        if l.lower().startswith('1.0 (u) introduction') and first_data_items\
                            and len(lines) - 1 + i > first_data_items:
                            last_data_items = i + len(lines) - 1
                        ladd(l)
            if first_data_items and last_data_items:
                break
        if first_data_items and last_data_items:
            RELEVANT_TEXT = lines[first_data_items:last_data_items]
            #This part parses the DID names only
            DIDs = list(map(lambda x: x.split(' ')[0], RELEVANT_TEXT))
            if get_all:
                DIDs = [d for d in DIDs if '-' in d and 'gti' not in d.lower()]
            else:
                DIDs = [
                    d for d in DIDs if '-' in d and 'gti' not in d.lower() and
                    not '-pre-' in d.lower() and not d.lower().startswith('ee')
                ]
            #This part maps the first two numbers after the '-' in the DID to the numbers in the ()
            for line in lines:
                DID = line.split(' ')[0]
                if '-' in DID and 'gti' not in DID.lower():
                    if '(' in line and ')...' in line and 'osf-' in DID.lower(
                    ) and '-pre-' not in DID.lower():
                        value = line.split('(')[-1].split(')')[0]
                        if value[-1] in CapAlphabet:
                            value = value[:-1]
                        key = DID.split('-')[-1][:-2]
                        mapdict['OSF'][key] = value
                    if '(' in line and ')...' in line and (
                            'tpy2-' in DID.lower() or 'typ2-'
                            in DID.lower()) and '-pre-' not in DID.lower():
                        value = line.split('(')[-1].split(')')[0]
                        key = DID.split('-')[-1]
                        if key[-1] in CapAlphabet:
                            key = key[-1]
                        mapdict['TPY'][key] = value
                    if '(' in line and ')...' in line and 'thaad' in DID.lower(
                    ) and '-pre-' not in DID.lower():
                        value = line.split('(')[-1].split(')')[0]
                        value = 'String' + value.split('String')[-1]
                        key = DID.split('-')[-1][:2]
                        mapdict['THAAD'][key] = value
        else:
            print(
                'Could not find the relevant information needed to parse. Returning []'
            )
        file_content.close()
    else:
        print('''The IDMP path given failed one or more of the following:
              1: not a valid file
              2: not a pdf
              3: the file does not contain the string 'IDMP'.''')
    if not get_mapping:
        return DIDs
    else:
        return mapdict
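A minimal call sketch for read_IDMP above; the IDMP path is an assumption, not taken from the source.

# Hypothetical usage; '/data/IDMP_event.pdf' is an assumed path (it must contain 'IDMP').
dids = read_IDMP('/data/IDMP_event.pdf', get_all=False)
mapping = read_IDMP('/data/IDMP_event.pdf', get_mapping=True)
print(len(dids), sorted(mapping.keys()))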
Example #41
0
class GetPic:
    def __init__(self, filename, password=''):
        """
        初始化
        :param filename: pdf路径
        :param password: 密码
        """
        # Open the pdf and keep the file handle on the instance; PDFParser reads
        # from it lazily, so it must stay open while pages are processed.
        self.fp = open(filename, 'rb')
        # Create the document parser
        self.parser = PDFParser(self.fp)
        # Create the document
        self.doc = PDFDocument()
        # Connect the document and the parser
        self.parser.set_document(self.doc)
        self.doc.set_parser(self.parser)
        # Initialize with the password (empty string if none)
        self.doc.initialize(password)
        # Check whether the document allows text extraction; if not, raise
        if not self.doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        else:
            # Create a PDF resource manager to manage shared resources
            self.resource_manager = PDFResourceManager()
            # Create a PDF device (page aggregator) object
            self.laparams = LAParams()
            self.device = PDFPageAggregator(self.resource_manager,
                                            laparams=self.laparams)
            # Create a PDF interpreter object
            self.interpreter = PDFPageInterpreter(self.resource_manager,
                                                  self.device)
            # list of the pdf's page objects
            self.doc_pdfs = list(self.doc.get_pages())
        # Open the PDF with fitz as well, giving an iterable of page objects for image rendering
        self.doc_pics = fitz.open(filename)

    def to_pic(self, doc, zoom, pg, pic_path):
        """
        将单页pdf转换为pic
        :param doc: 图片的doc对象
        :param zoom: 图片缩放比例, type int, 数值越大分辨率越高
        :param pg: 对象在doc_pics中的索引
        :param pic_path: 图片保存路径
        :return: 图片的路径
        """
        rotate = int(0)
        trans = fitz.Matrix(zoom, zoom).preRotate(rotate)
        pm = doc.getPixmap(matrix=trans, alpha=False)
        path = os.path.join(pic_path, str(pg)) + '.png'
        pm.writePNG(path)
        return path

    def get_pic_loc(self, doc, tmp=''):
        """
        获取单页中图片的位置,输出文本
        :param doc: pdf的doc对象
        :return: 返回一个list, 元素为图片名称和上下y坐标元组组成的tuple. 当前页的尺寸
        """
        self.interpreter.process_page(doc)
        layout = self.device.get_result()
        # page size, bbox tuple (x0, y0, x1, y1)
        canvas_size = layout.bbox
        # coordinates of figure/table titles (headers)
        loc_top = []
        # coordinates of source lines (footers)
        loc_bottom = []
        # figure names with the y1, y2 coordinates of the region to crop
        loc_named_pic = []
        # iterate over all LT objects on the page
        text_export = ''
        # collected text output

        topNumber = 0
        bottomNumber = 0

        for i in layout:
            # print('reading item', i)
            if hasattr(i, 'get_text'):
                text = i.get_text().strip()
                text_export += text
                # match the keyword patterns (figure/table headers and source lines)
                if re.search(r'^(图表*|表)(\s|\s*\d|\s*[::])', text):
                    loc_top.append((i.bbox, text))
                    topNumber = topNumber + 1
                elif re.search(r'^\n*((来源)|(资料来源)|(数据来源))(\s|[::])', text):
                    bottomNumber = bottomNumber + 1
                    loc_bottom.append((i.bbox, text))

        locname = []

        print('Page scan result: topNumber =', topNumber, '   bottomNumber =',
              bottomNumber)

        i0 = 0
        j0 = 0
        #        size_increase = 10 #

        name = ''
        print(loc_top)
        print(loc_bottom)
        print(len(loc_top), len(loc_bottom))
        # The logic here is a bit messy: align loc_top and loc_bottom by y coordinate to pair each header with its footer

        if len(loc_top) == 1 and len(loc_bottom) == 0:
            try:
                name = locname[0][0][1]
            except:
                name = ''
        elif len(loc_top) > 0 and len(loc_bottom) > 0:
            while i0 <= len(loc_top) - 1 and j0 <= len(loc_bottom) - 1:
                #  print (i0,j0)
                if loc_top[i0][0][1] < loc_bottom[j0][0][
                        1]:  # the footer's y coordinate is above the header's (y increases from bottom to top, 0 upwards)
                    bottom = [(0, loc_bottom[j0][0][1], canvas_size[2],
                               loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                    locname.append([bottom, 1])
                    j0 += 1
                    continue
                is_binglie = 0  # flag: are there two figures side by side in one row?
                try:
                    if abs(loc_top[i0][0][1] -
                           loc_top[i0 + 1][0][1]) < 10:  # the y coordinates are close
                        is_binglie = 1
                except:
                    pass

                if is_binglie == 0:
                    if loc_top[i0][0][1] > loc_bottom[j0][0][
                            1]:  # not side by side: the usual header-above-footer case
                        top = [(0, loc_top[i0][0][1], canvas_size[2],
                                loc_top[i0][0][3]),
                               loc_top[i0][1]]  # (x1,y1,x2,y2)
                        locname.append([top, 0])
                        i0 += 1
                    else:
                        bottom = [(0, loc_bottom[j0][0][1], canvas_size[2],
                                   loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                        locname.append([bottom, 1])
                        j0 += 1

                else:
                    is_binglie_laiyuan = 0
                    try:
                        if abs(loc_bottom[j0][0][1] -
                               loc_bottom[j0 + 1][0][1]) < 10:
                            is_binglie_laiyuan = 2
                        else:
                            is_binglie_laiyuan = 1
                    except:
                        try:
                            loc_bottom[j0][0][1]
                            is_binglie_laiyuan = 1
                        except:
                            is_binglie_laiyuan = 0

                    if is_binglie_laiyuan == 2:
                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0],
                                 loc_top[i0][0][3]), loc_top[i0][1]]
                        locname.append([top1, 0])

                        bottom1 = [
                            (0, loc_bottom[j0][0][1], loc_top[i0 + 1][0][0],
                             loc_bottom[j0][0][3]), loc_bottom[j0][1]
                        ]
                        locname.append([bottom1, 1])

                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1],
                                 canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top2, 0])

                        bottom2 = [(loc_top[i0 + 1][0][0],
                                    loc_bottom[j0 + 1][0][1], canvas_size[2],
                                    loc_bottom[j0 + 1][0][3]),
                                   loc_bottom[j0 + 1][1]]
                        locname.append([bottom2, 1])
                        i0 += 2
                        j0 += 2
                    elif is_binglie_laiyuan == 1:

                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0],
                                 loc_top[i0][0][3]), loc_top[i0][1]]
                        locname.append([top1, 0])

                        bottom1 = [
                            (0, loc_bottom[j0][0][1], loc_top[i0 + 1][0][0],
                             loc_bottom[j0][0][3]), loc_bottom[j0][1]
                        ]
                        locname.append([bottom1, 1])

                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1],
                                 canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top2, 0])

                        bottom2 = [(loc_top[i0 + 1][0][0],
                                    loc_bottom[j0][0][1], canvas_size[2],
                                    loc_bottom[j0][0][3]), loc_bottom[j0][1]]
                        locname.append([bottom2, 1])
                        i0 += 2
                        j0 += 1
                    else:
                        top1 = [(0, loc_top[i0][0][1], loc_top[i0 + 1][0][0],
                                 loc_top[i0][0][3]), loc_top[i0][1]]
                        top2 = [(loc_top[i0 + 1][0][0], loc_top[i0 + 1][0][1],
                                 canvas_size[2], loc_top[i0 + 1][0][3]),
                                loc_top[i0 + 1][1]]
                        locname.append([top1, 0])
                        locname.append([top2, 0])
                        i0 += 2

            if i0 == len(loc_top):
                while j0 <= len(loc_bottom) - 1:
                    locname.append([loc_bottom[j0], 1])
                    j0 += 1
            if j0 == len(loc_bottom):
                while i0 <= len(loc_top) - 1:
                    locname.append([loc_top[i0], 0])
                    i0 += 1

            if i0 == len(loc_top):
                while j0 <= len(loc_bottom) - 1:
                    locname.append([loc_bottom[j0], 1])
                    j0 += 1
            if j0 == len(loc_bottom):
                while i0 <= len(loc_top) - 1:
                    locname.append([loc_top[i0], 0])
                    i0 += 1
            k = 0
            loc_named_pic = []
            #  print(locname)
            '''
            Convert locname into loc_named_pic
            '''
            while k <= len(locname) - 1:
                #   print(k)
                if locname[0][1] == 1:  # the first entry is a footer: x1, x2 span the pdf width, y1 is the pdf top, y2 is the footer coordinate
                    x1 = canvas_size[0]
                    x2 = canvas_size[2]
                    y1 = canvas_size[3]
                    y2 = locname[0][0][0][3]
                    name = tmp
                    loc_named_pic.append([name, (x1, y1, x2, y2)])
                    name = ''
                    k += 1

                elif locname[k][1] == 0:  # found a header
                    name += locname[k][0][1]
                    if k + 1 < len(locname):  # k is a header row
                        ii = k + 1
                        while ii < len(locname):  ## ii searches for the footer
                            if locname[ii][1] == 0:  ## ii is not a footer
                                name += ' ' + locname[ii][0][1]
                                ii += 1
                            else:  ## ii is a footer
                                x1 = locname[k][0][0][0]
                                x2 = locname[k][0][0][2]
                                y1 = locname[k][0][0][3]
                                y2 = locname[ii][0][0][1]
                                loc_named_pic.append([name, (x1, y1, x2, y2)])
                                name = ''
                                k = ii + 1
                                ii += 1
                                continue
                        k += 1
                    else:
                        k += 1
                else:
                    k += 1

        tmp = name

        return loc_named_pic, canvas_size, tmp, topNumber, bottomNumber

    def get_crops(self, pic_path, canvas_size, position, cropped_pic_name,
                  cropped_pic_path):
        """
        按给定位置截取图片
        :param pic_path: 被截取的图片的路径
        :param canvas_size: 图片为pdf时的尺寸, tuple, (0, 0, width, height)
        :param position: 要截取的位置, tuple, (y1, y2)
        :param cropped_pic_name: 截取的图片名称
        :param cropped_pic_path: 截取的图片保存路径
        :return:
        """
        img = Image.open(pic_path)
        # size of the rendered picture, tuple (width, height)
        pic_size = img.size
        # padding added around the cropped region

        count = 0
        size_increase = 10
        ## not fully reworked yet
        x1 = max(position[0] - size_increase,
                 0) * (pic_size[0] / canvas_size[2])
        x2 = min(position[2] + size_increase,
                 canvas_size[2]) * (pic_size[0] / canvas_size[2])
        #  y1 = pic_size[1] * (1 - (position[0] + size_increase)/canvas_size[3])
        #  y2 = pic_size[1] * (1 - (position[1] - size_increase)/canvas_size[3])
        y1 = max(0, (1 - (position[1] + size_increase) / canvas_size[3]) *
                 pic_size[1])
        y2 = min(pic_size[1],
                 (1 - (position[3] - size_increase) / canvas_size[3]) *
                 pic_size[1])

        #  print(x1,x2,y1,y2)
        cropped_img = img.crop((x1, y1, x2, y2))
        cropped_pic_name = cropped_pic_name + str(count)
        # strip characters that are unwanted or not allowed in file names
        for ch in ('/', '  ', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\r', '\f'):
            cropped_pic_name = cropped_pic_name.replace(ch, '')
        if len(cropped_pic_name) > 50:
            cropped_pic_name = cropped_pic_name[0:49]
        count += 1
        rand0 = str(random.randint(10000000, 99999999))
        text0 = []
        log0 = []
        try:
            path = os.path.join(cropped_pic_path, rand0) + '.png'
            cropped_img.save(path)
            text0 = cropped_pic_name + '|' + rand0 + '|' + str(x1) + '|' + str(
                x2) + '|' + str(y1) + '|' + str(y2)
            # print(text0)
            return text0, log0

        # print('successfully cropped picture:', cropped_pic_name)
        except:
            log0 = cropped_pic_path + cropped_pic_name
            print('failed to crop', cropped_pic_name)
            return text0, log0
            # pass

    def main(self, pic_path, cropped_pic_path, pgn=None, tmp=''):
        """
        主函数
        :param pic_path: 被截取的图片路径
        :param cropped_pic_path: 图片的截图的保存路径
        :param pgn: 指定获取截图的对象的索引
        :return:
        """
        text_total = []
        log_total = []
        topNumber = 0
        bottomNumber = 0
        if pgn is not None:
            # get the page objects for the current page
            doc_pdf = self.doc_pdfs[pgn]
            doc_pic = self.doc_pics[pgn]
            # render the current page to a PNG; the return value is the image path
            path = self.to_pic(doc_pic, 2, pgn, pic_path)
            loc_name_pic, canvas_size, tmp, topNumber, bottomNumber = self.get_pic_loc(
                doc_pdf, tmp=tmp)

            print(pgn)

            if loc_name_pic:
                for i in loc_name_pic:
                    position = i[1]
                    cropped_pic_name = re.sub('/', '_', i[0])
                    text1, log1 = self.get_crops(path, canvas_size, position,
                                                 cropped_pic_name,
                                                 cropped_pic_path)
                    if text1:
                        text1 = text1 + '|' + str(pgn)
                        text_total.append(text1)

                        ## write to file

                    if log1:
                        log1 = log1 + '|error on page ' + str(pgn)
                        log_total.append(log1)
        return tmp, text_total, log_total, topNumber, bottomNumber
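A hedged driver sketch for the GetPic class above; the file and directory names are assumptions, and it simply walks every page through main().

import os

# Hypothetical usage; 'report.pdf', 'pages' and 'crops' are assumed names.
extractor = GetPic('report.pdf')
os.makedirs('pages', exist_ok=True)
os.makedirs('crops', exist_ok=True)
carry_name = ''
for pgn in range(len(extractor.doc_pdfs)):
    carry_name, texts, logs, n_top, n_bottom = extractor.main(
        'pages', 'crops', pgn=pgn, tmp=carry_name)
    for line in texts:
        print(line)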
Example #42
0
#Connect the interpreter and the document object
parser.set_document(doc)
doc.set_parser(parser)

#Initialize the document
doc.initialize("")

#Create a PDF resource manager
resoure = PDFResourceManager()

#Layout parameter analyzer
laparam = LAParams()

#Create the aggregator
device = PDFPageAggregator(resoure, laparams=laparam)

#Create the page interpreter
interpreter = PDFPageInterpreter(resoure, device)

#Get the collection of pages from the document object
for page in doc.get_pages():
    #Read each page with the page interpreter
    interpreter.process_page(page)

    #Get the content from the aggregator
    layout = device.get_result()

    for out in layout:
        if hasattr(out, "get_text"):
            print(out.get_text())
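The snippet above references parser and doc without creating them; for completeness, a minimal setup sketch it assumes might look roughly like this (legacy pdfminer API, placeholder file name; import paths vary across pdfminer versions).

# Assumed setup omitted from the snippet above; in old pdfminer releases
# PDFParser and PDFDocument both live in pdfminer.pdfparser.
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams

fp = open('sample.pdf', 'rb')  # 'sample.pdf' is a placeholder
parser = PDFParser(fp)
doc = PDFDocument()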
Example #43
0
class SvSpecParser():
    FONT_TRANSLATION = {  # "HEFBHG+TimesNewRomanPS-ItalicMT": "it",
        # "HEFBAE+TimesNewRomanPS-BoldMT": "b",
        'BVXWSQ+CourierNew,Bold': 'b',
        'BHDFJL+TimesNewRomanPSMT': None,
        'WTCCEL+TimesNewRoman,Italic': None,
        None: None
    }

    def __init__(self, ofile):
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        self.device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
        self.last_font = None
        self.in_rule = False
        self.font_print_pending = False
        self.ofile = ofile
        self.first_rule = True

    def parse_page(self, page):
        self.interpreter.process_page(page)
        layout = self.device.get_result()
        self.parse_obj(layout._objs)

    def collect_lines(self, o):
        if isinstance(o, LTTextBox):
            lines = [_o for _o in o._objs if isinstance(_o, LTTextLine)]
            yield from lines
        elif isinstance(o, LTFigure):
            # recurse into each child of the figure (collect_lines expects a single object)
            for _o in o._objs:
                yield from self.collect_lines(_o)

        return

    def parse_obj(self, objs):
        font_translation = self.FONT_TRANSLATION

        f = None
        tmp_lines = []
        for o in objs:
            tmp_lines.extend(self.collect_lines(o))

        tmp_lines.sort(key=lambda o: o.y0, reverse=True)
        tmp_lines = tmp_lines[2:-3]  # cut off header and footer

        for o in tmp_lines:
            text = o.get_text()
            # print(text)

            is_rule_header = "::=" in text
            if is_rule_header or self.in_rule:
                if is_rule_header:
                    if not self.first_rule:
                        self.ofile.write("\n</br>\n")
                    else:
                        self.first_rule = False

                # if is_rule_header and text.startswith("unary_module_path_operator"):
                #     print("----------")
                self.in_rule = True
                if not is_rule_header:
                    if text and o.x0 < 85:
                        self.in_rule = False
                        continue
                    self.ofile.write("    ")

                if text.strip():
                    for c in o._objs:
                        is_char = isinstance(c, LTChar)
                        if is_char:
                            if c.fontname == 'BHDEOM+Arial-BoldMT':
                                self.in_rule = False
                                # title
                                break
                        if (is_char and
                                c.matrix[-1] - o._objs[0].matrix[-1] > 3.5):
                            # sys.stderr.write(c.get_text())
                            # skipping hrefs, which are upper indexes
                            continue
                        if is_char and self.last_font != c.fontname:
                            # this character has a different font; need to propagate it to the output
                            self.font_print_pending = True

                        if c.get_text().isspace() and font_translation[
                                self.last_font] is not None:
                            # print the font-closing tag directly after this word (ignore the trailing whitespace)
                            self.font_print_pending = True
                            self.ofile.write("</%s>" % f)
                            self.last_font = None

                        if self.font_print_pending and not (
                                c.get_text().isspace()):
                            self.font_print_pending = False
                            f = font_translation[self.last_font]
                            if f:
                                self.ofile.write("</%s>" % f)
                            f = font_translation[c.fontname]
                            if f:
                                self.ofile.write("<%s>" % f)

                            self.last_font = c.fontname
                        # if text.startswith("list_of_port_declarations") and c.get_text() == "s":
                        #    print("----------")

                        self.ofile.write(c.get_text())
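A hedged driver sketch for SvSpecParser, assuming the pdfminer.six-style document API and placeholder file names.

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage

# Hypothetical usage; 'sv_spec.pdf' and 'grammar.html' are assumed names.
with open('sv_spec.pdf', 'rb') as fp, open('grammar.html', 'w') as ofile:
    spec_parser = SvSpecParser(ofile)
    doc = PDFDocument(PDFParser(fp))
    for page in PDFPage.create_pages(doc):
        spec_parser.parse_page(page)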
Example #44
0
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar

a=input('enter the file name:\n')
if len(a)<1: a='HB.pdf'
fp=open(a,'rb')
parser=PDFParser(fp)
doc=PDFDocument(parser)
rs=PDFResourceManager()

lapara=LAParams()
device=PDFPageAggregator(rs, laparams=lapara)
inte=PDFPageInterpreter(rs, device)

name=None
ID=None
state=None
phone=None
a=[]

for page in PDFPage.create_pages(doc):
    inte.process_page(page)
    layout=device.get_result()
    count=0
    # pdb.set_trace()
    state_index = None
    phone_index = None
    for x in layout:
Example #45
0
 def parse_page_box(pdf_file_path, line_overlap=0.2, char_margin=0.1,
                    line_margin=0.2, word_margin=0.1, boxes_flow=0.5,
                    detect_vertical=False, all_texts=False):
     """
     创建一个PDF文档分析器
     创建一个PDF文档对象存储文档结构
     检查文件是否允许文本提取
     创建一个PDF资源管理器对象来存储共赏资源
     设定参数进行分析
     创建一个PDF设备对象
     创建一个PDF解释器对象
     处理每一页
     :param pdf_file_path:
     :param line_overlap:
     :param word_margin:
     :param line_margin:
     :param char_margin:
     :param boxes_flow:
     :param detect_vertical:
     :param all_texts:
     :return:
     """
     fp = open(pdf_file_path, 'rb')
     parser = PDFParser(fp)
     document = PDFDocument(parser)
     if not document.is_extractable:
         raise PDFTextExtractionNotAllowed
     else:
         resources_manger = PDFResourceManager()
         la_params = LAParams(line_overlap=line_overlap,
                              detect_vertical=detect_vertical,
                              all_texts=all_texts,
                              word_margin=word_margin,
                              line_margin=line_margin,
                              char_margin=char_margin,
                              boxes_flow=boxes_flow)
         device = PDFPageAggregator(resources_manger, laparams=la_params)
         interpreter = PDFPageInterpreter(resources_manger, device)
         page_no = 0
         page_box_list = list()
         for page in PDFPage.create_pages(document):
             interpreter.process_page(page)
             layout = device.get_result()
             page_box = PDFTools.create_page_box()
             page_box.reset_page_box(start_x=layout.x0, start_y=layout.y0,
                                     end_x=layout.x1, end_y=layout.y1,
                                     page_no=page_no)
             for box in layout:
                 if isinstance(box, LTTextBoxHorizontal):
                     content = box.get_text().strip(u'\n ')
                     if content == u'':
                         continue
                     text_box = PDFTools.create_text_box()
                     font_dict = PDFTools.get_font_dict(box=box)
                     text_box.reset_text_box(start_x=box.x0, start_y=box.y0,
                                             end_x=box.x1, end_y=box.y1,
                                             content=content,
                                             font_dict=font_dict)
                     page_box.add_text_box(text_box=text_box)
             page_box_list.append(page_box)
             page_no += 1
         return page_box_list
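A hedged call sketch for parse_page_box; the path is a placeholder, and the PDFTools helper and layout classes referenced inside the function are assumed to be importable in its module.

# Hypothetical usage; 'sample.pdf' is a placeholder path.
page_boxes = parse_page_box('sample.pdf', char_margin=0.1, line_margin=0.2)
print('pages parsed:', len(page_boxes))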
Example #46
0
def main(argv):
    global Verbose_Flag
    global Use_local_time_for_output_flag
    global testing

    argp = argparse.ArgumentParser(description="extract_pseudo_JSON-from_PDF.py: Extract the pseudo JSON from the end of the thesis PDF file")

    argp.add_argument('-v', '--verbose', required=False,
                      default=False,
                      action="store_true",
                      help="Print lots of output to stdout")

    argp.add_argument('-t', '--testing',
                      default=False,
                      action="store_true",
                      help="execute test code"
                      )

    argp.add_argument('-p', '--pdf',
                      type=str,
                      default="test.pdf",
                      help="read PDF file"
                      )

    argp.add_argument('-j', '--json',
                      type=str,
                      default="calendar_event.json",
                      help="JSON file for extracted calendar event"
                      )

    argp.add_argument('-a', '--acronyms',
                      type=str,
                      default="acronyms.tex",
                      help="acronyms filename"
                      )

    argp.add_argument('-l', '--ligatures',
                      default=False,
                      action="store_true",
                      help="leave ligatures rahter than replace them"
                      )



    args = vars(argp.parse_args(argv))

    Verbose_Flag=args["verbose"]

    filename=args["pdf"]
    if Verbose_Flag:
        print("filename={}".format(filename))

    #output_string = StringIO()
    output_string = BytesIO()
    with open(filename, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
        #device = HTMLConverter(rsrcmgr, output_string, laparams=LAParams(), layoutmode='normal', codec='utf-8')

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

        text=output_string.getvalue().decode('UTF-8')
        if Verbose_Flag:
            print("text: {}".format(text))

    # define the marker string
    quad__euro_marker='€€€€'

    # look for the new start of the For DiVA information
    diva_start=text.find("{0} For DIVA {0}".format(quad__euro_marker))
    if diva_start < 0:
        # if not found, then try the older For DIVA string
        diva_start=text.find("For DIVA")

    if Verbose_Flag:
        print("For DIVA found at diva_start={}".format(diva_start))
    if diva_start >= 0:
        diva_data=text[:]
        diva_data=diva_data[diva_start:]
        diva_start=diva_data.find("{")
        if diva_start >= 0:
            diva_data=diva_data[diva_start:]
            end_block=diva_data.find('”Number of lang instances”:') # note these are right double quote marks
            if end_block < 0:            
                end_block=diva_data.find('"Number of lang instances":') # note these are straight double quote marks
            if end_block > 0:
                end_block=diva_data.find(',', end_block)
                if end_block > 0:
                    dict_string=diva_data[:]
                    dict_string=dict_string[:end_block]+'}'

                    dict_string=dict_string.replace('', '') #  remove any new page characters
                    dict_string=dict_string.replace('”', '"')
                    dict_string=dict_string.replace('\n\n', '\n')
                    dict_string=dict_string.replace(' \n', '')
                    dict_string=dict_string.replace(',}', '}')

                    dict_string=dict_string.replace('”', '"')
                    #dict_string=dict_string.replace('&quot;', '"')
                    #dict_string=dict_string.replace('<br>', '\n')
                    #dict_string=dict_string.replace('<br>"', '\n"')
                    #dict_string=dict_string.replace('<br>}', '\n}')
                    dict_string=dict_string.replace(',\n\n}', '\n}')
                    dict_string=dict_string.replace(',\n}', '\n}')

                    # fix an error in the early template
                    if dict_string.find(',Äddress": ') > 0:
                        print("fix an error in the early template")
                        dict_string=dict_string.replace(',Äddress": ', ',"Address": "')
                        dict_string=dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                        dict_string=dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                        dict_string=dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')



                    if not args['ligatures']:
                        dict_string=replace_ligature(dict_string)
                        print("looking for and replacing ligatures")

                    if Verbose_Flag:
                        print("dict_string={}".format(dict_string))
                    print("dict_string={}".format(dict_string))
                    d=json.loads(dict_string)
                    if Verbose_Flag:
                        print("d={}".format(d))

                    abs_keywords=diva_data[(end_block+1):]
                    abs_keywords=abs_keywords.replace('', '')
                    if Verbose_Flag:
                        print("abs_keywords={}".format(abs_keywords))
                    number_of_quad_euros=abs_keywords.count(quad__euro_marker)
                    if Verbose_Flag:
                        print("number_of_quad_euros={}".format(number_of_quad_euros))
                    abstracts=dict()
                    keywords=dict()
                    if (number_of_quad_euros % 2) == 1:
                        print("Odd number of markers")

                    save_abs_keywords=abs_keywords[:]

                    number_of_pairs_of_markers=int(number_of_quad_euros/2)
                    for i in range(0, number_of_pairs_of_markers):
                        abstract_key_prefix='”Abstract['
                        key_offset=abs_keywords.find(abstract_key_prefix)
                        if key_offset > 0:
                            # found a key for an abstract
                            # get language code
                            lang_code_start=key_offset+len(abstract_key_prefix)
                            lang_code=abs_keywords[lang_code_start:lang_code_start+3]
                            quad__euro_marker_start=abs_keywords.find(quad__euro_marker, lang_code_start)
                            if quad__euro_marker_start >= 0:
                                quad__euro_marker_end=abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                                abstracts[lang_code]=abs_keywords[quad__euro_marker_start+5:quad__euro_marker_end]
                                #br_offset=abstracts[lang_code].find('<br>')
                                #if br_offset >= 0:
                                #    abstracts[lang_code]=abstracts[lang_code][br_offset+4:]

                                abs_keywords=abs_keywords[quad__euro_marker_end+1:]
                        

                    abs_keywords=save_abs_keywords[:]

                    for i in range(0, number_of_pairs_of_markers):
                        abstract_key_prefix='”Keywords['
                        key_offset=abs_keywords.find(abstract_key_prefix)
                        if key_offset > 0:
                            # found a key for an abstract
                            # get language code
                            lang_code_start=key_offset+len(abstract_key_prefix)
                            lang_code=abs_keywords[lang_code_start:lang_code_start+3]
                            quad__euro_marker_start=abs_keywords.find(quad__euro_marker, lang_code_start)
                            if quad__euro_marker_start > 0:
                                quad__euro_marker_end=abs_keywords.find(quad__euro_marker, quad__euro_marker_start + 5)
                                keywords[lang_code]=abs_keywords[quad__euro_marker_start+5:quad__euro_marker_end]
                                keywords[lang_code]=keywords[lang_code].replace('\n', '') # remove newlines from keywords
                                keywords[lang_code]=keywords[lang_code].strip() # remove starting and ending white space
                                br_offset=keywords[lang_code].find('<br>')
                                if br_offset >= 0:
                                    keywords[lang_code]=keywords[lang_code][br_offset+4:]
                                abs_keywords=abs_keywords[quad__euro_marker_end+1:]
                        

                    for a in abstracts:
                        print("a={0}, abstract={1}".format(a, abstracts[a]))
                        abstracts[a]=clean_up_abstract(abstracts[a])

                    any_acronyms_in_abstracts=False
                    for a in abstracts:
                        acronyms_present=check_for_acronyms(abstracts[a])
                        if acronyms_present:
                            any_acronyms_in_abstracts=True

                    if any_acronyms_in_abstracts:
                        acronyms_filename=args["acronyms"]
                        print("Acronyms found, getting acronyms from {}".format(acronyms_filename))
                        acronym_dict=get_acronyms(acronyms_filename)
                        if len(acronym_dict) == 0:
                            print("no acronyms found in {}".format(acronyms_filename))
                        else:
                            # entries of the form: acronym_dict[label]={'acronym': acronym, 'phrase': phrase}
                            for a in abstracts:
                                abstracts[a]=spellout_acronyms_in_abstract(acronym_dict, abstracts[a])


                    print("abstracts={}".format(abstracts))
                    print("keywords={}".format(keywords))

                    d['abstracts']=abstracts
                    d['keywords']=keywords
                    output_filename=args["json"]
                    if Verbose_Flag:
                        print("output_filename={}".format(output_filename))
                    with open(output_filename, 'w', encoding='utf-8') as output_FH:
                        j_as_string = json.dumps(d, ensure_ascii=False)
                        print(j_as_string, file=output_FH)

            else:
                print('No "Number of lang instances" found')
                dict_string=diva_data[:]
                print("initial dict_string={}".format(dict_string))

                dict_string=dict_string.replace('', '') #  remove any new page characters

                dict_string=dict_string.replace('”', '"')
                dict_string=dict_string.replace('\n\n', '\n')
                dict_string=dict_string.replace(' \n', '')
                dict_string=dict_string.replace(',}', '}')

                #dict_string=dict_string.replace('&quot;', '"')
                #dict_string=dict_string.replace('<br>', '\n')
                #dict_string=dict_string.replace('<br>"', '\n"')
                #dict_string=dict_string.replace('<br>}', '\n}')
                dict_string=dict_string.replace(',\n\n}', '\n}')
                dict_string=dict_string.replace(',\n}', '\n}')
                # fix an error in the early template
                if dict_string.find(',Äddress": ') > 0:
                    print("fix an error in the early template")
                    dict_string=dict_string.replace(',Äddress": ', ',"Address": "')
                    dict_string=dict_string.replace('\"Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string=dict_string.replace('¨Lindstedtsvägen', 'Lindstedtsvägen')
                    dict_string=dict_string.replace('¨Isafjordsgatan', 'Isafjordsgatan')

                if not args['ligatures']:
                    dict_string=replace_ligature(dict_string)
                    print("looking for and replacing ligatures")

                print("dict_string={}".format(dict_string))
                d=json.loads(dict_string)
                print("d={}".format(d))

                output_filename=args["json"]
                if Verbose_Flag:
                    print("output_filename={}".format(output_filename))
                with open(output_filename, 'w', encoding='utf-8') as output_FH:
                    j_as_string = json.dumps(d, ensure_ascii=False)
                    print(j_as_string, file=output_FH)
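A minimal hedged entry point for the main() above; the sys.argv slicing matches the argparse call inside it, and the file names in the comment are placeholders.

import sys

if __name__ == '__main__':
    # e.g. python extract_pseudo_JSON-from_PDF.py -p thesis.pdf -j event.json
    main(sys.argv[1:])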
Example #47
0
def parse(file_name):
    """ Exact pdf file into text format and calling above methods to detect useful student information
        Method description:
        1. Exact pdf file and transform into text file by using pdfminer library functions
        2. Used above methods to detect and store the useful information in the list: uesful
        
        return: "useful", a list contains all the useful informaiton of student
        
        raises: none know bugs
        
        """
    
    fp = open(file_name, 'rb') #Open the file and read as binary mode;
    #Created pdf parser object associated with the original pdf file
    praser = PDFParser(fp)
    # Created a PDF document object to store the document structure
    doc = PDFDocument()
    #Connected the parser object and the document
    praser.set_document(doc)
    doc.set_parser(praser)
    useful = [] # This is the empty list to store incoming useful student information
    #Initialize our empty doc
    doc.initialize()


    #To test if the document can be converted to text format
    #If not, stop;
    #Else, continue.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        #Created PDFResourceManager to manage the shared resources
        rsrcmgr = PDFResourceManager()
        #Created PDF device object to store the interpreted format of the data
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)

        #Create PDF interpreter object to transform the shared information in the rsrcmgr and store it in device
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        #Use for loop to go through the file, and unit is page number
        #Initial page number is 0
        page_number = 0
        
        temp_use = []
        temp_dict = {
                    "name":"",
                    "LASID":"",
                    "DOB":"",
                    "Grade":"",
                    "RD":"",
                    "School":"",
                    "District":"",
                    "Score":"",
                    "Score_level":"",
                    "low_top":"",
                    "course":"",
                } 
        for page in doc.get_pages(): # doc.get_pages() to get page lists information
            interpreter.process_page(page)
            # To accept interpreted page LTPage object
            layout = device.get_result()
            #layout is the LTPage for this page; it stores the interpreted objects of the page, such as LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal. To capture strings we need the text instances.
            read_flag = 0  # flag: set once the first text box of this page has been read
            for x in layout: #for every layout
                if (isinstance(x, LTTextBoxHorizontal)):# if the instance type of layout is LTTextBoxHorizontal
                    results = x.get_text()# we store all the txt in the results
                    print (results)
                    if page_number%2 == 0 and read_flag==0: # This is the even page number and no previous page has been read
                        #temp_use.append(results)
                        temp_dict["course"] = results.split("\n")[0]
                        read_flag = 1
                        continue
                    else:# if odd pages or there is previous page has been read
                        #  Continue getting name, LASID, DOB, etc...
                        if get_name(results):
                            temp_dict["name"] = get_name(results) 
                        if get_LASID(results):
                            temp_dict["LASID"] = get_LASID(results)
                        if get_DOB(results):
                            temp_dict["DOB"] = get_DOB(results)
                        if get_Grade(results):
                            temp_dict["Grade"] = get_Grade(results)
                        if get_RD(results):
                            temp_dict["RD"] = get_RD(results)
                        if get_School(results):
                            temp_dict["School"] = get_School(results)
                        if get_District(results):
                            temp_dict["District"] = get_District(results)
                        if get_Score(results):
                            temp_dict["Score"] = get_Score(results)
                        if get_Score_level(results):
                            temp_dict["Score_level"] = get_Score_level(results)
                        if get_low_top(results):
                            temp_dict["low_top"] = get_low_top(results)
                    #print (temp_dict)
                    #input("==")
            #page_number += 1
            #print (page_number)
            #if page_number%2 == 0:
            if 1:  # When page number == 1;
                useful.append(temp_dict)
                temp_dict = {
                    "name":"",
                    "LASID":"",
                    "DOB":"",
                    "Grade":"",
                    "RD":"",
                    "School":"",
                    "District":"",
                    "Score":"",
                    "Score_level":"",
                    "low_top":"",   
                } 
    return useful
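A hedged usage sketch for parse(); the report path is a placeholder, and the get_* helpers it calls are assumed to be defined earlier in the same module.

# Hypothetical usage; 'student_report.pdf' is a placeholder path.
records = parse('student_report.pdf')
for record in records:
    print(record['name'], record['LASID'], record['Score'])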
Example #48
0
    if not file_name.endswith(".pdf"):
        continue
    f = open(os.path.join(os.getcwd(), file_name), "rb")
    parser = PDFParser(f)
    document = PDFDocument(parser, "")

    if not document.is_extractable:
        print("PERMISSION DENIED!!!!")
        continue
    
    rsmanager = PDFResourceManager()
    outfile = file_name[:len(file_name)-4]+".txt"
    outfp = open(outfile, 'w', encoding='utf-8')
    laparams = LAParams()
    device = TextConverter(rsmanager, outfp, laparams=laparams,imagewriter=None)
    intrptr = PDFPageInterpreter(rsmanager, device)
    for page in PDFPage.get_pages(f):
        intrptr.process_page(page)
    device.close()
    outfp.close()
    f.close()


searching.find()

#os.system('python file.py')
Example #49
0
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter, HTMLConverter, XMLConverter
from pdfminer.layout import LAParams
import io

path = "B:\\Alien Brain\\Python Warm-Up\\Test\\2.pdf"
pdf = open(path, 'rb')

mem = io.StringIO()
lp = LAParams()
rm = PDFResourceManager()
#cnv = TextConverter(rm,mem,laparams=lp)
#cnv = HTMLConverter(rm,mem,laparams=lp)
cnv = XMLConverter(rm, mem, laparams=lp)
ip = PDFPageInterpreter(rm, cnv)

for i in PDFPage.get_pages(pdf):
    ip.process_page(i)
    text = mem.getvalue()

file = open(path + "Converted.xml", 'wb')
file.write(text.encode('utf-8'))
file.close()

print("DONE")
Example #50
0
def purgeextract(infilename):
    Report = open(infilename, 'rb')

    outlist = list()
    #setup
    parser = PDFParser(Report)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    #Set parameters
    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    #extracting text from PDF
    extracted_text = ''
    for page in doc.get_pages():
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if isinstance(lt_obj, LTTextBox) or isinstance(lt_obj, LTTextLine):
                extracted_text += lt_obj.get_text()
    texttolist = extracted_text.split('\n')
    
    #Extracting
    found = 0 #Using this to check if the pdf file had any purge values before output
    for i in range (len(texttolist)):
        if found <2:
            if texttolist[i] == 'Type of calibration: Ion transfer (pos): Optimize C-Trap Entrance Lens --- Inject # (V)':
                stop = 0    
                y = 0
                for y in range(len(texttolist)-(i+1)):#location depends on the various calibrations - have to look through it all.

                    if 'result:' in texttolist[i+y].split() and stop == 0:
                        try:
                            outlist.append(['Entrance', texttolist[i+y].split('->')[0].split()[-1],texttolist[i+y].split('->')[1]])
                        except(IndexError):
                            outlist.append('indexissue')
                            
                        stop = 1 #Stopping this loop after it found the result - otherwise it might catch the exit value as well.
                found +=1
                
                
                
            if texttolist[i] == 'Type of calibration: HCD Transfer: Optimize C-Trap Exit Lens --- Purge # (V)':
                y=0
                stop = 0    
                for y in range(len(texttolist)-(i+1)):#Apparently the distance to the result line varies

                    if 'result:' in texttolist[i+y].split() and stop == 0:
                        try:
                            outlist.append(['Exit', texttolist[i+y].split('->')[0].split()[-1],texttolist[i+y].split('->')[1]])
                        except(IndexError):
                            outlist.append('indexissue')
                        stop = 1
                found +=1
            #if found == 2:
                #i = len(texttolist)
    if found != 0:            
        Report.close()
        return(outlist)
    else: 
        Report.close()
        return(0)
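A hedged usage sketch for purgeextract(); the report path is a placeholder, and each entry is either an [Entrance/Exit, before, after] triple or the string 'indexissue'.

# Hypothetical usage; 'calibration_report.pdf' is a placeholder path.
result = purgeextract('calibration_report.pdf')
if result:
    for entry in result:
        print(entry)
else:
    print('no purge values found in the report')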
Example #51
0
def decode_pdf(filename):
    global current_section
    global pre_section
    global pre_font_family
    global pre_font_size
    global title
    global authors
    global abstract
    global keywords

    current_section = ""
    pre_section = TAG_BEGIN
    pre_font_family = ""
    pre_font_size = ""
    title = ""
    authors = set()
    abstract = ""
    keywords = ""

    path = basedir + "/static/demos/paperminer/papers/" + filename
    # layout parameters
    laparams = LAParams()
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outtype = 'html'
    out = StringIO()
    # Opens a file for reading only in binary format. The file pointer is
    # placed at the beginning of the file. This is the default mode.
    fp = open(path, 'rb')

    # parse PDF to HTML
    codec = 'utf-8'
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               out,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=None)
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              out,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=None)
    if outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               out,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=None)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    pagenos = set()
    # only process the first page
    max_page = 1
    p = 0
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=max_page,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        if p >= max_page:
            break
        interpreter.process_page(page)
    fp.close()
    device.close()
    # str_value is the first PDF page in HTML
    str_value = out.getvalue()
    out.close()

    # loop through each line in HTML
    for line in str_value.split('<br>'):
        analyze(line)
    result = [
        title.decode('utf-8'), authors,
        abstract.decode('utf-8'),
        keywords.decode('utf-8')
    ]

    return result
Example #52
0
    def fetch(self) -> Iterator[Journal]:
        resp = requests.get(self.url)
        resp.raise_for_status()

        content_length = resp.headers.get("Content-Length", None)
        content_length = int(
            content_length) if content_length is not None else None
        cached_file = cache_in_memory(resp, size=content_length)
        with pdfplumber.open(cached_file) as pdf:

            def get_entries() -> Iterator[str]:
                from pdfplumber.utils import cluster_objects, extract_text, DEFAULT_X_TOLERANCE
                import unicodedata

                # fontname_regex = re.compile(r"([A-Z]{6})\+([A-Za-z]+)(\d+)?")
                small_font_size_threshold = Decimal("8.0")

                def is_font_bold(char: PDFChar) -> bool:
                    tag, fontname = char["fontname"].split("+")
                    return "BX" in fontname

                def is_font_small(char: PDFChar) -> bool:
                    return char["size"] < small_font_size_threshold

                def normalize_char(
                        char: PDFChar,
                        interpreter: PDFPageInterpreter) -> Optional[PDFChar]:
                    text = char["text"]
                    if len(text) > 1 and (
                            cid_match :=
                            cid_regex.fullmatch(text)) is not None:
                        cid = int(cid_match.group(1))
                        text = cmap_char(cid, char["fontname"], interpreter)
                        if text is None:
                            char["text"] = None
                            return char

                    ntext = unicodedata.normalize("NFKC", text)
                    if len(ntext) == 2 and unicodedata.combining(ntext[1]):
                        text = ntext[1]

                    text = make_combining_form(text) or text
                    if is_font_small(char):
                        if text == "o":
                            text = "°"

                    char["text"] = text
                    return char

                def sort_line_chars(
                        chars: Sequence[PDFChar],
                        interpreter: PDFPageInterpreter) -> Sequence[PDFChar]:
                    chars = (normalize_char(char, interpreter)
                             for char in chars)
                    chars = sorted(chars, key=lambda char: char["x0"])
                    main_chars, combining_chars = partition(
                        lambda char: char["text"] and unicodedata.combining(
                            char["text"]), chars)
                    combining_chars_iter = peekable(iter(combining_chars))
                    for main_char in main_chars:
                        yield main_char

                        while combining_chars_iter:
                            combining_char = combining_chars_iter.peek()

                            overlap = max(
                                min(main_char["x1"], combining_char["x1"]) -
                                max(main_char["x0"], combining_char["x0"]), 0)
                            if overlap < main_char["width"] * Decimal("0.5"):
                                break

                            yield combining_char
                            next(combining_chars_iter, None)

                    assert (next(combining_chars_iter, None) is None)

                    return
                    yield

                x_tolerance = Decimal("3.0")
                y_tolerance = Decimal("3.0")
                min_tab_width = Decimal("8.0")

                for page in pdf.pages:
                    device = PDFPageAggregator(
                        pdf.rsrcmgr,
                        pageno=page.page_number,
                        laparams=pdf.laparams,
                    )
                    interpreter = PDFPageInterpreter(pdf.rsrcmgr, device)
                    interpreter.process_page(page.page_obj)

                    contents = page.crop(
                        (
                            Decimal(100),
                            Decimal(70 +
                                    (200 if page.page_number == 1 else 0)),
                            page.width - Decimal(100),
                            page.height - Decimal(70),
                        ),
                        relative=False,
                    )
                    left_column = contents.crop(
                        (
                            Decimal(0),
                            Decimal(0),
                            contents.width * Decimal(0.5),
                            contents.height,
                        ),
                        relative=True,
                    )
                    right_column = contents.crop(
                        (
                            contents.width * Decimal(0.5),
                            Decimal(0),
                            contents.width,
                            contents.height,
                        ),
                        relative=True,
                    )

                    for column in (left_column, right_column):
                        bold_chars = filter(is_font_bold, column.chars)
                        bold_char_lines = cluster_objects(
                            bold_chars, "top", y_tolerance)
                        bold_line_y0s = (min(char["top"] for char in line)
                                         for line in bold_char_lines)

                        hsep_y0s = chain(bold_line_y0s, (column.bbox[3], ))
                        hsep_y0s = list(hsep_y0s)
                        for y0, y1 in windowed(hsep_y0s, 2):
                            if y1 is None:
                                break
                            entry = column.within_bbox(
                                (
                                    column.bbox[0],
                                    max(y0 - y_tolerance, column.bbox[1]),
                                    column.bbox[2],
                                    min(y1 + y_tolerance, column.bbox[3]),
                                ),
                                relative=False,
                            )

                            entry_lines = cluster_objects(
                                entry.chars, "top", y_tolerance)
                            entry_text = StringIO()

                            # TODO: refactor into separate top-level function, along with sort_line_chars, normalize_char.
                            for line_chars in entry_lines:
                                line_chars = list(line_chars)
                                last_char: Optional[PDFChar] = None
                                for char in sort_line_chars(
                                        line_chars, interpreter):
                                    if last_char is not None and last_char[
                                            "text"] is not None:
                                        if char["x0"] > last_char[
                                                "x1"] + min_tab_width:
                                            entry_text.write("\t")
                                        elif char["x0"] > last_char[
                                                "x1"] + x_tolerance:
                                            entry_text.write(" ")

                                    if char["text"] is not None:
                                        entry_text.write(char["text"])
                                        if not unicodedata.combining(
                                                char["text"]):
                                            last_char = char

                                entry_text.write("\n")

                            yield unicodedata.normalize(
                                "NFKC", entry_text.getvalue())

                return
                yield

            journal = Journal()
            for entry in get_entries():
                print(f"ENTRY: {entry}")
                # TODO: handle `[name in other language]` bits.
                pass

            if journal is not None and journal.names and journal.iso4:
                yield journal
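
The column segmentation above hinges on pairing each bold line's top with the next boundary (or the column bottom) to get one entry's vertical span. A minimal sketch of that pairing idea, assuming more_itertools.windowed and hypothetical y-values:

from itertools import chain
from more_itertools import windowed

bold_line_y0s = [72.0, 140.5, 210.2]   # hypothetical tops of the bold heading lines
column_bottom = 792.0                  # hypothetical bottom of the column bbox
for y0, y1 in windowed(chain(bold_line_y0s, (column_bottom,)), 2):
    if y1 is None:  # padding value; only appears when there is a single boundary
        break
    print(f"entry spans from y0={y0} to y1={y1}")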
Example #53
0
def parse_ndpdf(pdf_path):
    fp = open(pdf_path, "rb")
    # Create a PDF parser from the file object
    parse_pdf = PDFParser(fp)
    # Create a PDF document object
    doc = PDFDocument()
    parse_pdf.set_document(doc)
    doc.set_parser(parse_pdf)
    doc.initialize()
    # Check whether the document allows text extraction; abort if not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager to manage shared resources
        rsrcmgr = PDFResourceManager()
        # Create the layout-analysis parameters
        laparams = LAParams()
        # Create a page aggregator
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF page interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Iterate over the pages, processing one page at a time
        # doc.get_pages() returns the page list
        for page in doc.get_pages():
            # Read the page with the interpreter
            interpreter.process_page(page)
            # Fetch the aggregated content
            layout = device.get_result()
            results_last = ""
            # layout is an LTPage object holding the objects parsed from this page, typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.; call an object's get_text() to obtain its text
            a = 0
            gd = []
            zj = []
            hm = []
            xingzhi = []
            bili = []
            guoji = []

            for out in layout:
                # Check whether the object has a get_text() method (images and the like do not)
                # if hasattr(out,"get_text"):
                a += 1
                if isinstance(out, LTTextBoxHorizontal):
                    results = out.get_text()
                    # Parse the loss-carryforward table (A106000)
                    if a == 1:
                        if results != "A106000企业所得税弥补亏损明细表\n" and results != "中华人民共和国企业所得税年度纳税申报表(A类)\n" and results != "A000000企业基础信息表\n":
                            break
                        else:
                            biaoge = results
                            gd = False
                    # print(results)
                    # results_last = results
                    if biaoge == "A106000企业所得税弥补亏损明细表\n" and results_last == '前五年度\n前四年度\n前三年度\n前二年度\n前一年度\n本年度\n可结转以后年度弥补的亏损额合计\n':
                        nf = results.strip("").split("\n")
                        print(nf)
                    if biaoge == "A106000企业所得税弥补亏损明细表\n":
                        if results_last == '2\n' or results_last == "2011\n2012\n2013\n2014\n2015\n2016\n":
                            nstzhs = results.strip("").split("\n")
                            if len(nstzhs) == 7:
                                nstzhsd = nstzhs
                                print(nstzhsd)
                    # Parse the annual corporate income-tax return form (A-type)
                    if biaoge == "中华人民共和国企业所得税年度纳税申报表(A类)\n":
                        if results_last == '金额\n' and a == 11:
                            sz = results.strip("").split("\n")
                            print(sz)
                        elif a == 10 and "%" in results and "0.00" in results:
                            sz = results.strip("").split("\n")
                            print(sz)
                    # Parse the basic company information form (A000000)
                    if biaoge == "A000000企业基础信息表\n":
                        if "备抵法" in results or "直接核销法" in results:
                            cbjj = results.strip("").split("\n")
                            print(cbjj)
                    if biaoge == "A000000企业基础信息表\n" and a == 8:
                        kjzz = results.strip("").split("\n")
                        try:
                            # match = re.search(r'201适用的会计准则或会计制度 (.*?)', kjzz[0])
                            # print(match.group(1))
                            kjzzz = kjzz[0].split(" ")
                            kjzzzd = kjzzz[1]
                            print(kjzzzd)
                        except:
                            kjzzzd = ""
                            print(kjzzzd)
                    if biaoge == "A000000企业基础信息表\n" and "否" in results:
                        jcx = results.strip("").split("\n")
                        if len(jcx) == 6:
                            jcxx = jcx
                            print(jcxx)
                        else:
                            continue
                    if biaoge == "A000000企业基础信息表\n" and "301企业主要股东" in results:
                        gd = True
                        gdxx = []
                    if biaoge == "A000000企业基础信息表\n" and gd:
                        if "证件" not in results and "主要股东" not in results and "经济性质" not in results and "投资比例" not in results and "国籍" not in results and "302中国境内" not in results and "公司财务室" not in results \
                                and "备抵法" not in results and "直接核销法" not in results and "人民币" not in results:
                            gdxx.append(results)
                    results_last = results
    fp.close()
    pdf_dict = {}
    try:
        pdf_dict['所属行业明细'] = jcxx[2]
        pdf_dict['从业人数'] = jcxx[3]
        pdf_dict['存货计价方法'] = cbjj[1]
        pdf_dict['企业会计准则为'] = kjzzzd
        if "一般企业" in pdf_dict['企业会计准则为']:
            pdf_dict['企业会计准则为'] = "一般企业会计准则"
    except Exception as e:
        print(e)
        pdf_dict['所属行业明细'] = ""
        pdf_dict['从业人数'] = ""
        pdf_dict['存货计价方法'] = ""
        pdf_dict['企业会计准则为'] = ""
    try:
        index = 0
        for gl in gdxx:
            index += 1
            if "居民身份证" in gl or "营业执照" in gl:
                zjhm = gl.replace("\n", "")
                zjhm = zjhm.split('居民身份证')[1:]
                clean = []
                for g in zjhm:
                    if "营业执照" in g:
                        yy = g.split("营业执照")
                        if len(yy[0]) != 0:
                            clean.append("居民身份证")
                            clean.append(yy[0])
                        for zz in yy[1:]:
                            clean.append("营业执照")
                            clean.append(zz)
                    else:
                        clean.append("居民身份证")
                        clean.append(g)
                break
        tzxx = []
        end = index + len(clean)
        for tz in gdxx[index:end]:
            tz = tz.replace("\n", "")
            tzxx.append(tz)
        gj = []
        end2 = end + int(len(clean) / 2)
        for country in gdxx[end:end2]:
            country = country.replace("\n", "")
            gj.append(country)
        xm = []
        gs = int(len(clean) / 2)
        if index - 1 == gs:
            for mc in gdxx[:index - 1]:
                mc = mc.replace("\n", "")
                xm.append(mc)
        else:
            for mc in gdxx[:index - 1]:
                mc = mc.replace("\n", "")
                xm.append(mc)
            for mc in gdxx[end2:]:
                mc = mc.replace("\n", "")
                xm.append(mc)
        zhenghe = {}
        sb = 0
        for j in range(0, len(clean), 2):
            gdxxdict = {}
            if '其他单位证件' in clean[j]:
                gdxxdict["证件种类"] = "居民身份证"
            else:
                gdxxdict["证件种类"] = clean[j]
            gdxxdict["证件号码"] = clean[j + 1]
            gdxxdict["经济性质"] = tzxx[j]
            gdxxdict["投资比例"] = tzxx[j + 1]
            if "中华人民" in gj[sb] or "香港" in gj[sb]:
                gdxxdict["国籍"] = "中国"
            else:
                gdxxdict["国籍"] = gj[sb]
            gdxxdict["股东名称"] = xm[sb]
            wc = gdxxdict
            sb += 1
            zhenghe["{}".format(sb)] = wc
        pdf_dict['股东信息'] = zhenghe
        tzfxx2, tzfxx3, tzfxx4, tzfxx5, tzfxx6, tzfxx7, tzfxx8, tzfxx9, tzfxx10 = {}, {}, {}, {}, {}, {}, {}, {}, {}
        tzfxx1 = json.dumps(zhenghe, ensure_ascii=False)
        tzfxx2 = json.dumps(tzfxx2, ensure_ascii=False)
        tzfxx3 = json.dumps(tzfxx3, ensure_ascii=False)
        tzfxx4 = json.dumps(tzfxx4, ensure_ascii=False)
        tzfxx5 = json.dumps(tzfxx5, ensure_ascii=False)
        tzfxx6 = json.dumps(tzfxx6, ensure_ascii=False)
        tzfxx7 = json.dumps(tzfxx7, ensure_ascii=False)
        tzfxx8 = json.dumps(tzfxx8, ensure_ascii=False)
        tzfxx9 = json.dumps(tzfxx9, ensure_ascii=False)
        tzfxx10 = json.dumps(tzfxx10, ensure_ascii=False)
        # params = (
        #     self.batchid, "0", "0", self.companyid, self.customerid, tzfxx1, tzfxx2, tzfxx3, tzfxx4, tzfxx5,
        #     tzfxx6, tzfxx7, tzfxx8, tzfxx9, tzfxx10)
        # self.insert_db("[dbo].[Python_Serivce_GSTaxInfo_AddParent]", params)
    except:
        pass
    pdf_dict['纳税调整后所得'] = sz[18]
    ksmx = {}
    try:
        for i in range(len(nf) - 1):
            try:
                if nf[i] == "2016":
                    ksmx[nf[i]] = sz[18]
                else:
                    ksmx[nf[i]] = nstzhsd[i]
            except:
                ksmx[nf[i]] = nstzhsd[i]
    except:
        print("ksmx")
    pdf_dict["亏损明细"] = ksmx
    print(pdf_dict)
    return pdf_dict
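
A minimal invocation sketch for the extractor above (the path is a placeholder): parse one annual corporate income-tax return PDF and dump the extracted fields as JSON.

import json

result = parse_ndpdf("./samples/annual_tax_return.pdf")  # hypothetical input file
print(json.dumps(result, ensure_ascii=False, indent=2))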
Example #54
0
def parse():
    fp = open(path, 'rb')  # open in binary read mode
    # Create a PDF parser from the file object
    praser = PDFParser(fp)
    # Create a PDF document object
    doc = PDFDocument(praser)
    # Connect the parser and the document object
    praser.set_document(doc)
    # Create a PDF resource manager to manage shared resources
    rsrcmgr = PDFResourceManager()
    # Create a PDF device (page aggregator) object
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Iterate over the pages, processing one page at a time

    wb = Workbook()  # create a new Excel workbook
    ws = wb.active

    # Running count of rows written across pages
    text_number = 0

    for page in PDFPage.create_pages(doc):  # iterate over the document's page list
        interpreter.process_page(page)
        # Receive the LTPage object for this page
        layout = device.get_result()
        # layout is an LTPage object holding the objects parsed from this page, typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.; call an object's get_text() to obtain its text
        # Walk the text boxes
        page_container = []  # stores all the string dicts for this page
        page_rows = []  # stores the row y-positions
        for text_box in layout:
            if (isinstance(text_box, LTTextBox)):
                # Walk the text lines
                for text_line in text_box:
                    if (isinstance(text_line, LTTextLine)):
                        # Walk every character
                        temp = []  # characters collected for the current string
                        temp_loc = []  # bounding box of the current string
                        isfirst = True  # whether this is the first character of the string
                        for text_index in text_line:
                            # If this is character data, keep updating temp and temp_loc
                            if (isinstance(text_index, LTChar)):
                                temp.append(text_index.get_text())
                                if isfirst == True:
                                    temp_loc.append(
                                        round(text_index.bbox[0], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[1], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[2], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[3], 3))
                                    isfirst = False
                                temp_loc[2] = round(text_index.bbox[2], 3)
                                temp_loc[3] = round(text_index.bbox[3], 3)
                            # If this is an LTText terminator, store the collected string at the right place in page_container, then reset temp, temp_loc and isfirst
                            elif (isinstance(text_index, LTText)):
                                # If page_rows has no position data for this row yet, insert the entry into page_container and page_rows
                                # if temp_loc[1] not in page_rows:
                                if is_not_in(page_rows, temp_loc[1]):
                                    insert_loc = insert_into_page_rows(
                                        page_rows, temp_loc[1])
                                    page_container.insert(
                                        insert_loc, [{
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        }])
                                    # page_rows.append(temp_loc[1])
                                    # page_container.append([{'value':''.join(temp),'location':temp_loc}])
                                # If the row already exists
                                elif not is_not_in(page_rows, temp_loc[1]):
                                    # loc = page_rows.index(temp_loc[1])
                                    loc = get_page_rows_loc(
                                        page_rows, temp_loc[1])
                                    temp_list = insert_into_page_container(
                                        page_container[loc], {
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        })
                                    page_container[loc] = temp_list[:]
                                temp = []
                                temp_loc = []
                                isfirst = True

        # Record the number of rows on the current page
        rows_num = len(page_container)

        # Re-align the last row
        if len(page_container[rows_num - 1]) != len(
                page_container[rows_num - 2]):
            page_container[rows_num - 1], unused_flag = align_row(
                page_container[rows_num - 2], page_container[rows_num - 1])

        # Detect the header within the first five rows and re-align them
        if len(page_container[0]) != len(page_container[1]) or len(
                page_container[1]) != len(page_container[2]) or len(
                    page_container[2]) != len(page_container[3]) or len(
                        page_container[3]) != len(page_container[4]):
            rows_length = []
            the_max_row = []
            rejust_rows = []
            rejust_rows_num = []

            for i in range(7):
                rows_length.append(len(page_container[i]))

            max_length = max(rows_length)
            the_max_row = page_container[rows_length.index(max_length)][:]

            for i in range(len(rows_length)):
                if rows_length[i] < max_length:
                    page_container[i], flag_for_title = align_row(
                        the_max_row, page_container[i])
                    if flag_for_title == False:
                        rejust_rows.append(page_container[i])
                        rejust_rows_num.append(i)

            # Merge elements that wrapped onto a new line inside a cell (first five rows)
            if rejust_rows != []:
                compiled_row = compile_row(rejust_rows)
                page_container.insert(rejust_rows_num[0], compiled_row)
                for i in range(len(rejust_rows_num)):
                    del page_container[rejust_rows_num[0] + 1]

        # After handling the header, record the number of rows on this page
        rows_num = len(page_container)

        # Debug output for verification
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                print(page_container[i][j])
        # print(page_container)
        # print(page_rows)

        # Once this page's data is collected, write it to the Excel sheet
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                cell_index = ws.cell(row=i + 1 + text_number, column=j + 1)
                if page_container[i][j] is None:
                    cell_index.value = ' '
                elif page_container[i][j]['value'] == '':
                    ws.merge_cells(start_row=i + 1 + text_number,
                                   start_column=1,
                                   end_row=i + 1 + text_number,
                                   end_column=len(page_container[i]))
                    break
                else:
                    cell_index.value = page_container[i][j]['value']

        # Update text_number so rows from consecutive pages stay contiguous
        text_number += rows_num

    fp.close()
    wb.save(r'C:\Users\15644\Desktop\pdf_file\test_pdf_list\test_10.xlsx')
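
A sketch of how the Excel converter above is presumably driven (an assumption: `path` is a module-level variable naming the table-style PDF, and the helper functions such as is_not_in, align_row and compile_row are defined elsewhere in the same module).

path = r'C:\Users\15644\Desktop\pdf_file\test_pdf_list\test_10.pdf'  # hypothetical input matching the hard-coded output path
parse()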
Example #55
0
def parse(pdf_file, code_source, code_new):
    pdf_path_ = "./"+watermark_dir+"/" + pdf_file
    # Keep the file handle open for the whole parse; pdfminer reads page
    # content lazily when each page is processed.
    pdf_io = open(pdf_path_, 'rb')
    # Create a PDF document parser from the file object
    # parser = PDFParser(DataIO)
    parser = PDFParser(pdf_io)
    # Create a PDF document object
    doc = PDFDocument()
    # Connect the parser and the document to each other
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the initialization password (empty by default if none is set)
    doc.initialize()
    # Check whether the document allows text extraction; raise if it does not
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # Create a PDF resource manager to manage shared resources
        rsrcmagr = PDFResourceManager()
        # Create a PDF device object
        laparams = LAParams()
        # Aggregate the resource manager and the device object
        device = PDFPageAggregator(rsrcmagr, laparams=laparams)
        # Create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmagr, device)

        pg = 0
        # Iterate over the pages, processing one page at a time
        # print(doc.get_pages())
        # doc.get_pages() returns the page list
        for page in doc.get_pages():
            size_x = []
            size_y = []
            size_font = []
            layout_x = 2448
            layout_y = 1584
            try:
                interpreter.process_page(page)
                # Receive the LTPage object for this page
                layout = device.get_result()
                # layout is an LTPage object holding the objects parsed from this page,
                # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.;
                # call an object's get_text() to obtain its text
                layout_x = int(layout.bbox[2])
                layout_y = int(layout.bbox[3])
                for x in layout:
                    # Read every text box
                    if isinstance(x, LTTextBoxHorizontal):
                        for line in x:
                            # Read every text line
                            if isinstance(line, LTTextLine):
                                # Process each text line
                                result = line.get_text().lower()
                                # Starting offset for string matching
                                num_code = 0
                                # Match the original string:
                                # loop once per occurrence of code_source in this line
                                for i in range(result.count(code_source.lower())):
                                    # Find the keyword and its character index in the line
                                    codetwo = result.find(code_source.lower(), num_code)
                                    # Continue the next search after this position
                                    num_code = codetwo + 1
                                    i = 0
                                    for char in line:
                                        # Walk every character
                                        if isinstance(char, LTChar):
                                            # print(char.get_text())
                                            if i == codetwo:
                                                size_x.append(float(char.bbox[0]))
                                                size_y.append(float(char.bbox[1]))
                                                size_font.append(float(char.size))
                                        i += 1

            except Exception as e:
                print(e)
                print("The watermark file creation failed")
                return 1
            # Generate the replacement watermark from the matched text coordinates
            # Default page size
            name_without_postfix = os.path.splitext(pdf_file)[0]
            path_tmp = watermark_dir + "/" + name_without_postfix
            check_dir(path_tmp)
            mark = canvas.Canvas(path_tmp+"/" + str(pg) + ".pdf", pagesize=(layout_x, layout_y))

            # Generate a per-page watermark PDF: write the replacement text code_new at the position of the matched text code_source (used later to merge over and cover the original)
            j = 0
            for i in size_x:
                # Target coordinates for this match (the page origin (0, 0) is at the bottom-left)
                x = float(i)
                y = float(size_y[j])

                # Set the stroke color
                mark.setStrokeColorRGB(1, 1, 1)
                # Set the fill color
                mark.setFillColorRGB(1, 1, 1)
                # Draw a white rectangle over the matched text
                mark.rect(x, y, size_font[j] * 2.8, size_font[j] * 0.8, fill=1)
                j += 1
            z = 0
            for i in size_x:
                x = float(i)
                y = float(size_y[z])
                # Set the font
                mark.setFont("Helvetica", size_font[z] * 0.8)

                # Set the fill color
                mark.setFillColorRGB(0, 0, 0)
                # Set the opacity (1 means fully opaque)
                mark.setFillAlpha(1)
                # Write the replacement text into the watermark PDF, in upper case
                mark.drawString(x + (size_font[z] * 0.1), y + (size_font[z] * 0.125), code_new.upper())
                # If the font is large, also cover the logo to the left
                if size_font[z] > 20:
                    # Set the stroke color
                    mark.setStrokeColorRGB(1, 1, 1)
                    # Set the fill color
                    mark.setFillColorRGB(1, 1, 1)
                    # Draw a covering rectangle
                    mark.rect(x - 35, y - 2, 35, size_font[z] * 0.9, fill=1)
                z += 1

            # Draw an empty rectangle
            mark.rect(0, 0, 0, 0, fill=1)
            # Close and save the watermark PDF for this page
            mark.save()
            pg += 1

        pdf_io.close()
        print('The watermark files were created successfully.')
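
A hypothetical invocation of the watermark routine above (the file name and both codes are placeholders); it assumes `watermark_dir` and `check_dir` are defined at module level, and it returns 1 when page processing fails.

status = parse("quarterly_report.pdf", code_source="acme", code_new="zeta")
if status == 1:
    print("watermark generation failed; see the error above")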
Example #56
0
def get_paper_content(fname, pages=2, outdir="data"):
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    basename = os.path.basename(fname)
    basename = basename.replace(".pdf", "")
    outfile = os.path.join(outdir, basename + ".html")
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1
    scale = 1
    caching = True
    showpageno = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = open(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter,
                              stripcontrol=stripcontrol)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter,
                               debug=debug)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        return

    fp = open(fname, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    interpreter.debug = True
    try:
        for index, page in enumerate(
                PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True)):
            if index > pages:
                break
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    except:
        print(fname)
        return

    fp.close()
    device.close()
    outfp.close()
    return
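
A minimal invocation sketch (the paths are placeholders): extract the opening pages of a paper into data/<name>.html, which the extension-based dispatch above routes to HTMLConverter.

get_paper_content("papers/example_paper.pdf", pages=2, outdir="data")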
Example #57
0
def main(argv):
    #codec = 'utf-8'
    codec = 'ascii'
    laparams = LAParams()
    pagenos = set()
    maxpages = 0
    password = ''
    caching = True
    rotation = 0
    rsrcmgr = PDFResourceManager(caching=caching)

    # Do a double read thanks to:
    # https://mail.python.org/pipermail/python-list/2009-April/531944.html
    mm = mmap.mmap(-1, 1024 * 1024 * 1024)

    device = TextConverter(rsrcmgr,
                           mm,
                           codec=codec,
                           laparams=laparams,
                           imagewriter=None)

    fname = argv[1]
    fp = open(fname, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)
    fp.close()

    eof = mm.tell()
    device.close()
    mm.close()

    # Recreate the mmap area w/the correct size
    mm = mmap.mmap(-1, eof)

    device = TextConverter(rsrcmgr,
                           mm,
                           codec=codec,
                           laparams=laparams,
                           imagewriter=None)

    fname = argv[1]
    fp = open(fname, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.get_pages(fp,
                                  pagenos,
                                  maxpages=maxpages,
                                  password=password,
                                  caching=caching,
                                  check_extractable=True):
        page.rotate = (page.rotate + rotation) % 360
        interpreter.process_page(page)
    fp.close()

    mm.seek(0)

    ip_regex = re.compile(
        r'((?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))'
    )
    hash_regex = re.compile(
        r'(?:[A-Fa-f0-9]{32}|[A-Fa-f0-9]{40}|[A-Fa-f0-9]{64})'
    )  # md5, sha1, sha256
    url_regex = re.compile(
        r'\b((?:[\w-]+://?|www[.])[A-Za-z0-9-_\/.%?=&\[\]()@!$#,;]+)',
        re.MULTILINE)
    hostname_regex = re.compile(
        r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)',
        re.MULTILINE)
    single_line_hostname_regex = re.compile(
        r'([a-zA-Z\d-]{,63}(?:\.[a-zA-Z\d-]{,63}|\s\.\s[a-zA-Z\d-]{,63})+)')
    doc = ''
    while True:
        if mm.tell() >= eof:
            break
        doc += mm.readline().decode(codec, errors='ignore').rstrip()

    m = re.findall(ip_regex, doc)
    if m is not None and len(m) > 0: print(set(m))
    m = re.findall(url_regex, doc)
    if m is not None and len(m) > 0: print(set(m))
    m = re.findall(hash_regex, doc)
    if m is not None and len(m) > 0: print(set(m))
    m = re.findall(hostname_regex, doc)
    hostname_candidates = []
    if m is not None and len(m) > 0: hostname_candidates = list(set(m))
    m = re.findall(single_line_hostname_regex, doc)
    if m is not None and len(m) > 0:
        hostname_candidates = list(set(m + hostname_candidates))

    if len(hostname_candidates) > 0:
        for h in hostname_candidates:
            domain = h.replace(' ', '')
            #print(h)
            if uniaccept.verifytldoffline(
                    domain, "./tld-list.txt") and domain[-1] != '.':
                print(h)

    #print doc
    device.close()
    mm.close()
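
A presumed entry point for the IOC-extraction script above (an assumption, since the original call site is not shown): the PDF path is passed as the first command-line argument.

if __name__ == '__main__':
    import sys
    main(sys.argv)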
Example #58
0
def identify_name_pdf(path_file, operator_code):

    if operator_code == 'TIM':
        operator = 'TIM'
        reference = r'(REF: [A-Z]{3}\/[0-9]{2})'
        account = '(CLIENTE: [0-9]{1}.[0-9]{7})'

    if operator_code == 'CLARO':
        operator = 'Claro'
        reference = r'(Data de Emissão: [0-9-]{2}\/[0-9]{2}\/2020)'
        account = '(Nº da Conta: [0-9]{9})'

    with open(path_file, 'rb') as fp:
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(fp)
        for page in pages:
            interpreter.process_page(page)
            data = retstr.getvalue()
            operator_find = re.findall(operator, data)
            reference_find = re.findall(reference, data)
            account_find = re.findall(account, data)

        operator = operator_find[0]
        reference = reference_find[0]
        account = account_find[0]

        if operator_code == 'TIM':
            account = account.replace('.', '')
            account = account.replace('CLIENTE: ', '')

            reference = reference.replace('REF: ', '')
            reference = dateparser.parse(
                reference, settings={'TIMEZONE': 'America/Sao_Paulo'})
            reference = datetime.strftime(reference, "%Y%m")

        if operator_code == 'CLARO':
            operator = operator.upper()

            reference = reference.replace('Data de Emissão: ', '')
            reference = reference[3:10]
            reference = dateparser.parse(
                reference, settings={'TIMEZONE': 'America/Sao_Paulo'})
            reference = datetime.strftime(reference, "%Y%m")

            account = account.replace('Nº da Conta: ', '')

        path_file = "C:\\repositorio\\teste_pdf\\pdf_split\\"
        # path_file = 'C:\\Users\\BRCAP-BI01\\Desktop\\Claro Car\\'

        new_name_pdf = path_file + str(
            reference) + '-' + operator + '-' + account + '.pdf'
        print(new_name_pdf)
        return new_name_pdf


# if __name__ == "__main__":
#     path = 'C:\\Users\\BRCAP-BI01\\Desktop\\Claro Car\\bf.blim_307024353_002_M1_PS.TAMB[1].pdf'
#     new_name_pdf = identify_name_pdf(path, 'CLARO')
#     os.rename(path, new_name_pdf)
Example #59
0
def postlist_son(request):
    if 'delete_list' in request.POST:  # bulk delete within a sub-folder
        pdf_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=pdf_id)
        for file_info in file_infos:
            if file_info.file_type == 'FOLDER':
                delete_all(file_info.file_path)
                file_info.delete()
            else:
                os.remove(file_info.file_path)
                file_info.delete()
        file_infos = FileInfo.objects.all()
        messages.success(request, "文件删除成功!")
    elif 'mpdf_list' in request.POST:  # bulk merge of PDFs in a folder
        output = PdfFileWriter()
        outputPages = 0
        output_name = ''
        pdf_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=pdf_id)
        for file_info in file_infos:
            # Read the source PDF file
            input = PdfFileReader(open(file_info.file_path, "rb"))

            # Get the total number of pages in the source PDF
            pageCount = input.getNumPages()
            outputPages += pageCount
            print("页数:%d" % pageCount)

            # Add each page to the output
            for iPage in range(pageCount):
                output.addPage(input.getPage(iPage))
            # output_name=output_name+file_info.file_name.split('.')[0][0]+'-'
        output_name = '整合-' + file_info.folder_name.split('\\')[-1]

        # Write the merged result to the target PDF file
        outputStream = open(
            file_infos[0].folder_name + '\\' + output_name + '.pdf', "wb")
        output.write(outputStream)
        outputStream.close()
        file_info1 = FileInfo(file_name=output_name + '.pdf',
                              file_path=file_infos[0].folder_name + '\\' +
                              output_name + '.pdf',
                              file_type='pdf',
                              load_user=get_user(request),
                              is_personal=int(B),
                              folder_name=file_infos[0].folder_name)
        file_info1.save()
        file_size1 = os.path.getsize(file_infos[0].folder_name + '\\' +
                                     output_name + '.pdf')
        FileInfo.objects.filter(
            file_path=file_infos[0].folder_name + '\\' + output_name +
            '.pdf').update(
                file_size=1 if 0 < file_size1 < 1024 else file_size1 / 1024)
        messages.success(request, "PDF合并成功!")
    elif 'download_list' in request.POST:  # bulk download from a sub-folder
        download_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=download_id)
        # print('下载的文件名:' + file_info.file_name)
        for file_info in file_infos:
            file = open(file_info.file_path, 'rb')
            response = FileResponse(file)
            response[
                'Content-Disposition'] = 'attachment;filename="%s"' % urlquote(
                    file_info.file_name)
            return response
    elif 'tj_list' in request.POST:
        word_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=word_id)
        for file_info in file_infos:
            in_file = file_info.file_path
            out_file = file_info.file_path.split(".")[0] + ".doc"
            fp = open(in_file, 'rb')  # open in binary read mode
            # Create a PDF parser from the file object
            parser = PDFParser(fp)
            # Create a PDF document object
            doc = PDFDocument()
            # Connect the parser and the document object
            parser.set_document(doc)
            doc.set_parser(parser)
            # Supply the initialization password
            # (use an empty string if no password is set)
            doc.initialize()
            if not doc.is_extractable:
                raise PDFTextExtractionNotAllowed
            else:
                # Create a PDF resource manager to manage shared resources
                rsrcmgr = PDFResourceManager()
                # Create a PDF device (page aggregator) object
                laparams = LAParams()
                device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                # Create a PDF interpreter object
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # Counters for pages, images, curves, figures and horizontal text boxes
                num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0
                for page in doc.get_pages():  # iterate over the page list
                    num_page += 1  # one more page
                    interpreter.process_page(page)
                    # Receive the LTPage object for this page
                    layout = device.get_result()
                    for x in layout:
                        if isinstance(x, LTImage):  # image object
                            num_image += 1
                        if isinstance(x, LTCurve):  # curve object
                            num_curve += 1
                        if isinstance(x, LTFigure):  # figure object
                            num_figure += 1
                        if isinstance(x, LTTextBoxHorizontal):  # text content
                            num_TextBoxHorizontal += 1  # one more horizontal text box
                            # Save the text content
                            with open(out_file, 'a',
                                      encoding='utf-8') as f:  # append to the generated .doc file
                                results = x.get_text()
                                f.write(results)
                                f.write('\n')
                print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image,
                      '曲线数:%s\n' % num_curve,
                      '水平文本框:%s\n' % num_TextBoxHorizontal)
                file_info1 = FileInfo(
                    file_name=file_info.file_name.split('.')[0] + '.doc',
                    file_path=out_file,
                    file_type='doc',
                    load_user=get_user(request),
                    is_personal=int(B),
                    folder_name=file_info.folder_name)
                file_info1.save()
                file_size1 = os.path.getsize(out_file)
                FileInfo.objects.filter(file_path=out_file).update(
                    file_size=1 if 0 < file_size1 < 1024 else file_size1 /
                    1024)
        messages.success(request, "DOC转换成功!")
    else:
        pdf_id = request.POST.getlist("d2p_list")
        file_infos = FileInfo.objects.filter(id__in=pdf_id)
        for file_info in file_infos:
            in_file = file_info.file_path
            out_file = file_info.file_path.split(".")[0] + ".pdf"
            pythoncom.CoInitialize()
            word = win32com.client.Dispatch('Word.Application')
            doc = word.Documents.Open(in_file)
            doc.SaveAs(out_file, FileFormat=17)
            doc.Close()
            time.sleep(1)
            file_info1 = FileInfo(file_name=file_info.file_name.split('.')[0] +
                                  '.pdf',
                                  file_path=out_file,
                                  file_type='pdf',
                                  load_user=get_user(request),
                                  is_personal=int(B),
                                  folder_name=file_info.folder_name)
            file_info1.save()
            file_size1 = os.path.getsize(out_file)
            FileInfo.objects.filter(file_path=out_file).update(
                file_size=1 if 0 < file_size1 < 1024 else file_size1 / 1024)
        messages.success(request, "PDF转换成功!")
    return HttpResponseRedirect(reverse('fileserver:list1', args=[a]))
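
A hypothetical URL wiring for the Django view above; the redirect to 'fileserver:list1' suggests the app namespace is 'fileserver', but the route patterns and the list1 view signature are assumptions.

# urls.py (sketch)
from django.urls import path
from . import views

app_name = 'fileserver'
urlpatterns = [
    path('folder/<int:folder_id>/', views.list1, name='list1'),       # assumed view
    path('folder/actions/', views.postlist_son, name='postlist_son'),
]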
Example #60
0
    def testOnePriceRu(self):
        elem = self.driver.find_element_by_id("comp_name")
        elem.send_keys("ООО Евроторг")
        elem = self.driver.find_element_by_id("date")
        elem.send_keys("13.01.2020")
        # select the Belarusian ruble in the currency drop-down list
        elem = self.driver.find_element_by_name("valyuta")
        elem.click()
        options = elem.find_elements_by_tag_name("option")
        for option in options:
            if option.text == "Белорусский рубль, 974":
                option.click()
                break

        elem = self.driver.find_element_by_id("tovar_ed_default")
        elem.send_keys("кг")
        elem = self.driver.find_element_by_id("tovar_country_default")
        elem.send_keys("РБ")

        # the goods table
        elem = self.driver.find_element_by_id("tab1")
        tbody = elem.find_element_by_tag_name("tbody")
        tr = tbody.find_element_by_tag_name("tr")
        # product name
        td = tr.find_element_by_class_name("tovar_name")
        field = td.find_element_by_tag_name("textarea")
        field.send_keys("Конфеты Южная ночь")
        # product price
        td = tr.find_element_by_class_name("tovar_cena")
        field = td.find_element_by_tag_name("input")
        field.send_keys("10,55")

        # click the "Скачать" (Download) link
        elem = self.driver.find_element_by_id("download")
        elem.click()
        # wait 10 seconds
        # in case Firefox asks whether to save the file
        time.sleep(10)

        # check that the saved file exists, by name
        today = datetime.date.today()
        fullpath = (self.savePath + "cenniki-new-" +
                    today.strftime("%Y-%m-%d") + ".pdf")
        self.assertEqual(os.path.isfile(fullpath), True)

        # extract the text from the saved file
        # open the file
        fh = open(fullpath, 'rb')
        # take the first page
        page_obj = PDFPage.get_pages(fh, caching=True,
                                     check_extractable=True).__next__()

        resource_manager = PDFResourceManager()
        # create an object to receive the text output
        fake_file_handle = io.StringIO()
        # create a converter that extracts text from the PDF
        converter = TextConverter(resource_manager, fake_file_handle)
        # create a page interpreter
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # extract the page text
        page_interpreter.process_page(page_obj)
        # read the page text into the variable page
        page = fake_file_handle.getvalue()
        # dispose of the created objects
        converter.close()
        fake_file_handle.close()
        fh.close()

        # check that the entered values appear in the file text
        print(page)
        self.assertIn("ООО Евроторг", page)
        self.assertIn("13.01.2020", page)
        self.assertIn("кг", page)
        self.assertIn("РБ", page)
        self.assertIn("Конфеты Южная ночь", page)
        # price
        self.assertIn("10", page)
        self.assertIn("55", page)