Example #1
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Extract text from an open PDF page by page and hand it to ``parse_page``.

        Uses the legacy pdfminer wiring (explicit ``set_document`` /
        ``set_parser``).  The handler's header is printed before the first
        page and its footer after the last one.  Any ordinary exception is
        reported through ``self.handler.print_error`` instead of propagating;
        ``KeyboardInterrupt`` and ``SystemExit`` are re-raised.

        :param f: file object holding the PDF data (given to ``PDFParser``).
        :param fpath: path reported to the handler and to ``parse_page``.
        """
        try:
            layout_params = LAParams()
            layout_params.all_texts = True
            resource_manager = PDFResourceManager()

            if self.dedup:
                # Every document starts with an empty de-duplication store.
                self.dedup_store = set()

            self.handler.print_header(fpath)

            pdf_parser = PDFParser(f)
            document = PDFDocument(caching=True)
            pdf_parser.set_document(document)
            document.set_parser(pdf_parser)

            # Page numbers passed to parse_page start at 1.
            for page_num, page in enumerate(document.get_pages(), 1):
                buf = StringIO()
                converter = TextConverter(resource_manager, buf, laparams=layout_params)
                PDFPageInterpreter(resource_manager, converter).process_page(page)
                page_text = buf.getvalue()
                self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_num)
                buf.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
Example #2
0
    def _convert_pdf_to_text(self, password=None):
        """Convert the first 30 pages of ``self.cvFile`` to a text file.

        The text is written into ``self.scratchDir`` under the PDF's file
        name with its last extension removed, followed by the current Unix
        timestamp and ``.txt``.  The resulting path is stored in
        ``self.cvTextFile``.

        :param password: optional PDF password.  When given it replaces
            ``self.cvFilePasswd``; otherwise the stored value is used.
        :returns: ``0`` once the conversion has finished.
        """
        input_pdf = self.cvFile
        if password is not None:
            self.cvFilePasswd = password

        # Only pages 0..29 are converted.
        pagenos = range(0, 30)
        maxpages = len(pagenos)
        codec = 'utf-8'

        laparams = LAParams()
        laparams.all_texts = True
        laparams.showpageno = True

        # Everything before the last dot; empty when the name has no dot,
        # exactly like the split(".")/pop()/join it replaces.
        stem = os.path.basename(input_pdf).rpartition(".")[0]
        timestamp = int(time.time())
        output_filename = os.path.join(
            self.scratchDir, "%s%d.txt" % (stem, timestamp))
        self.cvTextFile = output_filename

        rsrcmgr = PDFResourceManager()
        # Context managers release both files even if the conversion raises.
        with open(output_filename, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
            try:
                with open(input_pdf, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                password=self.cvFilePasswd, check_extractable=True)
            finally:
                device.close()
        return 0
Example #3
0
def get_result_from_file(filename):
    """Collect per-page layout information from the PDF at ``filename``.

    :param filename: path of the PDF to analyse.
    :returns: ``{"filename": filename, "pages": [...]}`` where each page entry
        is a dict with the zero-based ``"index"``, the ``"bounding_box"``
        returned by ``get_bounding_box`` and the ``"labels"`` returned by
        ``get_text_labels`` for that page's layout.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfpage import PDFPage
    from pdfminer.pdfpage import PDFTextExtractionNotAllowed
    from pdfminer.pdfinterp import PDFResourceManager
    from pdfminer.pdfinterp import PDFPageInterpreter
    from pdfminer.converter import PDFPageAggregator
    from pdfminer.layout import LAParams

    result = {"filename": filename, "pages": []}
    # The with-block closes the file on every exit path, including the
    # extraction-not-allowed error below.
    with open(filename, "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.detect_vertical = True
        laparams.line_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page_index, page in enumerate(PDFPage.create_pages(document)):
            interpreter.process_page(page)
            layout = device.get_result()
            result["pages"].append({
                "index": page_index,
                "bounding_box": get_bounding_box(layout),
                "labels": get_text_labels(layout),
            })
    return result
Example #4
0
    def dump_pdf_pdfminer(self, fpath_in):
        """Write the text of the PDF ``fpath_in`` to a sibling ``.txt`` file.

        The output path is ``fpath_in`` with its extension replaced by
        ``.txt``.  On success the number of bytes written is printed; any
        ordinary exception only prints a failure message.

        :param fpath_in: path of the PDF to dump.
        """
        fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
        written = 0

        with open(fpath_in, 'rb') as fin, open(fpath_out, 'wb') as fout:
            try:
                layout_params = LAParams()
                layout_params.all_texts = True
                resource_manager = PDFResourceManager()

                for page in PDFPage.get_pages(fin, set(), check_extractable=True):
                    buf = StringIO()
                    converter = TextConverter(resource_manager, buf, laparams=layout_params)
                    PDFPageInterpreter(resource_manager, converter).process_page(page)
                    page_text = buf.getvalue()
                    buf.close()

                    fout.write(page_text)
                    written += len(page_text)
                # A parenthesised single argument prints identically whether
                # ``print`` is a statement or a function.
                print("Written %d bytes to %s" % (written, fpath_out))
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception:
                print("Failed parsing %s" % (fpath_in))
Example #5
0
    def count_words(self):
        """
        Return the number of whitespace-separated words in ``self.filename``.

        ASCII punctuation is removed from the extracted text before counting.

        Thanks to http://pinkyslemma.com/2013/07/02/word-frequency-from-pdfs/
        and http://www.unixuser.org/~euske/python/pdfminer/programming.html

        :raises PDFTextExtractionNotAllowed: if the document forbids
            text extraction.
        """
        with open(self.filename, "rb") as fp:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            laparams = LAParams()
            laparams.all_texts = True
            device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
            try:
                document = PDFDocument(PDFParser(fp))
                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.create_pages(document):
                    interpreter.process_page(page)

                full_text = retstr.getvalue()
            finally:
                # Release the converter and buffer on every exit path.
                device.close()
                retstr.close()

        # Filtering character by character works on both Python 2 and 3;
        # the two-argument translate()/string.maketrans form is Python 2 only.
        punctuation = frozenset(string.punctuation)
        full_text = "".join(ch for ch in full_text if ch not in punctuation)

        return len(full_text.split())
Example #6
0
def initialize_pdf_miner(fh):
    """Build the pdfminer objects needed for layout analysis of a PDF.

    :param fh: file object holding the PDF (given to ``PDFParser``).
    :returns: ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` renders pages into it.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument(parser)
    # Connect the parser and document objects.
    parser.set_document(doc)
    # NOTE: doc.is_extractable is deliberately not enforced here; documents
    # that disallow text extraction are still processed.

    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis (word_margin is 0.0, the value that was
    # previously applied by overriding the constructor argument).
    laparams = LAParams(line_overlap=0.3, char_margin=1.0, line_margin=0.5, word_margin=0.0,
                        boxes_flow=0.1, detect_vertical=False, all_texts=False)
    # Create a PDF page aggregator object and the interpreter feeding it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Example #7
0
def pdf2xml(infile):
    '''
    Convert an open PDF file handle to pdfminer's XML output and return it
    as a string with every newline character removed.

    ``infile`` is closed before the function returns.  The module-level flag
    ``page_api`` (defined outside this function) selects between iterating
    ``PDFPage.get_pages`` and calling the ``process_pdf`` helper.

    NOTE(review): ``device.close()`` is never called here; pdfminer's
    XMLConverter appears to write its closing tag on close - confirm whether
    callers rely on the current output.
    '''

    xml_buffer = StringIO()

    # Character margin determined empirically.
    layout = LAParams()
    layout.char_margin = 0.4

    # Same wiring as pdfminer's pdf2txt.py.
    resources = PDFResourceManager(caching=False)
    converter = XMLConverter(resources, xml_buffer, codec='utf-8', laparams=layout)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if not page_api:
        process_pdf(resources, converter, infile, set())
    else:
        for page in PDFPage.get_pages(infile, set()):
            page_interpreter.process_page(page)

    infile.close()
    xml_text = xml_buffer.getvalue()
    return xml_text.replace("\n", "")
Example #8
0
def pdf2str(path):
    """Return the text extracted from the PDF at ``path``.

    Layout analysis covers all text and detects vertical writing.

    :param path: path of the PDF file.
    :returns: the text collected by a ``TextConverter`` using the utf-8 codec.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()

    # Set parameters
    laparams.all_texts = True
    laparams.detect_vertical = True

    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # open() instead of the Python 2-only file() builtin; the
            # with-block closes the PDF even when a page fails to parse.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
Example #9
0
    def parse(self, path):
        """Extract the text of the PDF at ``path`` into a ``Sample``.

        :param path: path of a PDF file; directories are not supported.
        :returns: ``Sample(path, None, text)`` with the extracted text.
        :raises NotImplementedError: if ``path`` is a directory.
        """
        # Directory
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        try:
            rsrc = PDFResourceManager()
            laparams = LAParams()
            laparams.char_margin = 2.0
            laparams.line_margin = 2.0
            laparams.word_margin = 0.0
            device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)
            # File: PDFs are binary, so open in 'rb'; the with-block makes
            # sure the handle is closed again.
            with open(path, 'rb') as fp:
                doc = PDFDocument()
                parser = PDFParser(fp)
                parser.set_document(doc)
                doc.set_parser(parser)
                doc.initialize()
                interpreter = PDFPageInterpreter(rsrc, device)
                for page in doc.get_pages():
                    interpreter.process_page(page)
            device.close()
            return Sample(path, None, out.getvalue())
        finally:
            out.close()
Example #10
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Extract text from an open PDF page by page and hand it to ``parse_page``.

        The handler's header is printed before the first page and its footer
        after the last one.  Any ordinary exception is reported through
        ``self.handler.print_error`` instead of propagating;
        ``KeyboardInterrupt`` and ``SystemExit`` are re-raised.

        :param f: file object holding the PDF data.
        :param fpath: path reported to the handler and to ``parse_page``.
        """
        try:
            layout_params = LAParams()
            layout_params.all_texts = True
            resource_manager = PDFResourceManager()

            if self.dedup:
                # Every document starts with an empty de-duplication store.
                self.dedup_store = set()

            self.handler.print_header(fpath)

            pages = PDFPage.get_pages(f, set(), check_extractable=True)
            # Page numbers passed to parse_page start at 1.
            for page_num, page in enumerate(pages, 1):
                buf = StringIO()
                converter = TextConverter(resource_manager, buf, laparams=layout_params)
                PDFPageInterpreter(resource_manager, converter).process_page(page)
                page_text = buf.getvalue()
                buf.close()

                self.parse_page(fpath, page_text, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
Example #11
0
def _pdf_to_text(path):
    """Extract the text of the PDF at ``path`` as ASCII.

    :param path: path of the PDF file.
    :returns: ``(text, True)`` on success, or ``("", False)`` when anything
        goes wrong while reading or converting the file.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)

        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()

        # Drop anything that is not plain ASCII.
        txt = retstr.getvalue().encode('ascii', 'ignore')
        retstr.close()
        retVal = (txt, True)
    except Exception:
        # Best effort: a PDF that cannot be processed yields an empty result.
        retVal = ("", False)
    # The result was previously computed but never handed back to the caller.
    return retVal
Example #12
0
def initialize_pdf_miner(fh, password = None):
    """Build the pdfminer objects needed for layout analysis of a PDF.

    :param fh: file object holding the PDF (given to ``PDFParser``).
    :param password: optional document password; ``None`` means no password.
    :returns: ``(doc, interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` and ``interpreter`` renders pages into it.
    :raises ValueError: if the document does not allow text extraction.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fh)
    # Create a PDF document object that stores the document structure.
    # pdfminer's own "no password" value is the empty string, so None is
    # translated instead of being forwarded as-is.
    doc = PDFDocument(parser, "" if password is None else password)

    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise ValueError("PDFDocument is_extractable was False.")
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter feeding it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return doc, interpreter, device
Example #13
0
    def __init__(self, line_overlap=0.5, header_perc=7.5, footer_perc=7.5):
        """Set up the layout parameters plus header and footer percentages.

        ``line_overlap`` is forwarded to ``LAParams`` not only as
        ``line_overlap`` but also as ``char_margin``, ``line_margin``,
        ``word_margin`` and ``boxes_flow``.
        NOTE(review): one value driving all five settings looks unusual -
        confirm that this is intended.

        :param line_overlap: value used for the five settings listed above.
        :param header_perc: header size as a percentage of the page.
        :param footer_perc: footer size as a percentage of the page.
        """
        shared_margin = line_overlap
        LAParams.__init__(
            self,
            line_overlap=shared_margin,
            char_margin=shared_margin,
            line_margin=shared_margin,
            word_margin=shared_margin,
            boxes_flow=shared_margin,
            detect_vertical=False,
            all_texts=False
        )

        # Header / footer size, each as a percentage of the page.
        self.header_perc = header_perc
        self.footer_perc = footer_perc
Example #14
0
    def parse_pdf(self, test_parse=False):
        """
            Parse a PDF and store its text lines in ``self.file_array``.

            The PDF at ``self.source_file`` is converted into the text file
            ``self.text_file``, which is then read back line by line.  The
            last line is dropped and, unless ``test_parse`` is true, the text
            file is removed afterwards.

            Raises DTPOFileError when either file cannot be opened or the
            conversion fails.
        """

        dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

        # input options
        pagenos = set()
        maxpages = 0
        # output option
        codec = "utf-8"
        caching = True
        laparams = LAParams()
        laparams.char_margin = 8.0
        laparams.word_margin = 2.0

        rsrcmgr = PDFResourceManager(caching=caching)

        try:
            outfp = open(self.text_file, "w")
        except IOError as io_error:
            raise DTPOFileError(self.text_file, 0, str(io_error))

        # The with-blocks release both handles even when opening the source
        # or converting it fails.
        with outfp:
            try:
                fp = open(self.source_file, "rb")
            except IOError as io_error:
                raise DTPOFileError(self.source_file, 0, str(io_error))

            with fp:
                try:
                    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
                    try:
                        process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages,
                                    caching=caching, check_extractable=True)
                    finally:
                        device.close()
                except PDFException as pdf_error:
                    message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
                    raise DTPOFileError(self.source_file, 0, message)
                except Exception as exception:
                    message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
                    raise DTPOFileError(self.source_file, 0, message)

        #   Got the PDF converted = now get it into an array
        with open(self.text_file) as text_fp:
            self.file_array = list(text_fp)

        #   Remove the last entry - it's always '\x0c'
        if self.file_array:
            del self.file_array[-1]

        #   Remove the outfile
        if not test_parse:
            os.remove(self.text_file)
Example #15
0
def to_text(path):
    """Extract the text of a PDF with `pdfminer`.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        extracted text after a final ``.encode('utf-8')``

    """

    try:
        # python 2: use the StringIO module and switch the interpreter's
        # default encoding to UTF-8.
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    resource_manager = PDFResourceManager()
    buf = StringIO()
    layout_params = LAParams()
    layout_params.all_texts = True
    converter = TextConverter(resource_manager, buf, codec='utf-8', laparams=layout_params)

    with open(path, 'rb') as pdf_file:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        # No page filter, no page limit, empty password.
        for page in PDFPage.get_pages(
            pdf_file,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        ):
            page_interpreter.process_page(page)

    converter.close()
    extracted = buf.getvalue()
    buf.close()
    return extracted.encode('utf-8')
Example #16
0
 def to_text(self):
     """Return the text of ``self._doc``, decoded as UTF-8 with undecodable bytes ignored."""
     resources = PDFResourceManager()
     buf = StringIO()

     # Analyse all text, including vertical writing.
     params = LAParams()
     params.detect_vertical = True
     params.all_texts = True
     params.word_margin = 0.4

     converter = TextConverter(resources, buf, laparams=params)
     page_interpreter = PDFPageInterpreter(resources, converter)
     for page in self._doc.get_pages():
         page_interpreter.process_page(page)

     raw_text = buf.getvalue()
     return raw_text.decode('utf-8', 'ignore')
Example #17
0
    def get_text(self):
        """Returns all text content from the PDF as plain text.

        If pdfminer finds no text, the pages are rendered with Ghostscript
        and run through Tesseract OCR instead.  The result is also stored in
        ``self.text``.  Errors raised while processing the PDF are logged and
        re-raised.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        codec = 'utf-8'
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

        try:
            # Opening inside a with-block means a failed open can no longer
            # cause a NameError in the cleanup code and mask the real error.
            with open(self.path, 'rb') as file_pointer:
                process_pdf(rsrcmgr, device, file_pointer)
        except Exception as e:
            logging.error("Error processing PDF: %s" % e)
            raise
        finally:
            device.close()

        text = retstr.getvalue()
        retstr.close()
        if (text is None) or (text.strip() == ""):
            logging.info("No text found in PDF. Attempting OCR. This will take a while.")
            text = self._get_text_via_ocr()
        self.text = text
        return text

    def _get_text_via_ocr(self):
        """Render the PDF to PNG pages with Ghostscript and OCR them with Tesseract."""
        import os
        import subprocess
        import Image
        import pytesseract

        # First, convert to images.  -dBATCH makes gs exit after the last
        # page instead of waiting at its interactive prompt.
        arglist = ["gs",
                   "-dNOPAUSE",
                   "-dBATCH",
                   "-sOutputFile=temp/page%03d.png",
                   "-sDEVICE=png16m",
                   "-r72",
                   self.path]
        try:
            # subprocess.STDOUT is only valid for stderr, so gs output is
            # sent to the null device instead.
            with open(os.devnull, 'w') as devnull:
                subprocess.call(args=arglist,
                                stdout=devnull,
                                stderr=subprocess.STDOUT)
        except OSError:
            logging.error("Failed to run GhostScript (using `gs`)")

        # Do OCR.  subprocess.call() has already waited for gs to finish.
        # Sorting keeps the zero-padded page files in page order.
        page_texts = []
        for file_ in sorted(os.listdir("temp")):
            if file_.endswith(".png"):
                page_texts.append(
                    pytesseract.image_to_string(Image.open("temp/" + file_), lang="swe"))
                os.unlink("temp/" + file_)
        return "".join(page_texts)
Example #18
0
def GetScript(filename):
    """Read the PDF `filename` and feed its text lines to ParseLine.

    Calls ResetGlobals() and stores `filename` in the module-level
    `scriptName`.  When the document does not allow text extraction,
    "---Not translatable---" is printed and the function returns early.
    Otherwise every page except the first is laid out with pdfminer, its
    non-empty text items are wrapped in TextBlock objects, ordered by
    descending `y`, and passed to ParseLine(text, x).

    Returns None.
    """
    global scriptName
    ResetGlobals()
    scriptName = filename
    password = ""
    # Open a PDF file.
    fp = open(filename, 'rb')
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    # Supply the password for initialization.
    document = PDFDocument(parser, password)
    # Check if the document allows text extraction. If not, abort.
    if not document.is_extractable:
        print "---Not translatable---"
        return
        #raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object.
    device = PDFDevice(rsrcmgr)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Process each page contained in the document.
    # NOTE(review): nothing is read back from this first pass; `device` and
    # `interpreter` are replaced below before any result is used.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    
    # Set parameters for analysis.
    laparams = LAParams()
    laparams.boxes_flow = 2
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for pgnum,page in enumerate(PDFPage.create_pages(document)):
        # The first page (index 0) is skipped.
        if pgnum == 0:
            continue
        interpreter.process_page(page)
        # receive the LTPage object for the page.
        layout = device.get_result()
        text = []
        # NOTE: `page` is reused here for the individual layout items,
        # shadowing the outer loop variable.
        for page in layout:
            try:
                if page.get_text().strip():
                    text.append(TextBlock(page.x0,page.y1,page.get_text().strip()))
            except:
                # Any error for this item (e.g. no get_text()) is ignored;
                # the assignment below is only a placeholder statement.
                temp=5  
            # Progress indicator: one dot per layout item.
            print ".",
        # Order the blocks by descending y (presumably top of the page
        # first - TODO confirm against TextBlock).
        text.sort(key = lambda row:(-row.y))
        # Parse all of the "line" objects in each page
        for line in text:
            ParseLine(line.text, line.x)
Example #19
0
    def _pdf2text(self,fp):
        """Extract the ASCII text and the creation-date string of an open PDF.

        :param fp: open file object holding the PDF.
        :returns: ``(created, text, True)`` on success, where ``created`` is
            the ``CreationDate`` entry with its first two and last seven
            characters removed (``""`` if it cannot be decoded), or
            ``(None, "", False)`` when processing fails.  Failures are
            reported through ``self._report``.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            laparams = LAParams()
            laparams.all_texts = True
            device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)

            process_pdf(rsrcmgr, device, fp)
            device.close()

            # fix the non-utf8 string ...
            result = retstr.getvalue()
            txt = result.encode('ascii','ignore')

            # TODO: clean this up, I feel like I'm doing the conversion twice ...
            # http://stackoverflow.com/a/16503222/2154772
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            #
            # CreationDate isn't always the same type as it comes back from
            # the PDFParser: it is either a string or an object that has to
            # be resolved first.
            #
            created = ""
            try:
                creation_date = doc.info[0]['CreationDate']
                if not isinstance(creation_date, basestring):
                    creation_date = creation_date.resolve()
                # Assigning to ``created`` in both cases fixes the earlier
                # misspelt name that discarded the resolved value.
                created = creation_date[2:-7]
            except Exception:
                self._report("CreationDate field could not be decoded within PDF, setting to ''")
            created = created.encode('ascii','ignore')
            retVal = (created,txt,True)
            retstr.close()
        except Exception as e:
            self._report("Error: \n\t%s" % str(e))
            retVal = (None,"",False)
        # The result was previously computed but never handed back to the caller.
        return retVal
Example #20
0
def extractrefs(infile, outfile):
    """Run the PDF ``infile`` through a ``RefsExtractor`` writing to ``outfile``.

    :param infile: path of the PDF to read.
    :param outfile: path of the text file handed to the extractor.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    laparams.line_margin = 1.4

    # Both files are closed even if a page fails to process.
    with open(infile, 'rb') as infp, open(outfile, 'w') as outfp:
        device = RefsExtractor(rsrcmgr, outfp, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(infp, set(),
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
Example #21
0
def initialize_pdf_interpreter():
    """Create a page interpreter wired to a layout-aggregating device.

    :returns: ``(interpreter, device)`` where ``device`` is a
        ``PDFPageAggregator`` configured with ``word_margin = 0.0`` and
        ``interpreter`` renders pages into it.
    """
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    # Set parameters for analysis.
    laparams = LAParams()
    laparams.word_margin = 0.0
    # Create a PDF page aggregator object and the interpreter feeding it.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    return interpreter, device
Example #22
0
    def _pdf2text(self,fp):
        """Extract the ASCII text and the creation timestamp of an open PDF.

        :param fp: open file object holding the PDF.
        :returns: ``(created, text, True)`` on success, where ``created`` is a
            ``datetime`` parsed from the document's ``CreationDate`` entry, or
            ``(None, "", False)`` when anything fails (the error is reported).
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            laparams = LAParams()
            laparams.all_texts = True
            device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)

            process_pdf(rsrcmgr, device, fp)
            device.close()

            # fix the non-utf8 string ...
            result = retstr.getvalue()
            txt = result.encode('ascii','ignore')

            # TODO: clean this up, I feel like I'm doing the conversion twice ...
            # http://stackoverflow.com/a/16503222/2154772
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            #
            # CreationDate isn't always the same type as it comes back from
            # the PDFParser: it is either a string or an object that has to
            # be resolved first.
            #
            creation_date = doc.info[0]['CreationDate']
            if not isinstance(creation_date, basestring):
                creation_date = creation_date.resolve()
            # Strip the first two and the last seven characters before parsing.
            datestring = creation_date[2:-7]
            ts = strptime(datestring, "%Y%m%d%H%M%S")
            created = datetime.fromtimestamp(mktime(ts))

            retVal = (created,txt,True)
            retstr.close()
        except Exception as e:
            # NOTE(review): the reporting helper is spelled ``_reportstr``
            # here - confirm that a method of that name exists on the class.
            self._reportstr("Error: \n\t%s" %str(e))
            retVal = (None,"",False)
        # The result was previously computed but never handed back to the caller.
        return retVal
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    """Return the text of the given PDF pages.

    :param pdfPages: iterable of pdfminer pages.  When it is ``None`` and
        ``fileDescriptor`` is given, the pages come from
        ``getPdfPages(fileDescriptor)``.
    :param fileDescriptor: optional source used only to obtain the pages.
    :returns: the text collected by a ``TextConverter``.
    """
    if fileDescriptor is not None and pdfPages is None:
        pdfPages = getPdfPages(fileDescriptor)

    manager = PDFResourceManager()
    # Analyse all text, including vertical writing.
    layout = LAParams()
    layout.all_texts = True
    layout.detect_vertical = True

    try:
        textBuffer = StringIO.StringIO()
        converter = TextConverter(manager, textBuffer, laparams=layout)
        pageInterpreter = PDFPageInterpreter(manager, converter)
        for page in pdfPages:
            pageInterpreter.process_page(page)
        extracted = textBuffer.getvalue()
        return extracted
    finally:
        converter.close()
        textBuffer.close()
Example #24
0
def pdf(f):
    """Return the text of the PDF at path ``f``.

    :param f: path of the PDF file.
    :returns: the text collected by a ``TextConverter`` using the utf-8 codec.
    """
    rsrcmgr = PDFResourceManager()
    retstr = cStringIO.StringIO()

    laparams = LAParams()
    laparams.all_texts = True

    device = TextConverter(
        rsrcmgr, retstr, codec='utf-8', laparams=laparams
    )
    try:
        try:
            # open() instead of the Python 2-only file() builtin; the
            # with-block closes the PDF even when processing fails.
            with open(f, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
Example #25
0
def output_pdf_to_table(path):
    """Lay out the pages of the PDF at ``path`` and pass each layout to ``getRows``.

    The line margin comes from the module-level ``line_margin_threshold`` and
    the page limit from the module-level ``pages_to_view``.

    :param path: path of the PDF file.
    """
    # The with-block closes the PDF once all pages have been handled, or as
    # soon as processing fails.
    with open(path, "rb") as fp:
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.line_margin = line_margin_threshold
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        pages = PDFPage.get_pages(fp, set(), maxpages=pages_to_view,
                                  password="", caching=True,
                                  check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
            layout = device.get_result()
            getRows(layout)
Example #26
0
def readpdf(pdfFile):
    """Read the PDF at ``pdfFile`` into a DataFrame of text rows.

    :param pdfFile: path of the PDF file.
    :returns: a ``pandas.DataFrame`` built from the rows collected by
        ``PDFPageDetailedAggregator``, with the columns
        ``Page, x, y, c1, c2, String``.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    # Too small and it splits the description, too big and
    # Quantity-Unit-Part number are not separated: 1.1 seems to work.
    laparams.char_margin = 1.1
    laparams.line_margin = 0.8
    device = PDFPageDetailedAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF once every page has been processed.
    with open(pdfFile, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)

        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # receive the LTPage object for this page
            device.get_result()

    df = pd.DataFrame(device.rows, columns=['Page', 'x', 'y', 'c1','c2','String'])
    return df
Example #27
0
def to_text(path):
    """Return the text extracted from the PDF at ``path``.

    The layout parameters are printed before extraction starts.

    :param path: path of the PDF file.
    :returns: the text collected by a ``TextConverter`` using the utf-8 codec.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    # A parenthesised single argument prints identically whether ``print``
    # is a statement or a function.
    print(laparams)
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    try:
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # open() instead of the Python 2-only file() builtin; the
            # with-block closes the PDF even when a page fails to parse.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
Example #28
0
def pdfcn():
    """Convert every not-yet-converted PDF of the data set to a text file.

    A PDF in the source folder counts as converted when the output folder
    already holds a file with the same name minus its last four characters.
    PDFs that fail to convert are skipped.
    """
    laparams = LAParams()
    laparams.all_texts = True
    # Resume where a previous run crashed: look up which PDFs still need
    # to be converted.
    path2 = r'D:\dataset\acl10_12_txt'
    filelist2 = os.listdir(path2)
    path3 = r'D:\dataset\acl10_12s'
    filelist3 = os.listdir(path3)

    # Names without their four-character extension.
    filelist4 = [i[:-4] for i in filelist2]
    print(filelist4)
    converted = set(filelist4)
    finallist = [filename[:-4] for filename in filelist3
                 if filename[:-4] not in converted]

    for pdf in finallist:
        outfile = "D:\\dataset\\acl10_12_txt\\"+pdf+".txt"
        fname = path3+'\\'+pdf+'.pdf'
        try:
            rsrc = PDFResourceManager()
            # Output file and converter are now released per PDF instead of
            # only once after the whole loop.
            with open(outfile, 'w') as outfp:
                device = TextConverter(rsrc, outfp, codec='utf-8', laparams=laparams)
                try:
                    with open(fname, 'rb') as fp:
                        process_pdf(rsrc, device, fp, None, maxpages=0, password='')
                    print('%s finishing ' % pdf)
                finally:
                    device.close()
        except Exception:
            # Best effort: skip PDFs that cannot be converted.
            # NOTE(review): the (possibly partial) .txt stays behind, so the
            # next run treats this PDF as converted - confirm this is wanted.
            continue
Example #29
0
def convert_pdf_to_txt(path, txtname, buf=True):
    """Extract the text of the PDF at ``path``.

    :param path: path of the PDF file.
    :param txtname: path of the text file to write when ``buf`` is false.
    :param buf: when true, the text is collected in memory, every match of
        the module-level ``space`` pattern is removed and the result is
        printed; when false, the text is written to ``txtname`` instead.
    """
    rsrcmgr = PDFResourceManager()
    if buf:
        outfp = StringIO()
    else:
        outfp = open(txtname, 'w')
    try:
        laparams = LAParams()
        laparams.detect_vertical = True
        # No codec is passed to the converter.
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The with-block closes the PDF even when a page fails to parse.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp):
                    interpreter.process_page(page)
        finally:
            device.close()
        if buf:
            text = re.sub(space, "", outfp.getvalue())
            print(text)
    finally:
        outfp.close()
Example #30
0
    def read_file(self):
        """Extract the text of the PDF at ``self.path`` into ``self.content``.

        The text of every top-level ``LTTextBox`` / ``LTTextLine`` layout
        object is collected and joined with single spaces.
        """
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 0.1
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        extracted_text = []

        # The file has to stay open for as long as pdfminer reads from it,
        # so all document handling happens inside the with-block.
        with open(self.path, 'rb') as f:
            parser = PDFParser(f)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize('')

            for page in doc.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                extracted_text.extend(
                    lt_obj.get_text() for lt_obj in layout
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)))
        self.content = ' '.join(extracted_text)
Example #31
0
def get_text_from_pdf(pdfname, caption, skip_header, skip_footer):
    """Extract the text of a PDF and re-flow it into sentences.

    Each page is converted to text separately. Within a page, NUL characters
    are removed, runs of spaces are collapsed, and blank lines and lines for
    which ``is_float`` is true are dropped. The remaining lines are then
    joined according to how they end (see the comments in the body).

    :param pdfname: path of the PDF file to read.
    :param caption: a line is treated as a heading only if it has fewer than
        this many whitespace-separated tokens.
    :param skip_header: when true, drop the first kept line of every page.
    :param skip_footer: when true, drop the last kept line of every page.
    :return: the re-flowed text, using ``"\\r\\n"`` as the line terminator.
    """
    texts = []

    # Read the PDF one page at a time.
    with open(pdfname, 'rb') as fp:
        for page in tqdm(
                PDFPage.get_pages(fp,
                                  pagenos=None,
                                  maxpages=0,
                                  password=None,
                                  caching=True,
                                  check_extractable=True)):
            rsrcmgr = PDFResourceManager()
            out_fp = StringIO()
            la_params = LAParams()
            la_params.detect_vertical = True
            device = TextConverter(rsrcmgr,
                                   out_fp,
                                   codec='utf-8',
                                   laparams=la_params)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                texts.append(out_fp.getvalue())
            finally:
                # Release per-page resources even if the page fails.
                device.close()
                out_fp.close()

    # Pieces of the final output; joined once at the end.
    pieces = []

    # Shape the text page by page.
    for text in tqdm(texts):
        new_lines = []
        for line in text.splitlines():
            line = line.replace('\x00', '')  # strip NUL characters
            line = re.sub("[ ]+", " ", line)  # collapse runs of spaces
            line = line.strip()
            if not line:
                continue  # ignore empty lines
            if is_float(line):
                continue  # ignore lines that are only a number
            new_lines.append(line)

        last_index = len(new_lines) - 1
        for index, line in enumerate(new_lines):
            if index == 0 and skip_header:
                continue
            if index == last_index and skip_footer:
                continue

            # A short line that starts with a number and does not end with a
            # full stop is treated as a heading and gets its own line.
            is_heading = (is_float(line.split(".")[0])
                          and len(line.split()) < caption
                          and not line.endswith("."))
            if is_heading or line.endswith((".", ":")):
                # Heading, end of sentence, or a line introducing a formula.
                pieces.append(line + "\r\n")
            elif line.endswith("-"):
                # Hyphenated word continues on the next line: drop the hyphen.
                pieces.append(line[:-1])
            else:
                # Otherwise the line break is a word boundary: insert a space.
                pieces.append(line + " ")

    return "".join(pieces)
Example #32
0
def request_pdf(url, case_id, court_name):
    """Download a PDF, save it, extract its text and save that too.

    The PDF is written under ``Data_Files/PDF_Files`` and the extracted text
    under ``Data_Files/Text_Files`` (both relative to ``module_directory``),
    named ``<court_name>_<slugified case_id>``.

    :param url: URL of the PDF to download.
    :param case_id: case identifier, used in file names and log messages.
    :param court_name: court name, used as the file-name prefix.
    :return: the extracted text, or the string ``"NULL"`` on any failure.
    """
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        response = requests.request("GET", url, verify=False, proxies=proxy_dict)
        if response.status_code != 200:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"

        if response.text is None:
            logging.error("No data for: " + str(case_id))
            return "NULL"

        base_name = court_name + "_" + slugify(case_id)

        pdf_path = module_directory + "/../Data_Files/PDF_Files/" + base_name + ".pdf"
        # Close (and thereby flush) the PDF before it is read back below.
        with open(pdf_path, "wb") as pdf_out:
            pdf_out.write(response.content)

        pdf_manager = PDFResourceManager()
        string_io = StringIO()
        pdf_to_text = TextConverter(pdf_manager, string_io, codec='utf-8', laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
            with open(pdf_path, 'rb') as pdf_in:
                for page in PDFPage.get_pages(pdf_in):
                    interpreter.process_page(page)
            text_data = str(string_io.getvalue())
        finally:
            pdf_to_text.close()
            string_io.close()

        text_path = module_directory + "/../Data_Files/Text_Files/" + base_name + ".txt"
        with open(text_path, "w") as text_out:
            text_out.write(text_data)

        return text_data

    except Exception as e:
        # Pass case_id as a logging argument so a '%' in it cannot break the
        # format string.
        logging.error("Failed to get pdf file for: %s. Error: %s", case_id, e)
        return "NULL"
Example #33
0
def extract_text(my_file):
    """Collect the text boxes of a PDF into a DataFrame with size estimates.

    Every top-level ``LTTextBox`` / ``LTTextLine`` layout object is turned
    into one row. Coordinates and text are parsed out of the object's string
    representation. Headlines are assumed to be large text, so the box height
    divided by the number of lines it holds ('text height') approximates the
    font size.

    :param my_file: path of the PDF file to read.
    :return: a DataFrame with columns 'cords', 'num', 'height', 'text',
        'TL_X', 'TL_Y', 'width', 'newlines' and 'text height', or ``None``
        if anything fails (the failure is logged).
    """
    import logging

    try:
        password = ""
        extracted_text_plus = []
        # Open and read the pdf file in binary mode; closed even on error.
        with open(my_file, "rb") as fp:
            # Create parser object to parse the pdf content
            parser = PDFParser(fp)
            # Store the parsed content in PDFDocument object
            document = PDFDocument(parser, password)
            # Resource manager stores shared resources such as fonts or images
            rsrcmgr = PDFResourceManager()
            # Set parameters for layout analysis
            laparams = LAParams()
            # Page aggregator exposes the LT layout objects of each page
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                # Of the many LT objects, only text boxes and lines matter.
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        extracted_text_plus.append(lt_obj)

        df = pd.DataFrame()
        df['cords'] = 0
        df['num'] = 0
        df['height'] = 0
        df['text'] = ''
        df['TL_X'] = -1
        df['TL_Y'] = -1
        df['width'] = -1
        for n, lt_obj in enumerate(extracted_text_plus):
            # The string form is split on spaces: field 1 holds four
            # comma-separated numbers, fields 2+ hold the quoted text.
            # NOTE(review): presumably "<Class(idx) x0,y0,x1,y1 'text'>" --
            # confirm against the pdfminer version in use.
            parts = str(lt_obj).split(' ')
            cords = parts[1].split(',')
            a, b, c, d = [float(elm) for elm in cords]
            text = ' '.join(parts[2:])
            h = d - b
            w = c - a
            df.loc[n, 'cords'] = ' '.join(cords)
            df.loc[n, 'num'] = n
            df.loc[n, 'height'] = h
            df.loc[n, 'width'] = w
            df.loc[n, 'TL_X'] = a
            df.loc[n, 'TL_Y'] = b
            df.loc[n, 'text'] = text
        df['newlines'] = 0
        for x in range(0, len(df)):
            # Counts the two-character sequence backslash + 'n', i.e. escaped
            # newlines as they appear in the string form parsed above.
            df.loc[x, 'newlines'] = df.loc[x, 'text'].count('\\n')
        df['text height'] = df['height'] / df['newlines']
        return df
    except Exception:
        # Still best-effort (callers get None), but the failure is no longer
        # silent and KeyboardInterrupt/SystemExit are no longer swallowed.
        logging.exception("extract_text failed for %s", my_file)
        return None
Example #34
0
def _iter_pdf_page_texts(pdf_stream):
    '''
    Yield the extracted text of each page read from an open binary stream.

    :param pdf_stream: binary file-like object positioned at the PDF data
    :return: iterator of string, one per page
    '''
    for page in PDFPage.get_pages(
            pdf_stream,
            caching=True,
            check_extractable=True
    ):
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(
            resource_manager,
            fake_file_handle,
            codec='utf-8',
            laparams=LAParams()
        )
        try:
            page_interpreter = PDFPageInterpreter(
                resource_manager,
                converter
            )
            page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()
        finally:
            # close open handles before yielding, so they are released even
            # if the consumer abandons the generator early
            converter.close()
            fake_file_handle.close()
        yield text


def extract_text_from_pdf(pdf_path):
    '''
    Helper function to extract the plain text from .pdf files

    Iteration stops silently if a PDFSyntaxError is raised while reading.

    :param pdf_path: path to a local PDF file, or an io.BytesIO holding the
        bytes of a (remote) PDF
    :return: iterator of string of extracted text, one per page
    '''
    # https://www.blog.pythonlibrary.org/2018/05/03/exporting-data-from-pdfs-with-python/
    if not isinstance(pdf_path, io.BytesIO):
        # extract text from local pdf file
        with open(pdf_path, 'rb') as fh:
            try:
                for text in _iter_pdf_page_texts(fh):
                    yield text
            except PDFSyntaxError:
                return
    else:
        # extract text from in-memory (remote) pdf file
        try:
            for text in _iter_pdf_page_texts(pdf_path):
                yield text
        except PDFSyntaxError:
            return
Example #35
0
# Script setup: parse the command line, open the requested PDF and build the
# pdfminer objects (resource manager, page aggregator, interpreter).
# NOTE(review): `parser` here is presumably a command-line argument parser
# defined earlier in the script -- confirm.
args = parser.parse_args()

# Open a PDF file.
fp = open(args.filename, 'rb')
# From here on `parser` is rebound to the PDF parser, not the argument parser.
parser = PDFParser(fp)
document = PDFDocument(parser)
# Abort if the document does not allow text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed
rsrcmgr = PDFResourceManager()

# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams(
    line_overlap=0.1,
    char_margin=0.1,
    line_margin=0.5,
    word_margin=0.1,
    boxes_flow=0.5,
)

# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Layout constants used by code outside this fragment.
# NOTE(review): presumably page coordinates and matching tolerances in PDF
# units -- confirm against the code that uses them.
INFO_FIRST_ROW = 720
INFO_SECOND_ROW = 650
FUZZINESS = 14
FUZZINESS_X = FUZZINESS
FUZZINESS_Y = FUZZINESS
Example #36
0
def scientific_analysis(password, path, title, topn):
    """Extract a paper's text from a PDF, split it into sections and upload it.

    The PDF is converted to text, section headings are detected (a line
    longer than three characters, not purely digits, with an empty line on
    either side), and the text between consecutive headings that precede the
    'References' heading is collected. The result is POSTed to the analysis
    service.

    :param password: appended to the service URL path (it is NOT the PDF
        password; the PDF is opened with an empty password).
    :param path: path of the PDF file to read.
    :param title: sent to the service as ``paper_title``.
    :param topn: sent to the service as ``topn``.
    :return: the service's JSON response as a dict on HTTP 200, otherwise
        ``{'error': <status code>}``.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from io import StringIO

    print('Convering pdf to text ...')
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the converter and buffer even if a page fails to process.
        device.close()
        retstr.close()

    # Re-join hyphenated line breaks and normalise curly apostrophes.
    # NOTE(review): as written the last replace() is a no-op; its first
    # argument was presumably the "fl" ligature form -- confirm.
    text = text.replace('-\n', '').replace('’', "'").replace('infl', 'infl')
    lines = text.split('\n')

    # Map line index -> heading text, plus the indices in discovery order.
    lines_section_ids_dict = {}
    lines_section_ids = []
    # NOTE(review): the index runs from 0 over len(lines) - 3 positions, so
    # at i == 0 `lines[i - 1]` wraps around to the last line. Kept as-is;
    # confirm whether enumeration was meant to start at 1.
    for i, _ in enumerate(lines[1:-2]):
        if (len(lines[i - 1]) == 0 and len(lines[i + 1]) == 0
                and len(lines[i]) > 3 and not str(lines[i]).isdigit()):
            lines_section_ids_dict[i] = lines[i]
            lines_section_ids.append(i)

    # Heading text -> line index (a repeated heading keeps its last index).
    section_index_by_title = {
        heading: index for index, heading in lines_section_ids_dict.items()
    }
    # Sections at or after 'References' are ignored; without such a heading
    # everything up to the last line is considered.
    ref_id = section_index_by_title.get('References', len(lines) - 1)

    final_data = {}
    full_text_parts = []
    # Pair each heading with the next one; the last heading has no end marker.
    for position, start in enumerate(lines_section_ids[:-1]):
        if start >= ref_id:
            continue
        end = lines_section_ids[position + 1]
        interval_lines = lines[start + 1:end]
        interval_lines_txt = ' '.join(interval_lines)
        heading = lines_section_ids_dict[start]
        if 'Abbreviations' not in heading and '18 of 36' not in heading:
            full_text_parts.append(interval_lines_txt)
        if interval_lines and len(interval_lines_txt) > 100:
            final_data[heading] = interval_lines_txt

    final_data['paper_title'] = title
    final_data['full_text'] = ''.join(full_text_parts)
    final_data['topn'] = topn
    print('Uploading text ...')
    # NOTE(review): json=json.dumps(...) sends a JSON-encoded *string* (the
    # payload is encoded twice). Kept because the service may rely on it.
    response = requests.post(
        'http://tzagerlib1-env.eba-wjp8tqpj.eu-west-2.elasticbeanstalk.com/scientific_analysis/'
        + password,
        json=json.dumps(final_data))
    if response.status_code == 200:
        return dict(response.json())
    return {'error': response.status_code}
Example #37
0
        elif str == "Backstroke":
            return "BackStroke"
        elif str == "Breaststroke":
            return "BreastStroke"
        elif str == "Individual":
            return "IndividualMedley"
        else:
            return str
    elif ir == "r":
        if str == "Freestyle":
            return "FreeRelay"
        elif str == "Medley":
            return "MedleyRelay"

# Set the layout-analysis parameters; enable detection of vertical writing.
laparams = LAParams(detect_vertical=True)

# Create the resource manager that manages shared resources.
resource_manager = PDFResourceManager()

# Create the PageAggregator object that collects pages.
device = PDFPageAggregator(resource_manager, laparams=laparams)

# Create the Interpreter object.
interpreter = PDFPageInterpreter(resource_manager, device)

# Text file for output: named after the input file given as the first
# command-line argument, with its extension replaced by "_out.txt".
filename = os.path.basename(sys.argv[1])
outputfilename = "output/text/" + os.path.splitext(filename)[0] + "_out.txt"

outputfile = open(outputfilename, 'w')
Example #38
0
    def __init__(
        self,
        file,
        merge_tags=('LTChar', 'LTAnno'),
        round_floats=True,
        round_digits=3,
        input_text_formatter=None,
        normalize_spaces=True,
        resort=True,
        parse_tree_cacher=None,
    ):
        """Open a PDF document and set up (but do not run) layout parsing.

        :param file: an object with a ``read`` attribute, or something
            ``open`` accepts as a path (opened in binary mode).
        :param merge_tags: stored as ``self.merge_tags``.
        :param round_floats: stored as ``self.round_floats``.
        :param round_digits: stored as ``self.round_digits``.
        :param input_text_formatter: callable applied to text; takes
            precedence over ``normalize_spaces`` when truthy.
        :param normalize_spaces: when true and no formatter is given, use a
            formatter that collapses each whitespace run to a single space.
        :param resort: stored as ``self.resort``.
        :param parse_tree_cacher: cache object; it is given the opened file
            via ``set_hash_key``. A ``DummyCache`` is used when omitted.
        :raises TypeError: if ``open(file, 'rb')`` raises TypeError.
        """
        # Keep the caller's options.
        self.merge_tags = merge_tags
        self.round_floats = round_floats
        self.round_digits = round_digits
        self.resort = resort

        # Pick the text formatter: explicit one, whitespace collapser, or none.
        if input_text_formatter:
            self.input_text_formatter = input_text_formatter
        elif not normalize_spaces:
            self.input_text_formatter = None
        else:
            whitespace_run = re.compile(r'\s+')

            def collapse_whitespace(s):
                return re.sub(whitespace_run, ' ', s)

            self.input_text_formatter = collapse_whitespace

        # Anything without a ``read`` attribute is treated as a path.
        if not hasattr(file, 'read'):
            try:
                file = open(file, 'rb')
            except TypeError:
                raise TypeError("File must be file object or filepath string.")

        # Build the document; the wiring depends on the pdfminer version.
        parser = PDFParser(file)
        legacy_api = hasattr(QPDFDocument, 'set_parser')
        if legacy_api:
            # pdfminer < 20131022
            doc = QPDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
        else:
            # pdfminer >= 20131022
            doc = QPDFDocument(parser)
            parser.set_document(doc)
        if hasattr(doc, 'initialize'):
            # PDFDocument.initialize() was removed as of pdfminer==20140328
            # and is only called where it still exists.
            doc.initialize()
        self.doc = doc
        self.parser = parser
        self.tree = None
        self.pq = None
        self.file = file

        # Parse-tree cache: the supplied one (keyed on the file) or a dummy.
        if not parse_tree_cacher:
            self._parse_tree_cacher = DummyCache()
        else:
            self._parse_tree_cacher = parse_tree_cacher
            self._parse_tree_cacher.set_hash_key(self.file)

        # Layout parsing machinery.
        resource_manager = PDFResourceManager()
        layout_params = LAParams(all_texts=True, detect_vertical=True)
        self.device = PDFPageAggregator(resource_manager, laparams=layout_params)
        self.interpreter = PDFPageInterpreter(resource_manager, self.device)

        # Caches, filled in later.
        self._pages = []
        self._pages_iter = None
        self._elements = []
Example #39
0
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    :param argv: full argument vector; ``argv[0]`` is the program name, the
        rest are options (see the usage message) followed by input files.
    :return: 100 after printing the usage message (bad option, no input
        file, or unknown output type); otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' disables layout analysis. It is recorded as a flag and applied
    # after the loop, so '-n' followed by a layout option such as '-A' no
    # longer fails with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Infer the output type from the output file's extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created or truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    # ``open`` replaces the Python-2-only ``file`` builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr,
                                device,
                                fp,
                                pagenos,
                                maxpages=maxpages,
                                password=password,
                                caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # Only close what this function opened; sys.stdout stays usable.
        if outfile:
            outfp.close()
    return
Example #40
0
def parse():
    """Scan the PDF at the module-level ``path`` and export matches to CSV.

    Every horizontal text box is inspected; boxes containing '601668' past
    their first character are printed. The accumulated ``pdftext`` is then
    searched for text between two fixed keywords and the matches are passed
    to ``convert2csv``.

    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Open in binary read mode; the file is closed when parsing is done.
    with open(path, 'rb') as fp:
        # Create a PDF parser from the file object.
        parser = PDFParser(fp)
        # Create a PDF document.
        doc = PDFDocument()
        # Connect the parser and the document object.
        parser.set_document(doc)
        doc.set_parser(parser)

        # Initialise the document (no password supplied).
        doc.initialize()

        # Abort if the document does not allow text extraction.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create the PDF resource manager for shared resources.
        rsrcmgr = PDFResourceManager()
        # Create the PDF device object.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create the PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pdftext = ''
        pagei = 1
        # Process one page per iteration, printing a running page number.
        for page_number, page in enumerate(doc.get_pages(), 1):
            print(page_number)
            interpreter.process_page(page)
            # Receive the LTPage object for this page; it holds the layout
            # objects parsed from the page.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    box_text = x.get_text()
                    # NOTE(review): `> 0` skips a match at index 0 -- confirm
                    # whether `>= 0` was intended.
                    if box_text.find('601668') > 0:
                        print(box_text)
                        # Stops scanning this page only; later pages are
                        # still processed.
                        break
            # NOTE(review): only a newline is appended per page, so `pdftext`
            # never contains page text and the search below finds nothing.
            # The statement that appended the box text was commented out in
            # the original; confirm the intent.
            pdftext = pdftext + '\n'
        # NOTE(review): incremented once, after the loop, so this is always 2.
        pagei = pagei + 1
    print(pagei)
    start_keyword='重仓线'
    end_keyword='1,建仓线是指值得买入的价位,这个价位是相对低位,不存在追高风险。'
    pat = re.compile(start_keyword + '(.*?)' + end_keyword, re.S)
    result = pat.findall(pdftext)

    print('result',result)
    filename=r'E:\scrapy\json\tushare.csv'
    convert2csv(result,filename)
Example #41
0
# The PDF document object.
doc = PDFDocument()

# Link the parser and the document.
# NOTE(review): `parser` is defined before this fragment -- presumably a
# PDFParser over the opened file; confirm.
parser.set_document(doc)
doc.set_parser(parser)

# Initialise the document.
doc.initialize('')
# No password, so an empty string is passed.

# Create the PDF resource manager.
resource = PDFResourceManager()

# Layout-analysis parameters.
laparam = LAParams()

# Create an aggregator.
device = PDFPageAggregator(resource, laparams=laparam)

# Create the page interpreter.
interpreter = PDFPageInterpreter(resource, device)

# Read the content through the document object.
for page in doc.get_pages():
    # Process the page with the page interpreter.
    interpreter.process_page(page)

    # Fetch the page's layout from the aggregator.
    layout = device.get_result()
Example #42
0
def main(argv):
    """Command-line entry point: convert PDF files to text, XML, HTML or tags.

    :param argv: full argument vector; ``argv[0]`` is the program name, the
        rest are options (see the usage message) followed by input files.
    :return: 100 after printing the usage message (bad option, no input
        file, or unknown output type); otherwise None.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
               ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
               ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
               ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
               ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' disables layout analysis. It is recorded as a flag and applied
    # after the loop, so '-n' followed by a layout option such as '-A' no
    # longer fails with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Infer the output type from the output file's extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created or truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()

    outfp = open(outfile, 'w', encoding=encoding) if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        # Apply the extra rotation requested with -R.
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close what this function opened; sys.stdout stays usable.
        if outfile:
            outfp.close()
    return
Example #43
0
def get_text_and_coordinates(pdf_path):
    """Extract room numbers and their bounding boxes from a floor-plan PDF.

    The expected leading digit(s) of room numbers are derived from the file
    name: the first character after the last '-' is read as an integer and
    decremented by one. Only the first page of the document is processed.

    :param pdf_path: path of the PDF; the part of the file name after the
        last '-' must start with a digit.
    :return: a DataFrame with columns x0, y0, x1, y1, width, height, text
        for the first page, or ``None`` if the document has no pages.
    :raises ValueError: if no digit follows the last '-' in the file name.
    :raises PDFTextExtractionNotAllowed: if the document forbids extraction.
    """
    # Extract the room prefix from level in the pdf_path
    room_prefix = int(pdf_path.split(os.sep)[-1].split('-')[-1][:1]) - 1
    prefix = str(room_prefix)
    columns = ('x0', 'y0', 'x1', 'y1', 'width', 'height', 'text')

    def parse_obj(lt_objects, table):
        """Append every matching text box under ``lt_objects`` to ``table``."""
        for obj in lt_objects:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                # Basic filtering: keep digits only.
                text = re.sub('[^0-9]', '', obj.get_text())

                if not text.startswith(prefix):
                    continue  # Noise: cannot be a room number on this floor

                text_len = len(text)
                if text_len == 5:
                    text = text[:3] + '.' + text[3:]
                elif text_len > 5 or text_len < 3:
                    continue  # Currently just ignoring the few problematic rooms

                # (x0, y0) = Bottom left corner, (x1, y1) = Top right corner
                x0, y0, x1, y1 = obj.bbox
                table['x0'].append(x0)
                table['y0'].append(y0)
                table['x1'].append(x1)
                table['y1'].append(y1)
                table['width'].append(x1 - x0)
                table['height'].append(y1 - y0)
                table['text'].append(text)

            # If it's a container, recurse into the SAME table so that text
            # nested inside figures is kept rather than discarded.
            elif isinstance(obj, pdfminer.layout.LTFigure):
                parse_obj(obj._objs, table)

    # The file is closed on every exit path, including the early return.
    with open(pdf_path, 'rb') as fp:
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)

        # Create a PDF document object that stores the document structure.
        document = PDFDocument(parser)

        # Check if the document allows text extraction. If not, abort.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed

        # Create a PDF resource manager object that stores shared resources.
        resource_manager = PDFResourceManager()

        # Set parameters for layout analysis.
        la_params = LAParams()

        # Create a PDF page aggregator object.
        device = PDFPageAggregator(resource_manager, laparams=la_params)

        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(resource_manager, device)

        for page in PDFPage.create_pages(document):
            # Read the page into a layout object.
            interpreter.process_page(page)
            layout = device.get_result()
            table = {column: [] for column in columns}
            parse_obj(layout._objs, table)
            # NOTE(review): returns after the first page, as the original
            # did -- presumably each plan is a single page; confirm.
            return pd.DataFrame.from_dict(table)
    return None
def processAddendaPdf(absDocUrl):
    """Download an addenda PDF and extract its revision from the history table.

    Text lines are scanned for one matching 'HISTORY OF REVISIONS'; after it,
    the leading integers of following lines are collected until two are
    found. The two numbers are sorted and joined with '.'.

    :param absDocUrl: absolute URL of the PDF.
    :return: ``{'revision': <value>}`` where the value is the revision
        string, or an 'ERROR while parsing the PDF: ...' message on failure.
    """
    print('Parsing addenda pdf %s ...' % absDocUrl)
    pdfbin = urllib.urlopen(absDocUrl).read()
    cin = StringIO.StringIO()
    cin.write(pdfbin)
    cin.seek(0)
    parser = PDFParser(cin)
    doc = PDFDocument()
    parser.set_document(doc)
    try:
        doc.set_parser(parser)

        doc.initialize()

        # Explicit check instead of ``assert``, which is stripped under -O.
        if not doc.is_extractable:
            raise ValueError('PDF does not allow text extraction')

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
        revHistFound = False
        revision = []
        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()
            xlines, ylines, tlines = extractLinesText(layout)
            for tline in tlines:
                lineText = tline.get_text()
                if not revHistFound:
                    if re.match('.*HISTORY OF REVISIONS.*', lineText):
                        revHistFound = True
                        print('Revision History found')
                else:
                    match = re.match(r'(?P<version_info>\d+)\s*', lineText)
                    if match:
                        revision.append(match.group('version_info'))
                        if len(revision) == 2:
                            break
            # NOTE(review): stops at the first page that yields any number,
            # even if only one was found there -- confirm that both numbers
            # are always on the same page.
            if len(revision) > 0:
                break

        if len(revision) < 2:
            raise ValueError('Could not find revision info')
        revision = '.'.join(sorted(revision))

    except Exception as e:
        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print('ERROR: %s' % str(e))
        revision = 'ERROR while parsing the PDF: %s' % str(e)

    print('Revision: %s' % revision)
    return {'revision': revision}
Example #45
0
def parse():
    """Rebuild the text of each PDF page as a grid of rows and save it to Excel.

    The PDF is read from the free variable ``path`` (not defined in this
    block). For every page, the characters of each text line are joined into
    strings, each tagged with a four-number ``location`` taken from the
    characters' bounding boxes. The strings are grouped into rows keyed by
    ``location[1]`` using the helpers ``is_not_in``, ``insert_into_page_rows``,
    ``get_page_rows_loc`` and ``insert_into_page_container`` (defined
    elsewhere; their exact ordering rules are not visible here).

    After a page is collected, its last row and its leading rows are
    re-aligned, every cell is printed, and the cells are written to the
    active worksheet below the rows of the previous pages. The workbook is
    finally saved to a hard-coded Windows path.

    NOTE(review): ``fp`` is never closed, and the re-alignment steps index
    ``page_container`` unconditionally, so a page with only a few rows looks
    like it would raise IndexError -- confirm against real inputs.
    """
    fp = open(path, 'rb')  # open in binary read mode
    # Create a PDF parser from the file object
    praser = PDFParser(fp)
    # Create the PDF document
    doc = PDFDocument(praser)
    # Connect the parser and the document object
    praser.set_document(doc)
    # Create the PDF resource manager that holds shared resources
    rsrcmgr = PDFResourceManager()
    # Create a PDF device object
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Loop over the pages, handling the content of one page at a time

    wb = Workbook()  # create a new Excel workbook
    ws = wb.active

    # Number of rows already written for previous pages
    text_number = 0

    for page in PDFPage.create_pages(doc):  # original note: doc.get_pages() gets the page list
        interpreter.process_page(page)
        # Receive the LTPage object for this page
        layout = device.get_result()
        # layout is an LTPage holding the objects parsed from this page,
        # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, etc.;
        # text is obtained from an object's text.
        # Get the boxes
        page_container = []  # rows of {'value', 'location'} dicts for this page
        page_rows = []  # row position data
        for text_box in layout:
            if (isinstance(text_box, LTTextBox)):
                # Get the lines
                for text_line in text_box:
                    if (isinstance(text_line, LTTextLine)):
                        # Get each character
                        temp = []  # characters collected so far
                        temp_loc = []  # location of the string being built
                        isfirst = True  # whether the next character is the first of the string
                        for text_index in text_line:
                            # For character data, keep extending temp and temp_loc
                            if (isinstance(text_index, LTChar)):
                                temp.append(text_index.get_text())
                                if isfirst == True:
                                    # Seed the location with the first
                                    # character's full bbox.
                                    temp_loc.append(
                                        round(text_index.bbox[0], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[1], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[2], 3))
                                    temp_loc.append(
                                        round(text_index.bbox[3], 3))
                                    isfirst = False
                                # Stretch components 2 and 3 to the latest
                                # character so the location spans the string.
                                temp_loc[2] = round(text_index.bbox[2], 3)
                                temp_loc[3] = round(text_index.bbox[3], 3)
                            # For any other LTText item, store the collected string
                            # in page_container, then reset temp, temp_loc and isfirst
                            elif (isinstance(text_index, LTText)):
                                # NOTE(review): temp_loc is empty here if no
                                # LTChar preceded this item -- confirm that
                                # cannot happen.
                                # If page_rows has no position for this row yet, insert
                                # the data into both page_container and page_rows
                                # if temp_loc[1] not in page_rows:
                                if is_not_in(page_rows, temp_loc[1]):
                                    insert_loc = insert_into_page_rows(
                                        page_rows, temp_loc[1])
                                    page_container.insert(
                                        insert_loc, [{
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        }])
                                    # page_rows.append(temp_loc[1])
                                    # page_container.append([{'value':''.join(temp),'location':temp_loc}])
                                # If the row is already known
                                elif not is_not_in(page_rows, temp_loc[1]):
                                    # loc = page_rows.index(temp_loc[1])
                                    loc = get_page_rows_loc(
                                        page_rows, temp_loc[1])
                                    temp_list = insert_into_page_container(
                                        page_container[loc], {
                                            'value': ''.join(temp),
                                            'location': temp_loc
                                        })
                                    page_container[loc] = temp_list[:]
                                temp = []
                                temp_loc = []
                                isfirst = True
        rows_num = len(page_container)

        # Realign the last row against the second-to-last row
        if len(page_container[rows_num - 1]) != len(
                page_container[rows_num - 2]):
            loc_for_no2 = []
            loc_for_no1 = []
            adjust_for_no1 = []
            temp_array = page_container[rows_num - 1][:]
            # Collect the [location[0], location[2]] span of every cell in
            # both rows.
            for i in page_container[rows_num - 2]:
                loc_for_no2.append([i['location'][0], i['location'][2]])
            for i in page_container[rows_num - 1]:
                loc_for_no1.append([i['location'][0], i['location'][2]])
            # For each last-row cell, record the first cell of the previous
            # row whose span overlaps it.
            for i in range(len(loc_for_no1)):
                for j in range(len(loc_for_no2)):
                    if not (loc_for_no1[i][0] > loc_for_no2[j][1]
                            or loc_for_no1[i][1] < loc_for_no2[j][0]):
                        adjust_for_no1.append(j)
                        break

            # Rebuild the last row with one slot per cell of the previous
            # row; slots without a matching cell become None.
            page_container[rows_num - 1] = []
            for i in range(len(page_container[rows_num - 2])):
                if i in adjust_for_no1:
                    page_container[rows_num - 1].append(
                        temp_array[adjust_for_no1.index(i)])
                else:
                    page_container[rows_num - 1].append(None)

        # Realign the first five rows
        # NOTE(review): the condition compares rows 0-4, but range(6) below
        # also reads row 5 -- confirm which row count is intended.
        if len(page_container[0]) != len(page_container[1]) or len(
                page_container[1]) != len(page_container[2]) or len(
                    page_container[2]) != len(page_container[3]) or len(
                        page_container[3]) != len(page_container[4]):
            rows_length = []
            the_max_row = []
            new_max_row = []
            for i in range(6):
                rows_length.append(len(page_container[i]))
            max_length = max(rows_length)
            # The first row with the most cells is passed to align_row as
            # the reference for every shorter row.
            the_max_row = page_container[rows_length.index(max_length)][:]
            for i in range(len(rows_length)):
                if rows_length[i] < max_length:
                    page_container[i] = align_row(the_max_row,
                                                  page_container[i])
        # Detect the table header

        # Print the result for verification
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                print(page_container[i][j])
        # print(page_container)
        # print(page_rows)

        # Once the page data is collected, write it to Excel
        for i in range(len(page_container)):
            for j in range(len(page_container[i])):
                cell_index = ws.cell(row=i + 1 + text_number, column=j + 1)
                if page_container[i][j] == None:
                    cell_index.value = ' '
                else:
                    cell_index.value = page_container[i][j]['value']

        # Advance text_number so data from consecutive pages stays contiguous
        text_number += rows_num

    wb.save(r'C:\Users\15644\Desktop\pdf_file\test_pdf_list\test_1.xlsx')
Example #46
0
def anotate_pdf(file_path, sht, query_dict):
    """Write a copy of a PDF with highlight annotations for every query.

    Page layouts are extracted with pdfminer; ``get_page_coordinates`` is
    asked for the boxes matching each query on each page, and one highlight
    annotation per box (built by ``create_highlight``) is attached to the
    matching PyPDF2 page. The annotated document is saved next to the input
    as ``<name>_highlighted<ext>``.

    Args:
        file_path: Path of the PDF to annotate.
        sht: Object exposing ``range('B2').value``; it receives the
            completion message (presumably a spreadsheet sheet -- confirm
            against the caller).
        query_dict: Mapping whose keys are passed to
            ``get_page_coordinates`` as the query and whose values are
            passed to ``create_highlight`` as ``color``.
    """
    # preparing the output file name
    source = pathlib.Path(file_path)
    extension = source.suffix
    # ``stem`` rather than ``name[:-len(extension)]``: that slice is
    # ``[:-0]`` -- an empty string -- for a file without an extension.
    name = source.stem
    # Join through pathlib instead of a literal backslash so the result
    # lands in the input's directory on every platform.
    result_file = str(source.parent / (name + '_highlighted' + extension))

    # add tooltip info not sure how to use this option in the most usefull way
    m_meta = {"author": "AK", "contents": "HL text1"}

    # Both readers consume their stream lazily, so the handles stay open
    # until the annotated document has been written.
    with open(file_path, 'rb') as layout_fp, \
            open(file_path, 'rb') as reader_fp:
        # create a parser object associated with the file object and a
        # PDFDocument object that stores the document structure
        parser = PDFParser(layout_fp)
        doc = PDFDocument(parser)

        # Layout analysis with default parameters.
        laparams = LAParams()
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # create pdf layout - this is list with layout of every page
        layout = []
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            # receive the LTPage object for the page.
            layout.append(device.get_result())

        pdfInput = PdfFileReader(reader_fp, strict=True)
        pdfOutput = PdfFileWriter()

        for pgn in range(pdfInput.numPages):
            # Fetched once per page (not per query) so the page is still
            # copied to the output when query_dict is empty.
            page_hl = pdfInput.getPage(pgn)

            for query in query_dict:
                # Only this page's layout is searched; the other pages'
                # results were never used for this page.
                for item in get_page_coordinates(layout[pgn], query):
                    highlight = create_highlight(item[0],
                                                 item[1],
                                                 item[2],
                                                 item[3],
                                                 m_meta,
                                                 color=query_dict[query])
                    highlight_ref = pdfOutput._addObject(highlight)

                    if "/Annots" in page_hl:
                        page_hl[NameObject("/Annots")].append(highlight_ref)
                    else:
                        page_hl[NameObject("/Annots")] = ArrayObject(
                            [highlight_ref])

            pdfOutput.addPage(page_hl)

        # save HL to new file; opened only now so a failure while reading
        # does not leave an empty output file behind
        with open(result_file, "wb") as outputStream:
            pdfOutput.write(outputStream)

    sht.range('B2').value = f'File {name+extension} completed'
Example #47
0
def parse(file_name):
    """Extract one record of report fields from every page of a PDF.

    For each page, the first horizontal text box supplies the ``"course"``
    value (the first line of its text). Every later horizontal text box is
    run through the ``get_*`` extractors (defined elsewhere), and each
    truthy result overwrites the matching field of the page's record.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A list with one dict per page. Every dict carries the same keys:
        name, LASID, DOB, Grade, RD, School, District, Score, Score_level,
        low_top and course; fields that were not found stay ``""``.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # (record key, extractor) pairs, applied in this order to each box.
    field_extractors = (
        ("name", get_name),
        ("LASID", get_LASID),
        ("DOB", get_DOB),
        ("Grade", get_Grade),
        ("RD", get_RD),
        ("School", get_School),
        ("District", get_District),
        ("Score", get_Score),
        ("Score_level", get_Score_level),
        ("low_top", get_low_top),
    )

    def _blank_record():
        # Same schema for every page, including "course", so callers never
        # meet a record with a missing key.
        record = dict.fromkeys((key for key, _ in field_extractors), "")
        record["course"] = ""
        return record

    useful = []

    # The context manager closes the file even when extraction is refused
    # or a page fails to parse.
    with open(file_name, 'rb') as fp:
        praser = PDFParser(fp)
        doc = PDFDocument()
        praser.set_document(doc)
        doc.set_parser(praser)
        doc.initialize()

        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in doc.get_pages():
            interpreter.process_page(page)
            layout = device.get_result()

            record = _blank_record()
            course_read = False
            for x in layout:
                if not isinstance(x, LTTextBoxHorizontal):
                    continue
                results = x.get_text()
                print(results)
                if not course_read:
                    # The first text box of a page is taken as the course
                    # title and is not scanned for the other fields.
                    record["course"] = results.split("\n")[0]
                    course_read = True
                    continue
                for key, extract in field_extractors:
                    # One call per extractor; a falsy result keeps any
                    # value found in an earlier box.
                    value = extract(results)
                    if value:
                        record[key] = value

            useful.append(record)
    return useful
Example #48
0
def parse():
    """Locate course dates in the configured schedule PDF and write them out.

    The PDF named by ``cfg.get("schedule_file")`` is read page by page. For
    each page the module-level ``data`` dict is reset (rects, textboxes,
    dates, datelines, hours) and refilled from the layout; the rectangles
    are turned into grid lines, text boxes are merged per grid cell, and
    each text box is matched against ``keywords``. A box with exactly one
    keyword match is recorded in ``data["courses"]`` with its coordinates
    and ``getDate`` result.

    Side effects:
        * ``outputs/<folder>/pdf_svg.html`` is always (re)created; drawing
          commands are only written when ``CREATE_SVG`` is set.
        * ``outputs/<folder>/pdf_texts.txt`` is rewritten when ``LOG_TEXTS``
          is set.
        * ``write`` is called with a ``{course key: date}`` mapping built
          from ``data["courses"]``.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    with open("schedule/{}".format(cfg.get("schedule_file")), "rb") as fp:
        parser = PDFParser(fp)
        document = PDFDocument(parser)
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        rsrcmgr = PDFResourceManager()
        # Only the aggregator is needed; a plain PDFDevice built here would
        # be discarded immediately.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        def parse_obj(lt_objs):
            """Collect text boxes, dates, hours and rects into ``data``."""
            for obj in lt_objs:
                if isinstance(obj, LTTextBoxHorizontal):
                    coor = getTextCoords(obj.bbox[0:2])
                    text = obj.get_text().replace('\n', ' ')
                    # check if content contains a date
                    date_match = re.search(r"\d{2}/\d{2}/\d{4}", text)
                    if date_match:
                        data["dates"].append({
                            "date": date_match.group(),
                            "coords": coor
                        })
                    hour_matches = re.findall(r"\d{1,2}:\d{2}", text)
                    if hour_matches:
                        data["hours"].append({
                            # Left-pad to HH:MM width, e.g. "9:00" -> "09:00".
                            "hours":
                            ["{0:0>5}".format(x) for x in hour_matches],
                            "coords":
                            coor
                        })
                    data["textboxes"].append([coor, text, ""])

                if isinstance(obj, LTRect):
                    data["rects"].append(getRectCoords(obj.bbox[0:4]))

                # Figures are containers: recurse into their children.
                if isinstance(obj, LTFigure):
                    parse_obj(obj._objs)

        output_dir = "outputs/" + cfg.get("folder")
        log_path = output_dir + "/pdf_texts.txt"

        if LOG_TEXTS:
            # Truncate the log of a previous run; pages append to it below.
            with open(log_path, "w", encoding="utf8"):
                pass

        with open(output_dir + "/pdf_svg.html", "w",
                  encoding="utf8") as svg:
            # SVG head
            if CREATE_SVG:
                svg.write(
                    "<style type=\"text/css\">svg{stroke:#000;stroke-width:1;fill:none}</style>\n"
                )

            # loop over all pages in the document
            for i, page in enumerate(PDFPage.create_pages(document)):
                # read the page into a layout object
                interpreter.process_page(page)
                layout = device.get_result()

                # Open this page's SVG element
                if CREATE_SVG:
                    svg.write(
                        "<svg id=\"s{}\" width=\"1200\" height=\"600\">\n".
                        format(i))

                # Per-page state; data["courses"] deliberately survives
                # across pages.
                data["rects"] = []
                data["textboxes"] = []
                data["dates"] = []
                data["datelines"] = []
                data["hours"] = []

                # extract info from this page
                parse_obj(layout._objs)

                lines = rectsToLines(data["rects"])

                lines = mergeLines(lines)
                # Order by the start point's second coordinate, ties broken
                # by the end point's second coordinate.
                lines.sort(key=lambda x: (x[0][1], x[1][1]))

                grid = createGrid(lines)
                data["textboxes"] = mergeTexts(grid, data["textboxes"])
                data["textboxes"] = splitSimultaneousCourses(data["textboxes"])

                data["hours"].sort(key=lambda x: x["coords"][1])

                if data["hours"]:
                    calcHourBoundaries(grid)
                if data["dates"]:
                    calcDateBoundaries(grid)

                # keyword matching for each textbox
                for t in data["textboxes"]:
                    # Collapse runs of whitespace before matching.
                    t[1] = " ".join(t[1].split())
                    res = keywords.match(format_text(t[1]))
                    # Only unambiguous (single-keyword) boxes are recorded.
                    if len(res["indexes"]) == 1:
                        data["courses"][res["indexes"][0]] = {
                            "coords": t[0],
                            "date": getDate(t[0])
                        }
                        t[2] = " (match: {})".format(res["titles"][0])

                # Draw lines
                if CREATE_SVG:
                    minX, maxX = 1e10, 0
                    for l in lines:
                        svg.write(
                            "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#{}\"></line>\n"
                            .format(l[0][0], l[0][1], l[1][0], l[1][1],
                                    randomColor()))
                        # Track the horizontal extent for the date lines.
                        if l[0][0] < minX:
                            minX = l[0][0]
                        if l[1][0] > maxX:
                            maxX = l[1][0]
                    if SHOW_DATELINES:
                        for h in data["hours"]:
                            svg.write(
                                "<circle cx=\"{}\" cy=\"{}\" r=\"1\" stroke=\"red\"></circle>\n"
                                .format(h["coords"][0], h["coords"][1]))
                        for d in data["dates"]:
                            # "boundaries" is not set in this function;
                            # presumably calcDateBoundaries adds it -- confirm.
                            if d["boundaries"][0] != 0 and d["boundaries"][
                                    1] != 0:
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][0], maxX,
                                            d["boundaries"][0]))
                                svg.write(
                                    "<line x1=\"{}\" y1=\"{}\" x2=\"{}\" y2=\"{}\" stroke=\"#111111\"></line>\n"
                                    .format(minX, d["boundaries"][1], maxX,
                                            d["boundaries"][1]))
                    if SHOW_TEXTBOXES:
                        for t in data["textboxes"]:
                            svg.write(
                                "<text x=\"{}\" y=\"{}\" font-size=\"4\" font-weight=\"lighter\">{}</text>\n"
                                .format(t[0][0], t[0][1], t[1][:5]))

                if LOG_TEXTS:
                    with open(log_path, "a", encoding="utf8") as log:
                        for t in data["textboxes"]:
                            log.write("{}, {}, {}{}\n".format(
                                t[0][0], t[0][1], t[1], t[2]))

                # Close this page's SVG element
                if CREATE_SVG:
                    svg.write('</svg>' + "\n")

        coursedates = {key: c["date"] for key, c in data["courses"].items()}
        write(coursedates)
Example #49
0
def pdf2text(stream: IO[bytes]) -> TextIOWrapper:
    """Extract the text of a PDF byte stream into a readable text stream.

    The extracted text is buffered in memory, rewound to the start and
    exposed through a UTF-8 text wrapper.
    """
    layout_params = LAParams()
    buffer = BytesIO()
    extract_text_to_fp(stream, buffer, laparams=layout_params)
    # Rewind so the caller reads from the first extracted byte.
    buffer.seek(0)
    text_stream = TextIOWrapper(buffer, "utf-8")
    return text_stream
Example #50
0
def pdf_to_csv(filename, separator, threshold):
    """Dump the characters of a PDF as separator-delimited text rows.

    Characters are grouped into rows by the integer part of the negated
    fourth bbox component and ordered within a row by the third bbox
    component. Whenever the gap between two neighbouring characters exceeds
    ``threshold`` times the row's average gap, ``separator`` is inserted.
    The output goes to a file named ``<filename>.txt`` under the ``txt``
    prefix, with ``START PAGE n`` / ``END PAGE n`` markers around each page.

    Args:
        filename: Path of the PDF to read; also used to name the output.
        separator: String inserted between columns.
        threshold: Multiplier of the average character gap above which a
            gap counts as a column break.

    Returns:
        Always ``0``.
    """
    #from cStringIO import StringIO
    from collections import defaultdict

    from pdfminer.converter import LTChar, TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage

    class CsvConverter(TextConverter):
        """TextConverter that rewrites each finished page as separated rows."""

        def __init__(self, *args, **kwargs):
            TextConverter.__init__(self, *args, **kwargs)
            self.separator = separator
            self.threshold = threshold

        def end_page(self, i):
            # row key -> {horizontal position: encoded character}
            lines = defaultdict(dict)
            for child in self.cur_item._objs:  # <-- changed
                if isinstance(child, LTChar):
                    (_, _, x, y) = child.bbox
                    # Negating y makes an ascending sort of the row keys
                    # visit larger y values first.
                    lines[int(-y)][x] = child._text.encode(
                        self.codec)  # <-- changed
            for y in sorted(lines):
                # Build each row once (the row used to be built twice, with
                # the first result thrown away).
                self.outfp.write(self.line_creator(lines[y]))
                self.outfp.write("\n")

        def line_creator(self, line):
            keys = sorted(line.keys())
            # calculate the average distange between each character on this row
            average_distance = sum(
                [keys[i] - keys[i - 1]
                 for i in range(1, len(keys))]) / len(keys)
            # append the first character to the result
            result = [line[keys[0]]]
            for i in range(1, len(keys)):
                # if the distance between this character and the last character is greater than the average*threshold
                if (keys[i] - keys[i - 1]) > average_distance * self.threshold:
                    # append the separator into that position
                    result.append(self.separator)
                # append the character
                result.append(line[keys[i]])
            printable_line = ''.join(result)
            return printable_line

    # ... the following part of the code is a remix of the
    # convert() function in the pdfminer/tools/pdf2text module
    rsrc = PDFResourceManager()

    # NOTE(review): the literal backslash makes this a directory path only
    # on Windows -- confirm before running elsewhere.
    ft = 'txt\\' + filename + '.txt'

    # Context managers close both files even if a page fails to parse.
    with open(ft, 'w') as outfp, open(filename, 'rb') as fp:
        # becuase my test documents are utf-8 (note: utf-8 is the default codec)
        device = CsvConverter(rsrc, outfp, codec="utf-8", laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrc, device)
            for i, page in enumerate(PDFPage.get_pages(fp)):
                outfp.write("START PAGE %d\n" % i)
                if page is not None:
                    interpreter.process_page(page)
                else:
                    # Call form prints the same text under Python 2 and 3.
                    print('none')
                outfp.write("END PAGE %d\n" % i)
        finally:
            device.close()

    #return outfp.getvalue()
    return 0
Example #51
0
from io import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

# In-memory buffer that receives the extracted text of every page.
output_string = StringIO()
with open('simple1.pdf', 'rb') as in_file:
    # Parser + document give access to the PDF's page objects.
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    # TextConverter writes the text of each processed page to output_string.
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # Pages must be processed while in_file is still open.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)

# Print the text accumulated for the whole document.
print(output_string.getvalue())
Example #52
0
def createDeviceInterpreter():
    """Build a page-aggregating device and the interpreter that drives it.

    Both objects share one freshly created resource manager, and the device
    uses default layout-analysis parameters.

    Returns:
        A ``(device, interpreter)`` tuple.
    """
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)
    return aggregator, page_interpreter
Example #53
0
import matplotlib.patches as patches

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine

# Collect, page by page, the LTTextBox layout objects of a hard-coded PDF.
with open('/home/cyan/Downloads/Barron-s-1100-Words-You-Need-table.pdf',
          'rb') as pdf_doc:
    # Parser and document are created separately and then linked to each
    # other before the document is initialised.
    parser = PDFParser(pdf_doc)
    doc = PDFDocument()
    parser.set_document(doc)
    doc.set_parser(parser)
    # Initialise with an empty password.
    doc.initialize('')
    rsrcmgr = PDFResourceManager()
    # Layout analysis parameters: both margins are set explicitly to 1.0.
    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.word_margin = 1.0
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # One list of LTTextBox objects per page, in page order.
    extracted_text = []

    for page in doc.get_pages():
        page_text = []
        interpreter.process_page(page)
        # Layout of the page that was just processed.
        layout = device.get_result()
        for lt_obj in layout:
            # Keep only text boxes; other layout objects are skipped.
            if isinstance(lt_obj, LTTextBox):
                page_text.append(lt_obj)
        extracted_text.append(page_text)
Example #54
0
                        #                 print("aaaa")
                        #             # print ("fontname %s"%c.fontname)
                        #             # print ("fontname %s"%c.fontsize)
            # if it's a container, recurse
            elif isinstance(objs[i], pdfminer.layout.LTFigure):
                parse_obj_title(objs[i]._objs)
            else:
                pass
    # print("adhjabjha",new_text)
    return new_text

# NOTE(review): this handle is never closed in the visible code.
document = open('2b_Agents.pdf', 'rb')
#Create resource manager
rsrcmgr = PDFResourceManager()
# Set parameters for analysis.
laparams = LAParams()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)
# NOTE(review): `pages` is not used in the visible code, and create_pages is
# handed the raw file object here while the loop below uses
# PDFPage.get_pages(document) -- confirm whether this line can be dropped.
pages = PDFPage.create_pages(document)
# interpreter.process_page(pages)

#page is the iterator of the pages, it is for one single page object
title = []
content = []
for page in PDFPage.get_pages(document):
    interpreter.process_page(page)
    layout = device.get_result()
    if layout.pageid > 1:
        # print ("aaa")
        if parse_obj(layout._objs) == True:
Example #55
0
def convert_pdf_to_txt_csv(path):
    """Split the text of a PDF into sentences and return them as a DataFrame.

    Each text box or text line of every page is cleaned (digit runs and
    runs of five dots become spaces, every whitespace character and every
    ``+`` becomes a space, repeated spaces collapse) and then split with
    ``sent_tokenize``.

    Args:
        path: Path of the PDF file. Its base name fills the
            ``Source Document`` column, and the part before the first
            ``_`` fills ``Country``.

    Returns:
        A DataFrame with the columns ``Page No.`` (1-based), ``Sentence``,
        ``Sentence No. of Doc`` (1-based counter within each page),
        ``Source Document`` and ``Country``. Only sentences longer than 10
        characters are kept, and only the first occurrence of a repeated
        sentence.
    """
    # Compiled once instead of per text box; raw strings avoid the invalid
    # "\s" escape of the plain-string pattern.
    digit_runs = re.compile(r'\d+')
    space_or_plus = re.compile(r'[\s+]')
    space_runs = re.compile(r' +')

    page_list = []
    sent_list = []

    # The context manager closes the file even if parsing fails part-way.
    with open(path, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 1.0
        laparams.word_margin = 1.0
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for n, page in enumerate(doc.get_pages(), start=1):
            interpreter.process_page(page)
            layout = device.get_result()
            for lt_obj in layout:
                if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                    continue
                t = lt_obj.get_text()
                t = digit_runs.sub(' ', t)
                t = t.replace('.....', ' ')
                t = space_or_plus.sub(' ', t)
                t = space_runs.sub(' ', t)
                sentences = sent_tokenize(t)
                if len(sentences) > 0:
                    sent_list.extend(sentences)
                    page_list.extend([n] * len(sentences))
                else:
                    # Keep a placeholder row for boxes without sentences;
                    # the length filter below removes it again.
                    page_list.append(n)
                    sent_list.append('')

    #Create DataFrame
    df = pd.DataFrame()
    df['Page No.'] = page_list
    df['Sentence'] = sent_list

    #Clean dataframe
    df = df[df.Sentence != ' ']

    #Filter sentence which len() >
    mask = (df['Sentence'].str.len() > 10)
    df = df.loc[mask]

    #Remove duplicate
    df = df.drop_duplicates('Sentence', keep='first', inplace=False)

    #Set Sentencec No. of Doc
    df['Sentence No. of Doc'] = df.groupby('Page No.').cumcount() + 1

    #FileName
    fileName = os.path.basename(path)
    df['Source Document'] = fileName

    #Country Name
    Country = fileName.split('_')[0]
    df['Country'] = Country

    return df
Example #56
0
    def pdf2txt(self):
        '''
        Convert the PDF at ``self.input_path`` to a UTF-8 text file.

        When ``self.output_path`` is ``None`` it is set to the input path
        with its extension replaced by ``_trans.txt``. A tqdm progress bar
        is sized from the ``Count`` entry of the document's page tree and
        advanced once per processed page.

        return : str, text File path
        '''
        import os

        # input
        password = ''
        pagenos = set()
        maxpages = 0

        # output
        imagewriter = None
        rotation = 0
        codec = 'UTF-8'
        caching = True
        laparams = LAParams()

        with open(self.input_path, "rb") as infp:
            if self.output_path is None:
                # splitext strips whatever extension is present; slicing
                # off four characters mangled names without a 3-letter one.
                self.output_path = (
                    os.path.splitext(self.input_path)[0] + '_trans.txt')

            # Both files are closed even if a page fails to convert.
            with open(self.output_path, "w", encoding='UTF8') as outfp:
                #page total num
                parser = PDFParser(infp)
                document = PDFDocument(parser)
                page_total_num = resolve1(document.catalog['Pages'])['Count']

                rsrcmgr = PDFResourceManager(caching=caching)

                # pdf -> text converter
                device = TextConverter(rsrcmgr,
                                       outfp,
                                       codec=codec,
                                       laparams=laparams,
                                       imagewriter=imagewriter)

                # pdf -> text interpreter
                interpreter = PDFPageInterpreter(rsrcmgr, device)

                # pdf -> text start
                with tqdm(total=page_total_num) as pbar:
                    for page in PDFPage.get_pages(infp,
                                                  pagenos,
                                                  maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):

                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)

                        pbar.update(1)

        print('[INFO] pdf -> text')

        return self.output_path
Example #57
0
def parse(DataIO, save_path, start=None, end=None):
    """Dump the text of each PDF page to ``./text/<page number>.txt``.

    Every layout object of a page is inspected: horizontal text boxes are
    printed and written to the page's file, figures are handed to
    ``parse_lt_figure``, images to ``save_image``, and loose characters are
    written as-is. Text boxes and curves are additionally reported on
    stdout.

    Args:
        DataIO: Binary file-like object containing the PDF.
        save_path: Currently unused; the code that appended to it is
            commented out.
        start: First page (1-based) to process. Only honoured when ``end``
            is given as well.
        end: Last page (1-based) to process. Only honoured when ``start``
            is given as well.

    Raises:
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    # Create a PDF parser from the file object
    parser = PDFParser(DataIO)
    # Create the PDF document
    doc = PDFDocument(parser)
    # Connect the parser and the document to each other
    parser.set_document(doc)
    #doc.set_parser(parser)
    #doc.initialize()
    # Refuse documents that do not allow text extraction
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed

    # Create the PDF resource manager that holds shared resources
    rsrcmagr = PDFResourceManager()
    # Create the layout parameters
    laparams = LAParams()
    # Aggregate the resource manager and the layout parameters in a device
    device = PDFPageAggregator(rsrcmagr, laparams=laparams)
    # Create a PDF interpreter object
    interpreter = PDFPageInterpreter(rsrcmagr, device)

    # Walk the pages, handling one page per iteration
    for page_num, page in enumerate(PDFPage.create_pages(doc), start=1):
        if start is not None and end is not None:
            if page_num < start:
                continue
            if page_num > end:
                break
        interpreter.process_page(page)
        # Receive the LTPage object of this page
        layout = device.get_result()

        # The context manager closes the page file even if a helper raises.
        with open('./text/' + str(page_num) + '.txt', 'w') as f:
            # layout is an LTPage holding the objects parsed from the page,
            # typically LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal
            # and so on; text is obtained through get_text().
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    # Get the text
                    result = x.get_text()
                    try:
                        print(
                            "***************** LTTextBoxHorizontal  ************"
                        )
                        print(result)
                        # Write it to the file
                        f.write(result + "\n")
                    except Exception:
                        # Best effort: report the failing text and carry on.
                        # ``Exception`` (not a bare except) so Ctrl-C still
                        # interrupts the run.
                        print('写入文件错误', result)
                if isinstance(x, LTTextBox):
                    print("***************** LTTextBox  ************")
                    print(x.get_text())
                if isinstance(x, LTFigure):
                    print("***************** LTFigure  ************")
                    parse_lt_figure(x, page_num, f)
                if isinstance(x, LTImage):
                    print("***************** LTImage  ************")
                    save_image(x, page_num)
                    print('save image ' + x.name)
                if isinstance(x, LTChar):
                    print('ppppppppppppppp')
                    print(x.get_text())
                    f.write(x.get_text())
                if isinstance(x, LTCurve):
                    print("***************** LTCurve  ************")
Example #58
0
def pdf2csv(fp):
    """Draw the rectangle/line grid of each PDF page into ``out<N>.png``.

    For every page the x and y coordinates of all ``LTRect`` and ``LTLine``
    layout objects are collected, de-duplicated, sorted and passed through
    ``filterclose``. The resulting coordinate lists are printed, and each
    cell bounded by two consecutive coordinates that are more than 5 units
    apart (on both axes) is outlined on a 1-bit image. The image is written
    to ``out<pageno>.png`` in the current working directory, with pages
    numbered from 0.

    :param fp: file object of the PDF, handed to ``PDFParser``.
    :raises PDFTextExtractionNotAllowed: if the document reports that it is
        not extractable.
    """
    # Create a PDF parser object associated with the file object.
    parser = PDFParser(fp)
    # Create a PDF document object that stores the document structure.
    doc = PDFDocument()
    # Connect the parser and document objects.
    parser.set_document(doc)
    doc.set_parser(parser)
    # Supply the password for initialization.
    # (If no password is set, give an empty string.)
    doc.initialize('')
    # Check if the document allows text extraction. If not, abort.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()
    # Set parameters for analysis.
    laparams = LAParams()
    # Create a PDF page aggregator object.
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for pageno, page in enumerate(doc.get_pages()):
        interpreter.process_page(page)
        # Receive the LTPage object for the page.
        layout = device.get_result()

        # NOTE: despite the names, ``hlines`` holds x coordinates and
        # ``vlines`` holds y coordinates (flipped so that 0 is the top edge).
        hlines = []
        vlines = []
        for item in layout:
            # Exact type match on purpose: subclasses are not considered.
            if type(item) not in (LTRect, LTLine):
                continue
            hlines.extend((int(item.x0), int(item.x1)))
            vlines.extend((int(layout.height - item.y0),
                           int(layout.height - item.y1)))
        # NOTE(review): filterclose is defined elsewhere; presumably it
        # collapses coordinates that are nearly equal -- confirm.
        hlines = filterclose(sorted(set(hlines)))
        vlines = filterclose(sorted(set(vlines)))
        print(hlines)
        print(vlines)
        print(layout.width, layout.height)

        im = Image.new('1', (int(layout.width), int(layout.height)))
        draw = ImageDraw.Draw(im)
        # Walk consecutive coordinate pairs; gaps of 5 or less are skipped.
        for top, bottom in zip(vlines, vlines[1:]):
            if bottom - top <= 5:
                continue
            for left, right in zip(hlines, hlines[1:]):
                if right - left <= 5:
                    continue
                draw.rectangle([(int(left), int(top)),
                                (int(right), int(bottom))],
                               outline=1)
        del draw

        # Separate name for the output handle so the ``fp`` argument is not
        # shadowed; ``with`` closes the file even if saving fails.
        with open("out%s.png" % pageno, 'wb') as out_fp:
            im.save(out_fp, "PNG")
Example #59
0
def writeCSV(dirpdf):
    """Write one CSV row per PDF in ``dirpdf`` to ``pdfFiles.csv``.

    The CSV is created in the current working directory with the columns
    ``File`` (the PDF's file name), ``Kind`` (the file name up to its first
    ``.``) and ``Text``. Only the first page of each PDF is processed:

    * ``Text`` is the text pdfminer extracted for that page.
    * If that text has fewer than 2 or more than 100000 characters, or
      processing the page raises, ``Text`` is instead the result of
      ``extract_text_image`` on ``IMAGES/<pdf stem>.jpg``.
    * If anything else fails for the file (including the image fallback),
      ``Text`` is the literal string ``'Exception'``.
    * A PDF that yields no pages produces no row.

    A progress line is printed after every 50 files.

    :param dirpdf: directory to scan (non-recursively) for files ending in
        ``.pdf`` or ``.PDF``.
    """
    nameCSVfile = 'pdfFiles.csv'
    codec = 'utf-8'
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    save_dir = 'IMAGES'
    fields = ['File', 'Kind', 'Text']

    def make_rows(filepdf, text):
        # Single-row list in the shape DictWriter.writerows expects.
        return [{
            'File': filepdf,
            'Kind': filepdf.split('.')[0],
            'Text': text
        }]

    def image_text(filename):
        # Text from the image named after the PDF inside save_dir.
        base_filename = os.path.splitext(
            os.path.basename(filename))[0] + '.jpg'
        return extract_text_image(os.path.join(save_dir, base_filename))

    def first_page_text(filename, filepdf):
        # Text for the first page of the PDF, or None if it has no pages.
        # The PDF handle, converter and buffer are released on every path.
        with open(filename, 'rb') as fp:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(rsrcmgr,
                                   retstr,
                                   codec=codec,
                                   laparams=LAParams())
            try:
                # Create a PDF interpreter object.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=False):
                    try:
                        interpreter.process_page(page)
                        data = retstr.getvalue()
                        # NOTE(review): very short or very long output is
                        # presumably treated as a failed extraction.
                        if len(data) < 2 or len(data) > 100000:
                            data = image_text(filename)
                    except Exception as ex:
                        print(filepdf)
                        print(ex)
                        data = image_text(filename)
                    # Only the first page is ever used.
                    return data
                return None
            finally:
                device.close()
                retstr.close()

    with open(nameCSVfile, 'w') as csvFile:
        # Regular files in dirpdf that look like PDFs.
        files = [
            f for f in os.listdir(dirpdf)
            if os.path.isfile(os.path.join(dirpdf, f))
            and f.endswith(('.pdf', '.PDF'))
        ]
        cnt_files = len(files)

        writer = csv.DictWriter(csvFile, fieldnames=fields)
        writer.writeheader()
        for i, filepdf in enumerate(files, 1):
            try:
                text = first_page_text(os.path.join(dirpdf, filepdf), filepdf)
                rows = [] if text is None else make_rows(filepdf, text)
            except Exception as ex:
                # Best effort: record the failure and continue with the
                # remaining files.
                print(filepdf)
                print(ex)
                rows = make_rows(filepdf, 'Exception')
            # show an update every 50 pdf
            if i % 50 == 0:
                print("[INFO] processed {}/{}".format(i, cnt_files))
            writer.writerows(rows)
Example #60
0
def cas_pdf_to_text(filename: Union[str, io.IOBase],
                    password) -> PartialCASData:
    """
    Parse a CAS pdf and return its grouped text lines plus detected metadata.

    :param filename: path to the CAS pdf file (CAMS or Kfintech), or an open
        file-like object (an ``io.IOBase`` instance or anything with a
        ``read`` attribute, e.g. a Django UploadedFile)
    :param password: CAS pdf password
    :return: ``PartialCASData`` with ``file_type`` (``None`` when no marker
        text is found), ``investor_info`` and ``lines``
    :raises CASParseError: if ``filename`` is neither a string nor file-like,
        if the password is incorrect, or if the pdf cannot be opened

    The file object is used as a context manager, so it is closed before
    this function returns, including when it was supplied by the caller.
    """
    file_type: Optional[FileType] = None

    if isinstance(filename, str):
        fp = open(filename, "rb")
    elif isinstance(filename, io.IOBase) or hasattr(filename, "read"):
        # hasattr check: compatibility for Django UploadedFile
        fp = filename
    else:
        raise CASParseError(
            "Invalid input. filename should be a string or a file like object")

    with fp:
        pdf_parser = PDFParser(fp)
        try:
            document = PDFDocument(pdf_parser, password=password)
        except PDFPasswordIncorrect as exc:
            # Chain explicitly so the pdfminer error is kept as the cause.
            raise CASParseError("Incorrect PDF password!") from exc
        except PDFSyntaxError as exc:
            raise CASParseError("Unhandled error while opening file") from exc

        # Line margin depends on the detected pdf source; 0.2 is used when
        # the source is neither KFINTECH nor CAMS.
        line_margin = {
            FileType.KFINTECH: 0.1,
            FileType.CAMS: 0.2
        }.get(detect_pdf_source(document), 0.2)

        rsrc_mgr = PDFResourceManager()
        laparams = LAParams(line_margin=line_margin, detect_vertical=True)
        device = PDFPageAggregator(rsrc_mgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrc_mgr, device)

        pages: List[Iterator[LTTextBoxHorizontal]] = []

        investor_info = None
        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)
            layout = device.get_result()
            # Lazy iterator over the page's horizontal text boxes; it is
            # consumed later by group_similar_rows.
            text_elements = filter(
                lambda x: isinstance(x, LTTextBoxHorizontal), layout)
            if file_type is None:
                # The file type is read from marker strings in vertical
                # text boxes; pages are scanned until one is found.
                for el in filter(lambda x: isinstance(x, LTTextBoxVertical),
                                 layout):
                    if re.search("CAMSCASWS", el.get_text()):
                        file_type = FileType.CAMS
                    if re.search("KFINCASWS", el.get_text()):
                        file_type = FileType.KFINTECH
            if investor_info is None:
                # Retried on each page until a non-None result is returned.
                investor_info = parse_investor_info(layout, *page.mediabox[2:])
            pages.append(text_elements)

        lines = group_similar_rows(pages)
        return PartialCASData(file_type=file_type,
                              investor_info=investor_info,
                              lines=lines)