Example #1
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Render each page of ``f`` to text and collect the parsed results.

        ``f`` is handed to ``PDFPage.get_pages`` with ``check_extractable=True``.
        The text of every page is passed to ``self.parse_page`` together with
        ``fpath`` and a 1-based page number; the values it returns are
        gathered into a list, which is the return value.

        ``self.handler.print_header`` / ``print_footer`` bracket the run. Any
        ``Exception`` is reported through ``self.handler.print_error`` and the
        method then falls through, returning ``None``. ``KeyboardInterrupt``
        and ``SystemExit`` are re-raised.
        """
        try:
            parsed_pages = []
            layout = LAParams()
            layout.all_texts = True
            resources = PDFResourceManager()

            # Start every document with an empty dedup store when enabled.
            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            pages = PDFPage.get_pages(f, set(), check_extractable=True)
            for page_num, page in enumerate(pages, 1):
                # A fresh buffer per page keeps each page's text separate.
                page_buffer = StringIO()
                converter = TextConverter(resources, page_buffer, laparams=layout)
                PDFPageInterpreter(resources, converter).process_page(page)
                page_text = page_buffer.getvalue()
                page_buffer.close()
                parsed_pages.append(self.parse_page(fpath, page_text, page_num))
            self.handler.print_footer(fpath)
            return parsed_pages
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as err:
            self.handler.print_error(fpath, err)
Example #2
0
    def dump_pdf_pdfminer(self, fpath_in):
        fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
        n = 0

        with open(fpath_in, 'rb') as fin:
            with open(fpath_out, 'wb') as fout:
                try:
                    laparams = LAParams()
                    laparams.all_texts = True  
                    rsrcmgr = PDFResourceManager()
                    pagenos = set()

                    page_num = 0
                    for page in PDFPage.get_pages(fin, pagenos, check_extractable=True):
                        page_num += 1

                        retstr = StringIO()
                        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        interpreter.process_page(page)
                        data = retstr.getvalue()
                        retstr.close()

                        fout.write(data)
                        n += len(data)
                    print "Written %d bytes to %s" % (n, fpath_out)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception as e:
                    print "Failed parsing %s" % (fpath_in)
Example #3
0
    def _convert_pdf_to_text(self, password=None):
        """Convert the PDF at ``self.cvFile`` into a text file in ``self.scratchDir``.

        The output file is named after the input file's stem followed by the
        current Unix timestamp and ``.txt``; its path is stored in
        ``self.cvTextFile`` before conversion starts. Only page numbers
        0..29 are requested from ``process_pdf``.

        Parameters:
            password: when not ``None``, stored in ``self.cvFilePasswd`` and
                used as the document password; otherwise the existing
                ``self.cvFilePasswd`` is used.

        Returns:
            ``0`` after a completed conversion. Errors propagate to the
            caller.
        """
        input_pdf = self.cvFile
        if password is not None:
            self.cvFilePasswd = password
        pagenos = range(0, 30)
        laparams = LAParams()
        laparams.all_texts = True
        laparams.showpageno = True

        # splitext keeps the whole stem for names without a dot (the previous
        # split/pop approach produced an empty stem for e.g. "resume").
        stem = os.path.splitext(os.path.basename(input_pdf))[0]
        output_filename = self.scratchDir + os.path.sep + stem + str(int(time.time())) + r".txt"
        self.cvTextFile = output_filename

        rsrcmgr = PDFResourceManager()
        # Context managers / finally make sure both files and the converter
        # are released even if pdfminer raises part-way through.
        with open(output_filename, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
            try:
                with open(input_pdf, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos,
                                maxpages=len(pagenos),
                                password=self.cvFilePasswd,
                                check_extractable=True)
            finally:
                device.close()
        return (0)
Example #4
0
def _pdf_to_text(path):
    """Extract the text of the PDF at ``path`` as ASCII.

    Returns:
        A ``(text, ok)`` tuple. On success ``text`` is the extracted content
        with non-ASCII characters dropped and ``ok`` is ``True``. If anything
        raises during extraction the result is ``("", False)``.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii', laparams=laparams)

        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
            device.close()

            # Drop anything that is not plain ASCII.
            result = retstr.getvalue()
            txt = result.encode('ascii', 'ignore')

            retVal = (txt, True)
            retstr.close()

    except Exception:
        # Best-effort extraction: any failure is reported through the flag.
        retVal = ("", False)

    # The result was previously computed but never handed back to the caller.
    return retVal
Example #5
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Feed the text of every page of ``f`` to ``self.parse_page``.

        Pages come from ``PDFPage.get_pages`` (``check_extractable=True``) and
        are numbered from 1. ``self.handler.print_header`` is called before
        the first page and ``print_footer`` after the last. Any ``Exception``
        is passed to ``self.handler.print_error``; ``KeyboardInterrupt`` and
        ``SystemExit`` are re-raised. Nothing is returned.
        """
        try:
            layout_params = LAParams()
            layout_params.all_texts = True
            resource_manager = PDFResourceManager()

            # Reset duplicate tracking for this document when dedup is on.
            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            page_iter = PDFPage.get_pages(f, set(), check_extractable=True)
            for page_num, page in enumerate(page_iter, 1):
                out = StringIO()
                text_device = TextConverter(resource_manager, out, laparams=layout_params)
                PDFPageInterpreter(resource_manager, text_device).process_page(page)
                page_text = out.getvalue()
                out.close()

                self.parse_page(fpath, page_text, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as err:
            self.handler.print_error(fpath, err)
Example #6
0
    def do_import(self, results, filepath):
        """Extract the text of the PDF at ``filepath`` into ``results``.

        The text of all pages is accumulated in one buffer, with a newline
        written after each page, and then stored via
        ``results.investigation.update(import_text=...)``.

        The file and the buffer are released even when extraction raises;
        exceptions propagate to the caller.
        """
        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        buff = StringIO()
        try:
            # All pages write to the same buffer, so one converter and one
            # interpreter are enough for the whole document.
            device = TextConverter(rsrcmgr,
                                   buff,
                                   codec="utf-8",
                                   laparams=laparams)
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                with open(filepath, "rb") as fp:
                    for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                        interpreter.process_page(page)
                        buff.write("\n")
            finally:
                device.close()

            results.investigation.update(import_text=buff.getvalue())
        finally:
            buff.close()
Example #7
0
    def count_words(self):
        """
        Return the number of whitespace-separated words in ``self.filename``.

        The PDF is converted to text, every character listed in
        ``string.punctuation`` is removed, and the remaining text is split on
        whitespace. Raises ``PDFTextExtractionNotAllowed`` when the document
        reports that it is not extractable.

        Thanks to http://pinkyslemma.com/2013/07/02/word-frequency-from-pdfs/
        and http://www.unixuser.org/~euske/python/pdfminer/programming.html
        """
        with open(self.filename, "rb") as pdf_file:
            resources = PDFResourceManager()
            text_buffer = StringIO()
            layout = LAParams()
            layout.all_texts = True
            converter = TextConverter(resources, text_buffer, codec='utf-8', laparams=layout)

            # Build the document structure from the open file.
            document = PDFDocument(PDFParser(pdf_file))
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            interpreter = PDFPageInterpreter(resources, converter)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)

            # Identity translation table + delete set == strip punctuation.
            stripped = text_buffer.getvalue().translate(
                string.maketrans("", ""), string.punctuation)

            return len(stripped.split())
Example #8
0
    def dump_pdf_pdfminer(self, fpath_in):
        fpath_out = os.path.splitext(fpath_in)[0] + ".txt"
        n = 0

        with open(fpath_in, 'rb') as fin:
            with open(fpath_out, 'wb') as fout:
                try:
                    laparams = LAParams()
                    laparams.all_texts = True
                    rsrcmgr = PDFResourceManager()
                    pagenos = set()

                    page_num = 0
                    for page in PDFPage.get_pages(fin,
                                                  pagenos,
                                                  check_extractable=True):
                        page_num += 1

                        retstr = StringIO()
                        device = TextConverter(rsrcmgr,
                                               retstr,
                                               laparams=laparams)
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        interpreter.process_page(page)
                        data = retstr.getvalue()
                        retstr.close()

                        fout.write(data)
                        n += len(data)
                    print "Written %d bytes to %s" % (n, fpath_out)
                except (KeyboardInterrupt, SystemExit):
                    raise
                except Exception as e:
                    print "Failed parsing %s" % (fpath_in)
Example #9
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Convert each page of ``f`` to text and pass it to ``self.parse_page``.

        The converter is created with ``codec='utf-8'``. Pages are numbered
        from 1. ``self.handler.print_header`` / ``print_footer`` are called
        before and after the page loop. Only ``KeyboardInterrupt`` and
        ``SystemExit`` are named in the handler, and they are re-raised; all
        other exceptions propagate unchanged.
        """
        try:
            layout = LAParams()
            layout.all_texts = True
            resources = PDFResourceManager()

            self.handler.print_header(fpath)

            pages = PDFPage.get_pages(f, set(), check_extractable=True)
            for page_num, page in enumerate(pages, 1):
                page_buffer = StringIO()
                converter = TextConverter(resources,
                                          page_buffer,
                                          codec='utf-8',
                                          laparams=layout)
                PDFPageInterpreter(resources, converter).process_page(page)
                page_text = page_buffer.getvalue()
                page_buffer.close()

                self.parse_page(fpath, page_text, page_num)

            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
Example #10
0
def to_text(path):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'utf-8'
    laparams = LAParams()
    print laparams
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
    fp = file(path, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    password = ""
    maxpages = 0
    caching = True
    pagenos = set()
    pages = PDFPage.get_pages(fp,
                              pagenos,
                              maxpages=maxpages,
                              password=password,
                              caching=caching,
                              check_extractable=True)
    for page in pages:
        interpreter.process_page(page)
    fp.close()
    device.close()
    str = retstr.getvalue()
    retstr.close()
    return str
Example #11
0
def to_text(path):
    """Return the text pdfminer extracts from the PDF at ``path``.

    pdfminer and ``cStringIO`` are imported inside the function. All pages
    are requested (empty page set, ``maxpages=0``) with an empty password,
    caching enabled and ``check_extractable=True``.
    """

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage
    from cStringIO import StringIO

    resources = PDFResourceManager()
    buf = StringIO()
    layout = LAParams()
    layout.all_texts = True
    converter = TextConverter(resources, buf, codec='utf-8', laparams=layout)
    with open(path, 'rb') as pdf_file:
        interpreter = PDFPageInterpreter(resources, converter)
        page_iter = PDFPage.get_pages(pdf_file,
                                      set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True)
        for page in page_iter:
            interpreter.process_page(page)
    converter.close()
    extracted = buf.getvalue()
    buf.close()
    return extracted
Example #12
0
    def parse_pdf_pdfminer(self, f, fpath):
        """Parse every page of ``f`` using a parser/document pair.

        A ``PDFParser`` and a ``PDFDocument`` are created and linked to each
        other, then each page from ``doc.get_pages()`` is converted to text.
        The text is wrapped with ``bytes(..., 'UTF-8')`` and passed to
        ``self.parse_page`` with ``fpath`` and a 1-based page number.

        ``self.handler.print_header`` / ``print_footer`` bracket the run;
        any ``Exception`` goes to ``self.handler.print_error``.
        ``KeyboardInterrupt`` and ``SystemExit`` are re-raised.
        """
        try:
            layout = LAParams()
            layout.all_texts = True
            resources = PDFResourceManager()

            # Fresh dedup store per document when dedup is enabled.
            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            parser = PDFParser(f)
            document = PDFDocument(caching=True)

            # Parser and document need references to each other.
            parser.set_document(document)
            document.set_parser(parser)
            for page_num, page in enumerate(document.get_pages(), 1):
                page_buffer = StringIO()
                converter = TextConverter(resources, page_buffer, laparams=layout)
                PDFPageInterpreter(resources, converter).process_page(page)
                page_text = page_buffer.getvalue()
                self.parse_page(fpath, bytes(page_text, 'UTF-8'), page_num)
                page_buffer.close()
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as err:
            self.handler.print_error(fpath, err)
Example #13
0
def extract_text(in_filename, out_filename):
    """Extract the text of ``in_filename`` (a PDF) into ``out_filename``.

    Each page is converted separately with ``codec='utf-8'`` and
    ``all_texts=True``; the page texts are concatenated and written with
    ``print``, so the output ends with one extra newline.

    The input file is closed when extraction finishes or fails; exceptions
    propagate to the caller.
    """
    laparams = LAParams()
    laparams.all_texts = True
    rsrcmgr = PDFResourceManager()

    # Collect page texts and join once instead of repeated string ``+=``.
    page_texts = []
    with open(in_filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            retstr = StringIO()
            try:
                device = TextConverter(rsrcmgr,
                                       retstr,
                                       codec='utf-8',
                                       laparams=laparams)
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                page_texts.append(retstr.getvalue())
            finally:
                retstr.close()

    with open(out_filename, 'w') as f:
        print(''.join(page_texts), file=f)
Example #14
0
def pdf2str(path):
    """Return the text of the PDF at ``path`` as produced by ``TextConverter``.

    Layout analysis runs with ``all_texts`` and ``detect_vertical`` enabled;
    the converter is created with ``codec='utf-8'``. All pages are requested
    with caching on and ``check_extractable=True``.

    The file, the converter and the buffer are released even when page
    processing raises; exceptions propagate to the caller.
    """
    # Allocate resources
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()

    # Set parameters
    laparams.all_texts = True
    laparams.detect_vertical = True

    # Initialize the converter
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)

    try:
        try:
            # Open the file and parse
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, set(), caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        # Named ``text`` rather than ``str`` to avoid shadowing the builtin.
        text = retstr.getvalue()
    finally:
        retstr.close()
    return text
Example #15
0
def to_text(path):
    """Wrapper around `pdfminer`.

    On Python 2 (``StringIO`` module importable) the interpreter default
    encoding is switched to UTF-8 via ``reload(sys)`` /
    ``sys.setdefaultencoding``; otherwise ``io.StringIO`` is used. A fixed
    local pdfminer checkout directory is added to ``sys.path`` before
    pdfminer is imported.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        returns extracted text from pdf, passed through ``.encode('utf-8')``

    """
    import sys

    try:
        # python 2
        from StringIO import StringIO

        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        from io import StringIO

    # Register the checkout only once; appending unconditionally made
    # sys.path grow by one duplicate entry per call.
    pdfminer_dir = "/home/teemo/source/pdfminer/"
    if pdfminer_dir not in sys.path:
        sys.path.append(pdfminer_dir)

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    with open(path, 'rb') as fp:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = PDFPage.get_pages(
            fp,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in pages:
            interpreter.process_page(page)
    device.close()
    # Named ``text`` rather than ``str`` to avoid shadowing the builtin.
    text = retstr.getvalue()
    retstr.close()
    return text.encode('utf-8')
Example #16
0
def convert(fname, pages=None, M=1.0, L=0.3, W=0.2, F=0.5):
    """ Converts a pdf filename into plain text.

    Each value is specified not as an actual length, but as a proportion of the length
    to the size of each character in question.

    Parameters define layout analysis. In a PDF, text is in several chunks of various types.
    Text extraction needs to recover text chunks, which are regarded as continuous if
    the distance between elements is smaller than the char_margin (identified as M) and
    thus are grouped into one block. Two lines are part of the same text if they are closer
    than the line_margin (L). If the distance between two words is greater than the
    word_margin (W), blank characters (spaces) shall be inserted as necessary to keep format.
    Boxes flow (F) specifies how much a horizontal and vertical position of a text matters
    when determining text flow order. The value should be within the range from -1.0
    (only horizontal position matters) to +1.0 (only vertical position matters).

    Keyword arguments:

      fname -- PDF file name (string)
      pages -- Set of pages to extract (set); a falsy value selects an empty set
      M -- char_margin (float)
      L -- line_margin (float)
      W -- word_margin (float)
      F -- boxes_flow (float)

    Return:
      text: content of the in-memory ``BytesIO`` buffer the converter wrote
      to (the converter is created with the utf-8 codec)

    The input file, the converter and the buffer are released even when page
    processing raises; exceptions propagate to the caller.
    """
    pagenums = set(pages) if pages else set()

    manager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = False
    laparams.char_margin = M
    laparams.line_margin = L
    laparams.word_margin = W
    laparams.boxes_flow = F

    output = BytesIO()
    try:
        converter = TextConverter(manager, output, codec="utf-8", laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        text = output.getvalue()
    finally:
        # The original wrote ``output.close`` without parentheses, which
        # never actually closed the buffer.
        output.close()
    return text
Example #17
0
def to_text(path):
    """Extract the text of a PDF through `pdfminer`.

    When the Python 2 ``StringIO`` module is importable, the interpreter's
    default encoding is switched to UTF-8 (``reload(sys)`` followed by
    ``sys.setdefaultencoding``); otherwise ``io.StringIO`` is used instead.

    Parameters
    ----------
    path : str
        path of electronic invoice in PDF

    Returns
    -------
    str : str
        extracted text from the pdf, passed through ``.encode('utf-8')``

    """

    try:
        # python 2
        from StringIO import StringIO
        import sys

        reload(sys)  # noqa: F821
        sys.setdefaultencoding('utf8')
    except ImportError:
        from io import StringIO

    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfpage import PDFPage

    resources = PDFResourceManager()
    buf = StringIO()
    layout = LAParams()
    layout.all_texts = True
    converter = TextConverter(resources, buf, codec='utf-8', laparams=layout)
    with open(path, 'rb') as pdf_file:
        interpreter = PDFPageInterpreter(resources, converter)
        # Empty page set and maxpages=0 are passed straight to pdfminer.
        page_iter = PDFPage.get_pages(
            pdf_file,
            set(),
            maxpages=0,
            password="",
            caching=True,
            check_extractable=True,
        )
        for page in page_iter:
            interpreter.process_page(page)
    converter.close()
    extracted = buf.getvalue()
    buf.close()
    return extracted.encode('utf-8')
Example #18
0
 def readText(self,path, outtype='text', opts={}):
     outfile = path[:-3] + outtype
     outdir = '/'.join(path.split('/')[:-1])
     # debug option
     pagenos = set()
     maxpages = 0
     # output option
     # ?outfile = None
     # ?outtype = None
     outdir = None
     #layoutmode = 'normal'
     codec = 'utf-8'
     pageno = 1
     scale = 1
     showpageno = True
     laparams = LAParams()
     for (k, v) in opts:
         if k == '-d': debug += 1
         elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
         elif k == '-m': maxpages = int(v)
         elif k == '-P': password = v
         elif k == '-o': outfile = v
         elif k == '-n': laparams = None
         elif k == '-A': laparams.all_texts = True
         elif k == '-V': laparams.detect_vertical = True
         elif k == '-M': laparams.char_margin = float(v)
         elif k == '-L': laparams.line_margin = float(v)
         elif k == '-W': laparams.word_margin = float(v)
         elif k == '-F': laparams.boxes_flow = float(v)
         elif k == '-Y': layoutmode = v
         elif k == '-O': outdir = v
         elif k == '-t': outtype = v
         elif k == '-c': codec = v
         elif k == '-s': scale = float(v)
     print laparams
     #
     #PDFDocument.debug = debug
     #PDFParser.debug = debug
     CMapDB.debug = self.debug
     PDFResourceManager.debug = self.debug
     PDFPageInterpreter.debug = self.debug
     PDFDevice.debug = self.debug
     #
     rsrcmgr = PDFResourceManager()
     #outtype = 'text'
     outfp = StringIO()
     device = HTMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
     fp = file(path, 'rb')
     process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password, check_extractable=True)
     fp.close()
     device.close()
     print outfp.getvalue()
     outfp.close()
     return
Example #19
0
def convert_to_text_file(filename_in, filename_out, rewrite=False):
    """
        Convert a BORME PDF file into a text file.

        filename_in: path of the PDF to read.
        filename_out: path of the text file to write. When it is an existing
            directory, the output is placed inside it under the base name of
            ``filename_in``.
        rewrite: when False (default) an already existing output file is
            left untouched.

        Returns False when the output already exists and ``rewrite`` is
        False, True after a completed conversion. Errors opening or parsing
        the files propagate to the caller.
    """

    if os.path.isdir(filename_out):
        filename_out = os.path.join(filename_out, os.path.basename(filename_in))

    if os.path.exists(filename_out) and not rewrite:
        logging.info('Skipping file %s already exists and rewriting is disabled!', filename_out)
        return False

    # conf
    codec = 'utf-8'
    laparams = LAParams()
    imagewriter = None
    pagenos = set()
    maxpages = 0
    password = ''
    rotation = 0

    # <LAParams: char_margin=2.0, line_margin=0.5, word_margin=0.1 all_texts=False>
    laparams.detect_vertical = True
    laparams.all_texts = False
    laparams.char_margin = 2.0
    laparams.line_margin = 0.5
    laparams.word_margin = 0.1

    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)

    # Open the input first: if it is missing, no empty output file is left
    # behind to make later non-rewrite runs skip the conversion. The context
    # managers close both files even when pdfminer raises.
    with open(filename_in, 'rb') as fp, open(filename_out, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # https://github.com/euske/pdfminer/issues/72
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        finally:
            device.close()

    return True
Example #20
0
 def to_text(self):
     """Return the text of every page in ``self._doc``.

     Layout analysis runs with ``detect_vertical`` and ``all_texts``
     enabled and ``word_margin`` set to 0.4. The buffer content is passed
     through ``.decode('utf-8', 'ignore')`` before being returned.
     """
     resources = PDFResourceManager()
     buf = StringIO()
     layout = LAParams()
     layout.detect_vertical = True
     layout.all_texts = True
     layout.word_margin = 0.4
     converter = TextConverter(resources, buf, laparams=layout)
     interpreter = PDFPageInterpreter(resources, converter)
     for page in self._doc.get_pages():
         interpreter.process_page(page)
     raw_text = buf.getvalue()
     return raw_text.decode('utf-8', 'ignore')
Example #21
0
    def get_text(self):
        """Returns all text content from the PDF as plain text.

        The PDF at ``self.path`` is first converted with pdfminer. If that
        yields no text (``None`` or only whitespace), the pages are rendered
        to PNG with GhostScript and run through OCR instead. The result is
        stored in ``self.text`` and returned.

        Errors raised while pdfminer processes the file are logged and
        re-raised.
        """
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)

        try:
            # ``with`` only closes a handle that was actually opened; the
            # previous ``finally`` referenced ``file_pointer`` even when
            # opening failed, replacing the real error with a NameError.
            with open(self.path, 'rb') as file_pointer:
                process_pdf(rsrcmgr, device, file_pointer)
        except Exception as e:
            logging.error("Error processing PDF: %s" % e)
            raise
        finally:
            device.close()

        text = retstr.getvalue()
        retstr.close()
        if (text is None) or (text.strip() == ""):
            logging.info("No text found in PDF. Attempting OCR. This will take a while.")
            text = self._ocr_text_fallback()
        self.text = text
        return text

    def _ocr_text_fallback(self):
        """Render ``self.path`` to PNG pages with ``gs`` and OCR them.

        Images are written to ``temp/page%03d.png``; every ``.png`` found in
        ``temp`` is OCR'ed with language ``swe`` and deleted afterwards.
        A failure to start GhostScript is logged and OCR still runs over
        whatever is present in ``temp``.
        """
        import os
        import subprocess
        import time

        # First, convert to image
        arglist = ["gs",
                   "-dNOPAUSE",
                   "-sOutputFile=temp/page%03d.png",
                   "-sDEVICE=png16m",
                   "-r72",
                   self.path]
        try:
            # subprocess.STDOUT is only meaningful as the *stderr* target;
            # send both streams to the null device instead.
            with open(os.devnull, "w") as devnull:
                subprocess.call(
                    args=arglist,
                    stdout=devnull,
                    stderr=subprocess.STDOUT)
        except OSError:
            logging.error("Failed to run GhostScript (using `gs`)")

        # Do OCR
        time.sleep(1)  # make sure the server has time to write the files
        import Image
        import pytesseract
        text = ""
        for file_ in os.listdir("temp"):
            if file_.endswith(".png"):
                text += pytesseract.image_to_string(Image.open("temp/" + file_), lang="swe")
                os.unlink("temp/" + file_)
        return text
Example #22
0
def convert_pdf_to_string(file_path):
    """Return the text of the PDF at ``file_path``.

    Layout analysis runs with ``all_texts`` enabled. All pages produced by
    ``PDFPage.create_pages`` are processed into one in-memory buffer whose
    content is returned.
    """
    output_string = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    with open(file_path, 'rb') as in_file:
        parser = PDFParser(in_file)
        doc = PDFDocument(parser)
        rsrcmgr = PDFResourceManager()
        # Pass the configured parameters; a fresh LAParams() was passed here
        # before, which silently discarded the all_texts setting above.
        device = TextConverter(rsrcmgr, output_string, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)

    return (output_string.getvalue())
Example #23
0
    def _pdf2text(self,fp):
        """Extract ASCII text and the creation-date string from an open PDF.

        ``fp`` is read twice: once by ``process_pdf`` for the text and once
        by a separate ``PDFParser``/``PDFDocument`` pair for the metadata.

        Returns:
            ``(created, text, True)`` on success, where ``created`` is the
            ``CreationDate`` info value with its first two and last seven
            characters removed (``""`` if it cannot be read). On any error
            the error is reported via ``self._report`` and
            ``(None, "", False)`` is returned.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'ascii'
            laparams = LAParams()
            laparams.all_texts = True
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

            process_pdf(rsrcmgr, device, fp)
            device.close()

            # Drop anything that is not plain ASCII.
            result = retstr.getvalue()
            txt = result.encode('ascii','ignore')

            # TODO: clean this up, the document is effectively parsed twice.
            # http://stackoverflow.com/a/16503222/2154772
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            # CreationDate is not always the same type when it comes back
            # from the parser: it is either a string or an object that has
            # to be resolved first.
            created = ""
            try:
                creation_date = doc.info[0]['CreationDate']
                if not isinstance(creation_date, basestring):
                    # Was assigned to a misspelled name (``creatd``), so the
                    # resolved date was silently discarded.
                    created = creation_date.resolve()[2:-7]
                else:
                    created = creation_date[2:-7]
            except Exception:
                self._report("CreationDate field could not be decoded within PDF, setting to ''")
            created = created.encode('ascii','ignore')
            retVal = (created,txt,True)
            retstr.close()
        except Exception as e:
            self._report("Error: \n\t%s" % str(e))
            retVal = (None,"",False)

        # The result was previously computed but never handed back.
        return retVal
Example #24
0
    def _pdf2text(self,fp):
        """Extract ASCII text and the creation timestamp from an open PDF.

        ``fp`` is read twice: once by ``process_pdf`` for the text and once
        by a separate ``PDFParser``/``PDFDocument`` pair for the metadata.

        Returns:
            ``(created, text, True)`` on success, where ``created`` is a
            ``datetime`` built from the ``CreationDate`` info value (first
            two and last seven characters removed, parsed as
            ``%Y%m%d%H%M%S``). On any error ``(None, "", False)`` is
            returned after reporting the error.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'ascii'
            laparams = LAParams()
            laparams.all_texts = True
            device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

            process_pdf(rsrcmgr, device, fp)
            device.close()

            # Drop anything that is not plain ASCII.
            result = retstr.getvalue()
            txt = result.encode('ascii','ignore')

            # TODO: clean this up, the document is effectively parsed twice.
            # http://stackoverflow.com/a/16503222/2154772
            parser = PDFParser(fp)
            doc = PDFDocument()
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()

            # CreationDate is not always the same type when it comes back
            # from the parser: it is either a string or an object that has
            # to be resolved first.
            creation_date = doc.info[0]['CreationDate']
            if not isinstance(creation_date, basestring):
                datestring = creation_date.resolve()[2:-7]
            else:
                datestring = creation_date[2:-7]
            ts = strptime(datestring, "%Y%m%d%H%M%S")
            created = datetime.fromtimestamp(mktime(ts))

            retVal = (created,txt,True)
            retstr.close()
        except Exception as e:
            # NOTE(review): ``_reportstr`` is kept as written; confirm the
            # class defines it (it may have been meant to be ``_report``).
            self._reportstr("Error: \n\t%s" %str(e))
            retVal = (None,"",False)

        # The result was previously computed but never handed back.
        return retVal
def getPdfAsText(pdfPages = None, fileDescriptor = None):
    """Return the text of the given PDF pages.

    Parameters:
        pdfPages: iterable of pages to process. When ``None`` and
            ``fileDescriptor`` is given, the pages are obtained from
            ``getPdfPages(fileDescriptor)``.
        fileDescriptor: source passed to ``getPdfPages`` when ``pdfPages``
            is not supplied.

    Layout analysis runs with ``all_texts`` and ``detect_vertical`` enabled.
    The converter and the buffer are closed before returning, also when
    processing raises.
    """
    if pdfPages is None and fileDescriptor is not None:
        pdfPages = getPdfPages(fileDescriptor)

    resourceManager = PDFResourceManager()
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True

    # Each resource is created *before* the try block that cleans it up, so
    # cleanup never touches a name that was not bound (the single ``finally``
    # used before could raise NameError and mask the original error).
    outputStream = StringIO.StringIO()
    try:
        device = TextConverter(resourceManager, outputStream, laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(resourceManager, device)
            for pdfPage in pdfPages:
                interpreter.process_page(pdfPage)
            return outputStream.getvalue()
        finally:
            device.close()
    finally:
        outputStream.close()
Example #26
0
def read_pdf(fp, password='', *page_numbers):
    """Run every page of the PDF in ``fp`` through a ``TextAnalyzer``.

    ``password`` is handed to ``PDFDocument``. ``page_numbers`` is accepted
    but not used by this function. Raises ``PDFTextExtractionNotAllowed``
    when the document reports it is not extractable. Returns whatever the
    analyzer's ``get_result()`` yields after it has been closed.
    """
    # Parse the file and build the document structure.
    document = PDFDocument(PDFParser(fp), password)
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed

    resources = PDFResourceManager(caching=True)
    layout = LAParams()
    layout.all_texts = False
    analyzer = TextAnalyzer(resources, sys.stdout, laparams=layout)
    interpreter = PDFPageInterpreter(resources, analyzer)

    # Feed each page to the analyzer.
    for page in PDFPage.create_pages(document):
        interpreter.process_page(page)
    analyzer.close()
    return analyzer.get_result()
Example #27
0
    def __init__(self, pdffile):
        """Create the PDF Document object

        Reads the PDF file ``pdffile``, converts it to a text string stored
        in ``self.pdffile_text`` and keeps the document info in
        ``self.info``. ``self.creation_date`` is parsed from the
        ``CreationDate`` info entry (format ``%Y%m%d%H%M%S``) and
        ``self.scores`` starts out as an empty dict.

        Raises ``PDFTextExtractionNotAllowed`` when the document reports it
        is not extractable. The file, converter and buffer are released
        even when an error occurs.
        """
        self.scores = {}
        laparams = LAParams()
        laparams.all_texts = True
        sio = StringIO()

        try:
            with open(pdffile, "rb") as fp:
                # Create a PDF document object that stores the document structure.
                document = PDFDocument(PDFParser(fp))

                # Check if the document allows text extraction. If not, abort.
                if not document.is_extractable:
                    raise PDFTextExtractionNotAllowed

                # Create a PDF resource manager object that stores shared resources.
                rsrcmgr = PDFResourceManager()
                # The text converter is the only device needed; a plain
                # PDFDevice used to be created here and overwritten at once.
                device = TextConverter(rsrcmgr, sio, laparams=laparams)
                try:
                    # Process each page contained in the document.
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.create_pages(document):
                        interpreter.process_page(page)
                finally:
                    device.close()

                self.pdffile_text = sio.getvalue()
                self.info = document.info
                # 20190915234815+02'00'
                self.creation_date = datetime.strptime(
                    str(self.info[0]["CreationDate"]).split("+")[0].split(":")[1],
                    "%Y%m%d%H%M%S",
                )
        finally:
            sio.close()
Example #28
0
def pdf(f):
    """Extract the text of a PDF file.

    Args:
        f: Path of the PDF file to read.

    Returns:
        Everything the text converter wrote into the in-memory buffer
        (the converter is configured with the ``utf-8`` codec).
    """
    rsrcmgr = PDFResourceManager()
    retstr = cStringIO.StringIO()

    laparams = LAParams()
    laparams.all_texts = True

    device = TextConverter(
        rsrcmgr, retstr, codec='utf-8', laparams=laparams
    )

    try:
        try:
            # ``open`` replaces the Python-2-only ``file`` builtin; the
            # ``with`` block closes the input even if processing raises.
            with open(f, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Read the buffer only after the device is closed, as before.
        return retstr.getvalue()
    finally:
        retstr.close()
Example #29
0
def to_text(path):
    """Extract the text of every page of a PDF file.

    Args:
        path: Path of the PDF file to read.

    Returns:
        Everything the text converter wrote into the in-memory buffer
        (the converter is configured with the ``utf-8`` codec).
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = True
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        try:
            # ``open`` replaces the Python-2-only ``file`` builtin; the
            # ``with`` block closes the input even if a page fails.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True)
                for page in pages:
                    interpreter.process_page(page)
        finally:
            device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
Example #30
0
def pdfcn(pdf_dir=r'D:\dataset\acl10_12s', txt_dir=r'D:\dataset\acl10_12_txt'):
    """Convert every not-yet-converted PDF in ``pdf_dir`` to a text file.

    Lets a crashed batch run be resumed: a PDF is skipped when ``txt_dir``
    already holds a file with the same base name. Names are compared with
    their last four characters stripped, which assumes ".pdf" / ".txt".

    Args:
        pdf_dir: Directory holding the source ``.pdf`` files.
        txt_dir: Directory receiving the ``.txt`` files.

    Returns:
        None. Progress and failures are printed.
    """
    # Resume from where an earlier run crashed: find the PDFs that still
    # have to be converted.
    converted = [name[:-4] for name in os.listdir(txt_dir)]
    print(converted)
    already_done = set(converted)  # O(1) membership tests
    pending = [name[:-4] for name in os.listdir(pdf_dir)
               if name[:-4] not in already_done]
    if not pending:
        # Nothing to do (the old code hit a NameError on ``device`` here).
        return

    laparams = LAParams()
    laparams.all_texts = True
    codec = 'utf-8'

    for pdf in pending:
        outfile = os.path.join(txt_dir, pdf + '.txt')
        infile = os.path.join(pdf_dir, pdf + '.pdf')
        try:
            rsrc = PDFResourceManager()
            # Every output file and device is closed as soon as its PDF is
            # done; previously only the last pair was ever closed.
            with open(outfile, 'w') as outfp:
                device = TextConverter(rsrc, outfp, codec=codec,
                                       laparams=laparams)
                try:
                    with open(infile, 'rb') as fp:
                        process_pdf(rsrc, device, fp, None,
                                    maxpages=0, password='')
                finally:
                    device.close()
            print('%s finishing ' % pdf)
        except Exception as exc:
            # Best effort: report the failure and go on with the next PDF.
            # NOTE(review): a failed conversion leaves a partial .txt file,
            # so the next run treats that PDF as already converted.
            print('%s failed: %s' % (pdf, exc))
            continue
Example #31
0
def parse_pdf(path):
    """Extract the text of a PDF file as a list of lines.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The stripped text split on newlines, or an empty list when no
        text was collected.
    """
    retstr = StringIO()

    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    rmngr = PDFResourceManager(caching=True)
    # MyTextConverter is defined elsewhere; this function reads its
    # ``text_output`` attribute, presumably a sequence of text chunks.
    device = MyTextConverter(rmngr,
                             retstr,
                             laparams=laparams,
                             imagewriter=None)
    interpreter = PDFPageInterpreter(rmngr, device)

    # The ``with`` block closes the input even if a page fails to process.
    with open(path, 'rb') as fd:
        for page in PDFPage.get_pages(fd, set(), check_extractable=True):
            interpreter.process_page(page)

    fulltext = ''.join(device.text_output).strip()
    if not fulltext:
        return []
    return fulltext.split("\n")
Example #32
0
def convert_pdf(target_fn):
    """Return the text of the PDF file ``target_fn`` as a decoded string.

    Layout analysis runs on all text with vertical-text detection enabled.
    The converter writes UTF-8 into an in-memory buffer, which is decoded
    before being returned.
    """
    layout = LAParams()
    layout.all_texts = True
    layout.detect_vertical = True

    manager = PDFResourceManager(caching=True)
    text_buffer = StringIO.StringIO()
    converter = TextConverter(manager, text_buffer, codec='utf-8',
                              laparams=layout, imagewriter=None)
    page_interpreter = PDFPageInterpreter(manager, converter)

    with open(target_fn, 'rb') as pdf_file:
        for pdf_page in PDFPage.get_pages(pdf_file):
            page_interpreter.process_page(pdf_page)
    converter.close()

    # Rewind so the whole buffer is read back.
    text_buffer.seek(0)
    raw = text_buffer.read()
    return raw.decode('utf-8')
Example #33
0
def to_text(path):
    """Wrapper around pdfminer.

    Every page is processed twice: once by a text converter and once by a
    layout aggregator.

    Args:
        path: Path of the PDF file to read.

    Returns:
        A tuple ``(text, objects)``. ``text`` is everything the text
        converter wrote. ``objects`` is a list of
        ``(content_from_layout(layout), page_index)`` tuples, one per page,
        with a zero-based ``page_index``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()
    laparams.all_texts = False
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    deviceLayout = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreterLayout = PDFPageInterpreter(rsrcmgr, deviceLayout)

    objects = []
    try:
        try:
            # The ``with`` block closes the input even if a page fails.
            with open(path, 'rb') as fp:
                pages = PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True)
                for page_n, page in enumerate(pages):
                    interpreter.process_page(page)
                    interpreterLayout.process_page(page)
                    layout = deviceLayout.get_result()
                    objects.append((content_from_layout(layout), page_n))
        finally:
            device.close()
            deviceLayout.close()
        return retstr.getvalue(), objects
    finally:
        retstr.close()
Example #34
0
def main(argv=None):
    """Command-line entry point: convert PDF file(s) to text, HTML, XML or tags.

    Args:
        argv: Argument strings without the program name, as accepted by
            ``ArgumentParser.parse_args``. ``None`` means ``sys.argv[1:]``.

    The output type comes from ``-t`` or, when that is absent, from the
    extension of the output file name; plain text is the fallback.
    """
    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    parser.add_argument('file',
                        nargs='*',
                        type=argparse.FileType('rb'),
                        # Must be a list: with a bare ``sys.stdin`` the loop
                        # below iterated over the lines of stdin. Use the
                        # binary stream where the interpreter has one.
                        default=[getattr(sys.stdin, 'buffer', sys.stdin)],
                        help='file(s) to convert')
    parser.add_argument('-C',
                        '--nocache',
                        dest='cache',
                        action='store_false',
                        help='prevent object caching (slower)')
    parser.add_argument('-l',
                        metavar='level',
                        default='warn',
                        help='logging level (warn, info, debug)')
    parser.add_argument('-p',
                        metavar='page',
                        nargs='+',
                        default=[],
                        type=int,
                        help='page number(s) (space separated)')
    parser.add_argument('-m',
                        metavar='maxpages',
                        default=0,
                        type=int,
                        help='maximum number of pages to extract')
    parser.add_argument('-P',
                        metavar='password',
                        default='',
                        help='pdf password')
    parser.add_argument('-o',
                        metavar='outfile',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O',
                        metavar='directory',
                        type=ImageWriter,
                        help='extract images and save to directory')
    parser.add_argument('-t',
                        metavar='outtype',
                        help='output type (text, html, xml, tag)')
    parser.add_argument('-c',
                        metavar='codec',
                        default='utf-8',
                        help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n',
                         action='store_true',
                         help='disable layout analysis')
    lagroup.add_argument('-A',
                         action='store_true',
                         help='force layout analysis on all text')
    lagroup.add_argument('-V',
                         action='store_true',
                         help='detect vertical text')
    lagroup.add_argument('-M',
                         metavar='char_margin',
                         type=float,
                         help='custom character margin')
    lagroup.add_argument('-L',
                         metavar='line_margin',
                         type=float,
                         help='custom line margin')
    lagroup.add_argument('-W',
                         metavar='word_margin',
                         type=float,
                         help='custom word margin')
    lagroup.add_argument('-F',
                         metavar='boxes_flow',
                         type=float,
                         help='custom boxes flow')
    lagroup.add_argument('-Y',
                         metavar='layout_mode',
                         default='normal',
                         help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s',
                         metavar='scale',
                         default=1,
                         type=float,
                         help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    if args.n:
        laparams = None
    else:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so that an explicit 0 (e.g. ``-F 0``) is
        # honoured instead of being mistaken for "not given".
        if args.M is not None:
            laparams.char_margin = args.M
        if args.L is not None:
            laparams.line_margin = args.L
        if args.W is not None:
            laparams.word_margin = args.W
        if args.F is not None:
            laparams.boxes_flow = args.F

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # Guess the output type from the output file's extension.
        outname = str(getattr(args.o, 'name', ''))
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              args.o,
                              codec=args.c,
                              laparams=laparams,
                              imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               args.o,
                               codec=args.c,
                               scale=args.s,
                               layoutmode=args.Y,
                               laparams=laparams,
                               imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr,
                               args.o,
                               codec=args.c,
                               laparams=laparams,
                               imagewriter=args.O)
    try:
        for fp in args.file:
            try:
                # -p is one-based on the command line, zero-based here.
                process_pdf(rsrcmgr,
                            device,
                            fp, [i - 1 for i in args.p],
                            maxpages=args.m,
                            password=args.P,
                            caching=args.cache,
                            check_extractable=True)
            finally:
                fp.close()
    finally:
        # Close the device and the output even when a file fails.
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
Example #35
0
    #parser = PDFParser(open_file)
    # Create a PDF document object that stores the document structure.
    #doc = PDFDocument(parser)
    # Connect the parser and document objects.
    #print parser.nextline()
    #print parser.nextline()
    #print parser.nextline()


    ##ATTEMPT 2
    #Code from pdf2txt.py
    laparams = LAParams()
    laparams.char_margin = 2.0
    laparams.line_margin=0.5
    laparams.word_margin=0.1
    laparams.all_texts=False

    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, fp_out, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    pdf_pages = PDFPage.get_pages(fp_in, set())
    pagenum = 0
    pagelim = 3
    for page in pdf_pages:
        pagenum += 1
        if pagenum > pagelim:
            continue
        print "Transcribing page " + str(pagenum) + " from PDF to text"
        interpreter.process_page(page)
    fp_in.close()
    fp_out.close()
Example #36
0
def _pdf2txt_build_parser():
    """Build the argument parser with every pdf2txt.py command-line option."""
    parser = ArgumentParser(prog='pdf2txt.py',
                            description='Convert pdf to txt',
                            formatter_class=ArgumentDefaultsHelpFormatter)

    parser.add_argument('-d', dest='debuglevel', action='count', default=0,
                        help='Debug (repeat for more verbose debugging)')
    parser.add_argument(
        '-p', '--pages', dest='pagenos', action='store', type=str, default='',
        help='Specifies the comma-separated list of the page numbers to be extracted. Page numbers start at one. By default, it extracts text from all the pages.')
    parser.add_argument('-c', '--codec', dest='codec', action='store',
                        type=str, default='utf-8',
                        help='Specifies the output codec.')
    parser.add_argument(
        '-t', '--type', dest='outtype', action='store', type=str,
        default='shape',
        choices=['text', 'html', 'xml', 'tag', 'shape'],
        help='Specifies the output format, one of: shape, text, html, xml, tag')
    parser.add_argument(
        '-m', dest='maxpages', action='store', type=int, default=0,
        help='Specifies the maximum number of pages to extract. By default (0), it extracts all the pages in a document.')
    parser.add_argument(
        '-P', '--password', dest='password', action='store', type=str,
        default='',
        help='Provides the user password to access PDF contents.')
    parser.add_argument(
        '-o', '--output', dest='outfile', action='store', type=str,
        default=None,
        help='Specifies the output file name. By default, it prints the extracted contents to stdout in text format.')
    parser.add_argument(
        '-C', '--no-caching', dest='caching', action='store_false',
        default=True,
        help='Suppress object caching. This will reduce the memory consumption but also slows down the process.')
    parser.add_argument('-n', '--no-layout', dest='layout',
                        action='store_false', default=True,
                        help='Suppress layout analysis.')
    parser.add_argument('--show-pageno', dest='show_pageno',
                        action='store_true', default=False,
                        help='Show page numbers.')
    parser.add_argument(
        '-A', '--analyze-all', dest='all_texts', action='store_true',
        default=False,
        help='Forces to perform layout analysis for all the text strings, including text contained in figures.')
    parser.add_argument('-V', '--detect-vertical', dest='detect_vertical',
                        action='store_true', default=False,
                        help='Allows vertical writing detection.')
    parser.add_argument(
        '-M', dest='char_margin', action='store', type=float, default=2.0,
        help='Two text chunks whose distance is closer than the char_margin (shown as M) is considered continuous and get grouped into one.')
    parser.add_argument(
        '-L', dest='line_margin', action='store', type=float, default=0.5,
        help='Two lines whose distance is closer than the line_margin (L) is grouped as a text box, which is a rectangular area that contains a "cluster" of text portions.')
    parser.add_argument(
        '-W', dest='word_margin', action='store', type=float, default=0.1,
        help='It may be required to insert blank characters (spaces) as necessary if the distance between two words is greater than the word_margin (W), as a blank between words might not be represented as a space, but indicated by the positioning of each word.')
    parser.add_argument(
        '-F', dest='boxes_flow', action='store', type=float, default=0.5,
        help='Specifies how much a horizontal and vertical position of a text matters when determining a text order. The value should be within the range of -1.0 (only horizontal position matters) to +1.0 (only vertical position matters).')
    parser.add_argument(
        '-Y', '--layout-mode', dest='layoutmode', action='store', type=str,
        default='normal',
        choices=['exact', 'normal', 'loose'],
        help='Specifies how the page layout should be preserved. (Currently only applies to HTML format.) One of: exact, normal, loose.')
    parser.add_argument('-O', '--image-writer', dest='imagewriter',
                        action='store', type=str, default=None,
                        help='imagewriter')
    parser.add_argument('-R', '--rotation', dest='rotation', action='store',
                        type=int, default=0, help='rotation')
    parser.add_argument('-S', '--strip-control', dest='stripcontrol',
                        action='store_true', default=False,
                        help='stripcontrol')
    parser.add_argument(
        '-s', dest='scale', action='store', type=float, default=1,
        help='Specifies the output scale. Can be used in HTML format only.')
    parser.add_argument(
        '--draw-lines', dest='draw_lines', action='store_true',
        help="Draw crude page representation, coloured TextLines (= short pieces of text). Valid only for the `shape' output.")
    parser.add_argument(
        '--draw-boxes', dest='draw_boxes', action='store_true',
        help="Draw crude page representation, coloured TextBoxes (= grouped text lines). Valid only for the `shape' output.")
    parser.add_argument(
        '--draw-blocks', dest='draw_blocks', action='store_true',
        help="Draw crude page representation, coloured TextBlocks (= grouped TextBoxes). Valid only for the `shape' output.")
    parser.add_argument(
        '--shear-limit', dest='shear_limit', action='store', default=0.1,
        type=float,
        help="If the text is sheared above this limit, reject it. Valid only for the `shape' output.")
    parser.add_argument(
        '--rotation-limit', dest='rotation_limit', action='store', default=2,
        type=float,
        help="If the text is rotated above this angle (in degrees), reject it. Valid only for the `shape' output.")
    parser.add_argument(
        '--line-height-diff', dest='line_height_diff', action='store',
        type=float, default=0.1,
        help='Two lines whose vertical sizes differ more than this ratio are not to be considered of the same paragraph (but e.g. one of them is a heading).')
    parser.add_argument('--heading-before', dest='heading_before',
                        action='store', type=str, default='',
                        help='String to put before each heading, e.g. <h1>')
    parser.add_argument('--heading-after', dest='heading_after',
                        action='store', type=str, default='',
                        help='String to put after each heading, e.g. </h1>')
    parser.add_argument(
        '--box-separator', dest='box_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate boxes with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--block-separator', dest='block_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate blocks with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-separator', dest='indent_separator', action='store',
        type=str, default=r'\n\n',
        help=r'Separate indented lines with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-string', dest='indent_string', action='store', type=str,
        default=r'\t',
        help=r'Put this string in front of indented lines. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--indent-limit', dest='indent_limit', action='store', type=float,
        default=3,
        help='If the line is indented more then this (approximately characters), it will separated by --indent-separator from the previous one.')
    parser.add_argument(
        '--page-separator', dest='page_separator', action='store', type=str,
        default=r'\n\n',
        help=r'Separate pages with this string. Use \n for new line, \t for TAB, other escape sequences are not recognized.')
    parser.add_argument(
        '--norm-whitespace', dest='norm_whitespace', action='store_true',
        default=False,
        help='Normalize whitespace (remove duplicate spaces, replace end of lines with spaces).')
    parser.add_argument(
        '--print-stats', dest='print_stats', action='store_true',
        default=False,
        help='Instead of the text, output some simple statistics about the file.')
    parser.add_argument(
        '--max-blocks', dest='max_blocks', action='store', default=0,
        type=int,
        help='If there is more than this blocks per page, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" file). 0 means no limit. 50 is maybe a good value.')
    parser.add_argument(
        '--max-textlines', dest='max_textlines', action='store', default=0,
        type=int,
        help='If there is more than this textlines per any block, do not return any text. Use to discriminate abnormal files (run --print-stats first to find out the number of boxes per "normal" page). 0 means no limit. 18 is maybe a good value.')
    parser.add_argument(
        '--line-height-method', dest='line_height_method', action='store',
        type=str, default='bbox',
        choices=['bbox', 'mean', 'median'],
        help='Method to calculate height of line (relevant if there are characters with uneven height). bbox takes the bounding box (rectangle encompassing the line), mean the arithmetic mean of the height of all the characters, median is the median of the height of all the characters. Use mean or median if there are outlier characters, e.g. one big character at the beginning of line.')
    parser.add_argument(dest='pdffile',
                        help='List of PDF files to go through',
                        default=None,
                        nargs='+')
    return parser


def main(argv):
    """Convert the PDF files named on the command line.

    Args:
        argv: Full argument vector in ``sys.argv`` form, i.e. ``argv[0]`` is
            the program name and is skipped. An empty or ``None`` value
            falls back to ``sys.argv``.

    Returns:
        None on success, or the result of ``usage()`` for an unknown output
        type.

    Side effects:
        Sets the module globals ``debuglevel`` and ``options``, sets the
        ``debug`` attribute on several pdfminer classes, and writes the
        converted output to ``--output`` or stdout.
    """
    global debuglevel, options

    # Kept only for the debug trace below; argparse is always used.
    using_optparse = False

    parser = _pdf2txt_build_parser()
    # Parse the arguments the caller passed; previously ``argv`` was ignored
    # and ``sys.argv`` was always read.
    cli_args = argv[1:] if argv else None
    args, rest = parser.parse_known_args(cli_args)

    debuglevel = debug = args.debuglevel
    DEBUG(3, 'args:', str(args))
    DEBUG(3, 'rest:', str(rest))

    DEBUG(3, 'optparse:', using_optparse)

    pagenos = set()
    if args.pagenos:
        # Page numbers are one-based on the command line, zero-based here.
        pagenos.update(int(x) - 1 for x in args.pagenos.split(','))
    maxpages = args.maxpages
    outfile = args.outfile
    password = args.password
    caching = args.caching
    showpageno = args.show_pageno

    laparams = None
    if args.layout:
        laparams = LAParams()
        if args.all_texts:
            laparams.all_texts = True
        if args.detect_vertical:
            laparams.detect_vertical = True
        laparams.char_margin = args.char_margin
        laparams.line_margin = args.line_margin
        laparams.word_margin = args.word_margin
        laparams.boxes_flow = args.boxes_flow
    layoutmode = args.layoutmode

    imagewriter = None
    if args.imagewriter:
        imagewriter = ImageWriter(args.imagewriter)

    rotation = args.rotation
    stripcontrol = args.stripcontrol
    outtype = args.outtype
    codec = args.codec
    scale = args.scale

    # Separators arrive with literal backslash sequences; unescape them once.
    args.box_separator = unescape_string(args.box_separator)
    args.block_separator = unescape_string(args.block_separator)
    args.indent_separator = unescape_string(args.indent_separator)
    args.indent_string = unescape_string(args.indent_string)

    args.page_separator = unescape_string(args.page_separator)

    options = args

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        # ``open`` replaces the Python-2-only ``file`` builtin.
        outfp = open(outfile, 'w')
        DEBUG(2, 'output goes to', outfile)
    else:
        outfp = sys.stdout
        DEBUG(2, 'output goes to stdout')
    try:
        if outtype == 'shape':
            device = ShapeTextConverter(rsrcmgr,
                                        outfp,
                                        codec=codec,
                                        laparams=laparams,
                                        showpageno=showpageno,
                                        imagewriter=imagewriter)
        elif outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        elif outtype == 'tag':
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        else:
            return usage()
        try:
            for fname in options.pdffile:
                DEBUG(2, 'processing', fname)
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    DEBUG(2, 'finished.')

    return
def readPDF2HTML(pdfFile, opts=None):
    """Convert a PDF to HTML and return the HTML text.

    Args:
        pdfFile: Object with a ``read()`` method returning the PDF content.
        opts: Optional pdf2txt-style options, either a mapping such as
            ``{'-p': '1,3'}`` or an iterable of ``(flag, value)`` pairs.
            Honoured flags: ``-p`` (one-based, comma-separated pages),
            ``-m``, ``-P``, ``-n``, ``-A``, ``-V``, ``-M``, ``-L``, ``-W``,
            ``-F``, ``-Y`` and ``-s``. Other flags are ignored; the output
            codec is always ``utf-8``.

    Returns:
        The content of the in-memory output buffer after conversion.

    Raises:
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    # Load the PDF into an in-memory buffer.
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    # Create a PDF parser object associated with the buffer.
    parser = PDFParser(fp)
    document = PDFDocument(parser)  # password if needed
    # Check if the document allows text extraction without a password.
    if not document.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Create a PDF resource manager object that stores shared resources.
    rsrcmgr = PDFResourceManager()

    laparams = LAParams()
    pagenos = set()
    # Only options that were actually given are forwarded, so a call without
    # options makes the same library calls as before.
    page_kwargs = {}
    html_kwargs = {}
    margin_attrs = {
        '-M': 'char_margin',
        '-L': 'line_margin',
        '-W': 'word_margin',
        '-F': 'boxes_flow',
    }

    if opts is None:
        pairs = ()
    elif hasattr(opts, 'items'):
        # Iterating a dict directly yields only its keys, not pairs.
        pairs = opts.items()
    else:
        pairs = opts

    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            page_kwargs['maxpages'] = int(v)
        elif k == '-P':
            page_kwargs['password'] = v
        elif k == '-n':
            laparams = None
        elif k == '-Y':
            html_kwargs['layoutmode'] = v
        elif k == '-s':
            html_kwargs['scale'] = float(v)
        elif laparams is not None:
            # Layout options are skipped once '-n' disabled layout analysis.
            if k == '-A':
                laparams.all_texts = True
            elif k == '-V':
                laparams.detect_vertical = True
            elif k in margin_attrs:
                setattr(laparams, margin_attrs[k], float(v))
        # '-d', '-o', '-O', '-t' and '-c' have no effect in this function.

    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams,
                           **html_kwargs)
    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # Process each selected page of the document.
        for page in PDFPage.get_pages(fp, pagenos, **page_kwargs):
            interpreter.process_page(page)
        # Close the device before reading the buffer.
        # NOTE(review): pdfminer's HTMLConverter appears to write its closing
        # markup in close(); reading first dropped it. Confirm.
        device.close()
        return retstr.getvalue()
    finally:
        fp.close()
        retstr.close()
Example #38
0
def main(argv):
    """Convert the PDF files named on the command line to text/HTML/XML/tags.

    ``argv`` is the full argument vector; ``argv[0]`` is only used in the
    usage message.  Options are parsed with ``getopt`` and every remaining
    argument is opened as an input PDF.  All inputs are rendered through a
    single converter device into the ``-o`` file, or ``sys.stdout`` when no
    ``-o`` option is given.

    Returns 100 (after printing the usage line) when the options are invalid,
    no input file is given, or the output type is not one of
    text/html/xml/tag.  Returns None otherwise.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    # Without -t, infer the output type from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output file so that a bad -t value does
    # not truncate an existing file or leak an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrcmgr = PDFResourceManager(caching=caching)
    # open() instead of file(): the file() builtin does not exist in Python 3.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only files opened here.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #39
0
def parsepdf_pdfminer_formal(path, outtype='txt', outfile=None):
    """Render the PDF at ``path`` to a file using pdfminer's converters.

    Parameters:
        path: path of the input PDF, opened in binary mode.
        outtype: one of ``'txt'`` (``'text'`` is accepted as an alias),
            ``'xml'``, ``'html'`` or ``'tag'``.  A falsy value means "infer
            from the ``outfile`` extension", falling back to ``'txt'``.
        outfile: destination path.  Defaults to the historical hard-coded
            location ``C:\\Users\\Administrator\\Desktop\\parseRes_demo.<outtype>``.

    Returns None.

    Raises:
        ValueError: if ``outtype`` is not a supported output type.  The check
            runs before any file is created.
    """
    if not outtype:
        outtype = 'txt'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    elif outtype == 'text':
        # The converter dispatch below keys on 'txt'; accept the pdf2txt
        # spelling as well instead of leaving the device undefined.
        outtype = 'txt'
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported outtype: %r' % (outtype,))
    if outfile is None:
        outfile = r'C:\Users\Administrator\Desktop\parseRes_demo.' + outtype
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)
    with open(outfile, 'w', encoding=encoding) as outfp:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp)
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages, password=password,
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            # Close the device while the output file is still open.
            device.close()
    return
Example #40
0
def main(argv):
    """Command-line entry point: render PDF files as text, HTML, XML or tags.

    ``argv`` is the complete argument vector (``argv[0]`` appears only in the
    usage message).  After ``getopt`` parsing, each positional argument is
    opened as a PDF and all pages are written through one converter device
    to the ``-o`` file, or to ``sys.stdout`` when ``-o`` is not given.

    Returns 100 after printing usage when option parsing fails, no input is
    given, or the output type is unsupported; returns None on success.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    # Without -t, guess the output type from the output file's extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrcmgr = PDFResourceManager(caching=caching)
    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave sys.stdout usable for the caller; close only our own file.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #41
0
def main(argv):
    """Convert PDF files given on the command line via ``process_pdf``.

    ``argv`` is the full argument vector (``argv[0]`` is used only in the
    usage text).  Options are read with ``getopt``; the remaining arguments
    are input PDF paths.  Output goes to the ``-o`` file or ``sys.stdout``.

    Returns 100 after printing usage when the options are invalid, no input
    file is supplied, or the output type is unsupported; otherwise None.
    """
    # getopt extracts the options that accompany the command line.
    import getopt

    def usage():
        # Printed whenever arguments are missing or malformed.
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        # getopt.getopt(args, short_options, [long_options]):
        # a colon after a short option name means that option requires an
        # argument (p, m, P, o, M, L, W, F, Y, O, t, c, s here); an equals
        # sign plays the same role for long options.  Returns (opts, args).
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''           # option -P
    pagenos = set()         # option -p
    maxpages = 0            # option -m
    # output option
    outfile = None          # option -o: output file
    outtype = None          # option -t: output type
    outdir = None           # option -O: output directory
    layoutmode = 'normal'   # option -Y
    codec = 'utf-8'         # option -c
    scale = 1               # option -s
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    # Determine the output format, inferring it from the file extension
    # when -t was not given.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output so a bad type leaves no open file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrcmgr = PDFResourceManager(caching=caching)
    # open() instead of file(), which was removed in Python 3.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            # The original author noted that TextConverter apparently does
            # not accept an outdir argument, so none is passed here.
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Do not close the process-wide stdout, only files opened here.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #42
0
def main(argv):
    """Convert PDFs, then extract three fields from ``cv.txt`` into the DB.

    Stage 1 renders the PDFs named in ``argv`` (``argv[0]`` is only used in
    the usage text) through a pdfminer converter into the ``-o`` file, or
    ``sys.stdout`` when ``-o`` is absent.

    Stage 2 reads ``cv.txt`` from the current directory, drops every line
    containing one of a fixed list of heading words, writes the remainder to
    ``cv_new.txt``, collects the text after ``': '`` on each remaining line,
    and inserts the first three collected values into the ``entries`` table
    using the module-level ``mycursor`` / ``mydb`` objects.

    Returns 100 after printing usage when options are invalid, no input file
    is given, or the output type is unsupported; otherwise None.

    Raises:
        IndexError: if fewer than three ``key: value`` lines are found.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d':
            debug += 1
        elif k == '-P':
            password = v.encode('ascii')
        elif k == '-o':
            outfile = v
        elif k == '-t':
            outtype = v
        elif k == '-O':
            imagewriter = ImageWriter(v)
        elif k == '-c':
            encoding = v
        elif k == '-s':
            scale = float(v)
        elif k == '-R':
            rotation = int(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-S':
            stripcontrol = True
        elif k == '-C':
            caching = False
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    # Without -t, infer the output type from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = open(outfile, 'w', encoding=encoding) if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # sys.stdout must stay open: the print() at the end of this function
        # would otherwise raise "I/O operation on closed file".
        if outfp is not sys.stdout:
            outfp.close()

    # Drop section-heading lines from the extracted CV text.
    bad_words = [
        'Personal', 'Information', 'Projects', 'Internship', 'Technologies'
    ]
    with open('cv.txt') as oldfile, open('cv_new.txt', 'w') as newfile:
        for line in oldfile:
            if not any(bad_word in line for bad_word in bad_words):
                newfile.write(line)

    with open("cv_new.txt", "r") as cleaned:
        lines = cleaned.read().split('\n')

    # Ignore empty lines, single-space lines and bare form feeds.
    lines = [ln for ln in lines if ln not in ("", " ", "\x0c")]

    # Keep the value part of every "key: value" line.
    details = []
    for ln in lines:
        parts = ln.split(': ')
        if len(parts) > 1:
            details.append(parts[1])

    if len(details) < 3:
        raise IndexError(
            'expected at least 3 "key: value" lines in cv_new.txt, found %d'
            % len(details))

    # Parameterized query: values are passed separately from the SQL text.
    sql = "INSERT INTO entries (name, post, exp) VALUES (%s, %s, %s)"
    val = (details[0], details[1], details[2])
    mycursor.execute(sql, val)
    mydb.commit()
    print(mycursor.rowcount, "record inserted.")
    return
Example #43
0
def main(argv):
    """Convert the PDF files in ``argv`` using the ``process_pdf`` helper.

    ``argv`` is the full argument vector (``argv[0]`` is only used in the
    usage message).  Options are parsed with ``getopt``; remaining arguments
    are input PDF paths.  Output is written to the ``-o`` file, or to
    ``sys.stdout`` when ``-o`` is not given.

    Returns 100 after printing usage when options are invalid, no input file
    is given, or the output type is unsupported; otherwise None.
    """
    import getopt

    def usage():
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
              '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
              '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to the pdfminer classes.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    # Without -t, infer the output type from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output file so nothing is truncated or
    # left open when the type is unknown.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrc = PDFResourceManager()
    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:  # 'tag'
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
        finally:
            device.close()
    finally:
        # Only close files opened here, never the interpreter's stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #44
0
def pdf2txt(argv):
    """Convert PDF files using pdf2txt-style options passed as a list.

    Unlike a script ``main``, ``argv`` holds only options and file names:
    the whole list is handed to ``getopt`` (no program name is skipped).
    Output goes to the ``-o`` file or to ``sys.stdout``.

    Returns 100 after printing a usage line when the output type is not one
    of text/html/xml/tag; otherwise None.

    Raises:
        getopt.GetoptError: if ``argv`` contains an unrecognized option or an
            option is missing its argument.
    """
    import getopt

    def usage():
        # Previously usage() was called without being defined, so an unknown
        # output type failed with NameError instead of reporting the problem.
        print('usage: pdf2txt [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
              '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
              '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...')
        return 100

    (opts, args) = getopt.getopt(argv[0:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    # Without -t, infer the output type from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before opening the output file.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrcmgr = PDFResourceManager(caching=caching)
    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   outdir=outdir)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr,
                                device,
                                fp,
                                pagenos,
                                maxpages=maxpages,
                                password=password,
                                caching=caching,
                                check_extractable=True)
        finally:
            device.close()
    finally:
        # This is a library-style function: keep the caller's stdout open.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #45
0
def main(argv):
    """Command-line entry point that converts PDFs through ``process_pdf``.

    ``argv`` is the complete argument vector; ``argv[0]`` is used only for
    the usage message.  Options are parsed with ``getopt`` and each remaining
    argument is opened as an input PDF.  Output is written to the ``-o``
    file, or ``sys.stdout`` when no ``-o`` option is present.

    Returns 100 after printing usage when option parsing fails, no input
    file is given, or the output type is unsupported; otherwise None.
    """
    import getopt

    def usage():
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
            '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
            '[-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # Propagate the debug level to the pdfminer classes.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    # Without -t, guess the output type from the output file's extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrc = PDFResourceManager()
    # open() replaces the Python-2-only file() builtin.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrc,
                                  outfp,
                                  codec=codec,
                                  laparams=laparams,
                                  outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrc,
                                   outfp,
                                   codec=codec,
                                   scale=scale,
                                   laparams=laparams,
                                   outdir=outdir)
        else:  # 'tag'
            device = TagExtractor(rsrc, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrc,
                                device,
                                fp,
                                pagenos,
                                maxpages=maxpages,
                                password=password)
        finally:
            device.close()
    finally:
        # Leave sys.stdout usable for the caller; close only our own file.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #46
0
def main(argv):
    """Convert the PDF files named in ``argv`` to text, HTML, XML or tags.

    ``argv`` is the full argument vector (``argv[0]`` is only used in the
    usage message).  Options are parsed with ``getopt``; the remaining
    arguments are input PDF paths.  Output goes to the ``-o`` file, or to
    ``sys.stdout`` when ``-o`` is absent.  On ``win32`` the output file is
    opened in binary mode; on every other platform it is opened as text with
    the ``-c`` encoding.

    Returns 100 after printing usage when options are invalid, no input file
    is given, or the output type is unsupported; otherwise None.
    """
    import getopt

    def usage():
        print(f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
              ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
              ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
              ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
              ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    # Propagate the debug level to the pdfminer classes.
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    # Without -t, infer the output type from the output file extension.
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject unknown types before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    rsrcmgr = PDFResourceManager(caching=caching)
    if outfile:
        if sys.platform == 'win32':
            outfp = open(outfile, 'wb')
        else:
            # Previously only 'linux' was handled here, which left outfp
            # unbound (UnboundLocalError) on macOS and other platforms.
            outfp = open(outfile, 'w', encoding=encoding)
    else:
        outfp = sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:  # 'tag'
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages, password=password,
                                                  caching=caching, check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Only close files opened here, never the interpreter's stdout.
        if outfp is not sys.stdout:
            outfp.close()
    return
def convert_pdf_To_Txt(path, opts={}):
    """Extract the text of the PDF at *path* and return it.

    Options follow the flags of pdfminer's ``pdf2txt`` tool and are passed
    as ``(flag, value)`` pairs, e.g. ``[('-p', '1,3'), ('-A', '')]``.

    Args:
        path: Path of the PDF file to read.
        opts: Iterable of ``(flag, value)`` pairs.  The shared default is
            only iterated, never mutated.

    Returns:
        The contents of the in-memory buffer that ``TextConverter`` wrote
        to.  When ``-o`` names a file, the same text is also written there.

    Raises:
        ValueError: If the output type (from ``-t`` or inferred from the
            ``-o`` file extension) is anything other than ``'text'``; this
            function only builds a text converter.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    # '-Y' (layout mode) and '-s' (scale) only matter for HTML output, which
    # this function never produces, so they are accepted and ignored.
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v

    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outtype != 'text':
        # Previously this fell through to an unbound ``device`` variable.
        raise ValueError(
            'convert_pdf_To_Txt only supports text output, got %r' % (outtype,))

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams,
                           imagewriter=imagewriter)
    try:
        # ``open`` works on Python 2 and 3; the ``file`` builtin is 2-only.
        with open(path, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages, password=password,
                                          caching=caching,
                                          check_extractable=True):
                interpreter.process_page(page)
        txt2Pdf = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    if outfile:
        # The original opened (and truncated) this file but never wrote to it.
        # Written as bytes so the same code works whether the buffer held
        # encoded bytes or text.
        data = txt2Pdf if isinstance(txt2Pdf, bytes) else txt2Pdf.encode(codec)
        with open(outfile, 'wb') as outfp:
            outfp.write(data)
    return txt2Pdf
Example #48
0
def ConvertPdf(pdfpath, outfp, opts={}):
    """Convert the PDF at *pdfpath* and write the result to *outfp*.

    Args:
        pdfpath: Path of the PDF file to read.
        outfp: Writable stream handed to the pdfminer converter.  It is not
            closed by this function.
        opts: Iterable of ``(flag, value)`` pairs using the flags of
            pdfminer's ``pdf2txt`` tool.  ``-o`` is accepted but ignored
            because the destination is *outfp*.  The shared default is only
            iterated, never mutated.

    Returns:
        ``True`` once every selected page has been processed.

    Raises:
        ValueError: If ``-t`` names an output type other than ``txt``/
            ``text``, ``xml``, ``html`` or ``tag``.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    # Forward '-C' to the resource manager as well as to get_pages().
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'txt'
    # 'text' is what the pdf2txt tool calls this type; accept both spellings.
    if outtype in ('txt', 'text'):
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        # Previously this fell through to an unbound ``device`` variable.
        raise ValueError('unsupported output type: %r' % (outtype,))

    try:
        # ``open`` works on Python 2 and 3; the ``file`` builtin is 2-only.
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        # Close the device even when a page fails to parse.
        device.close()

    return True
Example #49
0
def main(argv=None):
    """Command-line entry point: convert PDF files to text, HTML, XML or tags.

    Args:
        argv: Argument list to parse; ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    The output type is taken from ``-t`` or, failing that, inferred from the
    extension of the ``-o`` file name; anything else produces plain text.
    """
    # Prefer the binary layer of stdin where one exists (Python 3).
    stdin_stream = getattr(sys.stdin, 'buffer', sys.stdin)

    parser = argparse.ArgumentParser(description='Convert PDF into text.')
    # The default has to be a list: the parsed value is iterated below, and
    # iterating a bare stream would yield its lines rather than the stream.
    parser.add_argument('file', nargs='*', type=argparse.FileType('rb'), default=[stdin_stream], help='file(s) to convert')
    parser.add_argument('-C', '--nocache', dest='cache', action='store_false', help='prevent object caching (slower)')
    parser.add_argument('-l', metavar='level', default='warn', help='logging level (warn, info, debug)')
    parser.add_argument('-p', metavar='page', nargs='+', default=[], type=int, help='page number(s) (space separated)')
    parser.add_argument('-m', metavar='maxpages', default=0, type=int, help='maximum number of pages to extract')
    parser.add_argument('-P', metavar='password', default='', help='pdf password')
    parser.add_argument('-o', metavar='outfile', type=argparse.FileType('w'), default=sys.stdout,
                        help='output file name (default: stdout)')
    parser.add_argument('-O', metavar='directory', type=ImageWriter, help='extract images and save to directory')
    parser.add_argument('-t', metavar='outtype', help='output type (text, html, xml, tag)')
    parser.add_argument('-c', metavar='codec', default='utf-8', help='output text encoding (default: %(default)s)')
    lagroup = parser.add_argument_group(title='layout analysis')
    lagroup.add_argument('-n', action='store_true', help='disable layout analysis')
    lagroup.add_argument('-A', action='store_true', help='force layout analysis on all text')
    lagroup.add_argument('-V', action='store_true', help='detect vertical text')
    lagroup.add_argument('-M', metavar='char_margin', type=float, help='custom character margin')
    lagroup.add_argument('-L', metavar='line_margin', type=float, help='custom line margin')
    lagroup.add_argument('-W', metavar='word_margin', type=float, help='custom word margin')
    lagroup.add_argument('-F', metavar='boxes_flow', type=float, help='custom boxes flow')
    lagroup.add_argument('-Y', metavar='layout_mode', default='normal', help='layout mode for HTML (normal, exact, loose)')
    lagroup.add_argument('-s', metavar='scale', default=1, type=float, help='output scaling for HTML')
    args = parser.parse_args(argv)

    logging.basicConfig()
    logging.getLogger('pdfminer').setLevel(args.l.upper())

    laparams = None
    if not args.n:
        laparams = LAParams()
        laparams.all_texts = args.A
        laparams.detect_vertical = args.V
        # Compare against None so that an explicit 0 (e.g. ``-F 0``) is kept.
        margins = (
            ('char_margin', args.M),
            ('line_margin', args.L),
            ('word_margin', args.W),
            ('boxes_flow', args.F),
        )
        for attr, value in margins:
            if value is not None:
                setattr(laparams, attr, value)

    rsrcmgr = PDFResourceManager(caching=args.cache)
    outtype = args.t
    if not outtype:
        # A replaced sys.stdout may have no ``name``; treat that as "no hint".
        outname = str(getattr(args.o, 'name', ''))
        if outname.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outname.endswith('.xml'):
            outtype = 'xml'
        elif outname.endswith('.tag'):
            outtype = 'tag'
    if outtype == 'xml':
        device = XMLConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr, args.o, codec=args.c, scale=args.s, layoutmode=args.Y,
                               laparams=laparams, imagewriter=args.O)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, args.o, codec=args.c)
    else:
        device = TextConverter(rsrcmgr, args.o, codec=args.c, laparams=laparams, imagewriter=args.O)

    # User-facing page numbers are 1-based; process_pdf is given 0-based ones.
    pagenos = [i - 1 for i in args.p]
    try:
        for fp in args.file:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=args.m, password=args.P,
                        caching=args.cache, check_extractable=True)
    finally:
        # argparse opened every input up front, so close all of them even
        # when an earlier file failed.
        for fp in args.file:
            if fp is not stdin_stream:
                fp.close()
        device.close()
        if args.o is not sys.stdout:
            args.o.close()
Example #50
0
def main(argv):
    """Command-line entry point built on ``getopt`` and ``process_pdf``.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name shown in
            the usage message.

    Returns:
        ``100`` after printing the usage message (bad option, no input file
        or unknown output type); ``None`` after a successful run.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] "
            "[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] "
            "[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == "-d":
            debug += 1
        elif k == "-p":
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            laparams = None
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            outdir = v
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith((".htm", ".html")):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ("text", "xml", "html", "tag"):
        return usage()

    # ``open`` works on Python 2 and 3; the ``file`` builtin is 2-only.
    outfp = open(outfile, "w") if outfile else sys.stdout
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr, outfp, codec=codec, scale=scale, layoutmode=layoutmode, laparams=laparams, outdir=outdir
            )
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, "rb") as fp:
                    process_pdf(
                        rsrcmgr,
                        device,
                        fp,
                        pagenos,
                        maxpages=maxpages,
                        password=password,
                        caching=caching,
                        check_extractable=True,
                    )
        finally:
            device.close()
    finally:
        # Leave the interpreter's stdout open for whoever runs after us.
        if outfp is not sys.stdout:
            outfp.close()
    return
def _frequent_values(values, min_count):
    """Return the distinct entries of *values* seen more than *min_count* times."""
    from collections import Counter

    return [val for val, hits in Counter(values).items() if hits > min_count]


def _append_or_merge(bucket, row):
    """Append *row* to *bucket*, or merge its text into the last entry when
    both sit on (almost) the same baseline, keeping left-to-right order."""
    if bucket and 1 > (row[2] - bucket[-1][2]) > -1:
        last = bucket[-1]
        if int(row[1]) < int(last[1]):
            last[5] = row[5] + " " + last[5]
        else:
            last[5] = last[5] + " " + row[5]
    else:
        bucket.append(row)


def main(files=None):
    """Convert PDFs to ``.txt`` files, separating body text from captions,
    tables, headers, footers and supporting information.

    For every input file the text rows collected by ``TextCon`` are split
    into a left and a right column, re-ordered page by page, classified with
    spacing heuristics and written next to the PDF as ``<name>.txt``.
    Extracted images go to an ``img`` directory beside the PDF.

    Rows are indexed positionally; from the way they are used below they are
    presumably ``(page, x0, y0, x1, y1, text)`` -- TODO confirm against
    ``TextCon``.

    Args:
        files: Iterable of PDF paths.  ``None`` uses ``get_datafiles()``.

    Files that cannot be read or parsed, or that yield no rows, are reported
    on stdout and skipped.
    """
    if files is None:
        files = get_datafiles()
    # debug option level
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)

    # Layout analysis parameters
    laparams = LAParams()
    laparams.all_texts = True
    laparams.detect_vertical = True
    laparams.line_overlap = 0.3  # Line overlap
    laparams.char_margin = 2.0  # Letter spacing
    laparams.line_margin = 0.5  # Line spacing
    laparams.word_margin = 0.1  # Word spacing
    laparams.boxes_flow = 0.5  # +-1.0  how much hor vs. vertical position
    # matters for line continuation
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    # Compiled once; the pattern does not depend on the file being processed.
    supp_re = re.compile(
        r"Corresponding author|Electronic mail|email"
        "|E-mail|^doi|doi:|^keywords|^pacs|^apc", re.I)

    for fname in files:
        fname = str(fname)
        imagedir = os.path.abspath(os.path.join(os.path.dirname(fname), 'img'))
        imagewriter = ImageWriter(imagedir)  # output folder for images
        name = os.path.splitext(os.path.basename(fname))[0]
        print(name)
        # splitext instead of fname[:-4]: correct for any suffix length.
        outfile = os.path.splitext(fname)[0] + '.txt'
        device = TextCon(rsrcmgr,
                         laparams=laparams,
                         imagewriter=imagewriter,
                         imagename=name)

        interpreter = PDFPageInterpreter(rsrcmgr, device)

        try:
            # ``open`` works on Python 2 and 3; the ``file`` builtin is 2-only.
            with open(fname, 'rb') as fp:
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        except Exception as exc:
            # Best-effort batch run: report and move on.  Unlike a bare
            # ``except:`` this lets KeyboardInterrupt/SystemExit through.
            print('Skipping %s: %s' % (fname, exc))
            device.close()
            continue

        rows = [list(row) for row in device.rows]
        # Each file has its own device, so release it here rather than once
        # after the loop (which closed only the last one).
        device.close()
        if not rows:
            print('Skipping %s: no text rows extracted' % fname)
            continue

        pages = max(row[0] for row in rows)
        max_y = max(row[4] for row in rows)
        min_y = min(row[2] for row in rows)

        # Edges that recur on every page; fall back to the overall extreme
        # when no value repeats that often.
        tops = _frequent_values([int(row[4]) for row in rows], pages - 1)
        max_y2 = max(tops) if tops else max_y
        bottoms = _frequent_values([int(row[2]) for row in rows], pages - 1)
        min_y2 = min(bottoms) if bottoms else min_y

        print('max_ys:', max_y - max_y2)
        print('min_ys:', min_y - min_y2)

        # Get max and min the hard way because of headers: only trust
        # x-edges that occur more than ten times.
        rights = _frequent_values([int(row[3]) for row in rows], 10)
        if rights:
            max_x = max(rights)
        else:
            max_x = max(int(row[3]) for row in rows)

        lefts = _frequent_values([int(row[1]) for row in rows], 10)
        if lefts:
            min_x = min(lefts)
        else:
            # Left edge is row[1]; the fallback previously read row[3].
            min_x = min(int(row[1]) for row in rows)
        # Averaging all row centres errors if there are more pictures on one
        # side than the other, so use the midpoint of the text extent.
        mid_x = (max_x + min_x) / 2
        l_height = sum(row[4] - row[2] for row in rows) / len(rows)

        print('l_height:', l_height)

        column2 = []
        lines = []
        pagenumber = 0
        table_caps = ['\n']
        table_data = []
        table = False

        for i, row in enumerate(rows):
            # NOTE(review): for i == 0 this wraps around to the last row.
            prev = rows[i - 1]
            l_space = prev[2] - row[4]

            if row[0] == pagenumber + 1:
                # New page: flush the right-hand column of the previous one.
                lines += column2
                column2 = []
                pagenumber += 1

            # Rows whose page is not the current one are dropped.
            if row[0] != pagenumber:
                continue

            if (max_y - min_y) * 0.95 > l_space > 0.8 * l_height:
                # capture Table (assuming tables will span all columns)
                if re.match(r"^table", str(row[5]), re.I):
                    table = True
                    table_caps.append(str(row[5]))
                    table_data.append('\n')
                    table_data.append(str(row[5]))
                    table_data.append('\n')
                    continue
                else:
                    table = False

            # capture table captions spanning multiple lines; the length
            # check avoids indexing table_data before any caption was stored
            elif (table_caps[-1] == str(prev[5])
                  and -2 * l_height < l_space < 0.5 * l_height
                  and len(table_data) >= 2):
                table_caps[-1] += str(row[5])
                table_data[-2] += str(row[5])
                continue

            if table:
                # capture table data: same baseline -> same table row
                if int(prev[2]) == int(row[2]):
                    table_data[-1] += '\t' + str(row[5])
                else:
                    table_data.append(str(row[5]))
                continue

            if int(row[1]) > mid_x and ((int(prev[1]) < mid_x and
                                         int(prev[3]) < mid_x) or
                                        (int(prev[1]) > mid_x
                                         and int(prev[3]) > mid_x)
                                        or prev[3] > max_x * 0.9
                                        or l_space > 2.5 * l_height):
                _append_or_merge(column2, row)
            else:
                _append_or_merge(lines, row)
        # add final column
        lines += column2

        fig_caps = ['\n']
        headers = ['\n']
        footers = ['\n']
        supp_info = ['\n']
        new_lines = []

        for i, line in enumerate(lines):
            # NOTE(review): for i == 0 this wraps around to the last line.
            l_space = lines[i - 1][2] - lines[i][4]
            l_space_below = 0
            l_space_2below = 0
            if i + 1 < len(lines):
                l_space_below = lines[i][2] - lines[i + 1][4]
            if i + 2 < len(lines):
                l_space_2below = lines[i + 1][2] - lines[i + 2][4]
            print(l_space, l_space_below, l_space_2below, lines[i][2],
                  lines[i][4], str(line[5]))

            # capture figure captions spanning multiple lines
            if (fig_caps[-1] == str(lines[i - 1][5])
                    and -2 * l_height < l_space < 0.5 * l_height):
                fig_caps.append(str(line[5]))
                continue
            # capture headers (up to two lines)
            if (lines[i][2] > max_y * 0.95
                    and (l_space_below > 0.5 * l_height
                         or l_space_2below > 0.5 * l_height)):
                headers.append('\n')
                headers.append(str(line[5]))
                if supp_re.search(str(line[5])):
                    # NOTE(review): the header text is stored a second time
                    # here before also being recorded as supporting info;
                    # kept as in the original -- confirm this is intended.
                    headers.append('\n')
                    headers.append(str(line[5]))
                else:
                    continue
            # capture supporting info
            if supp_re.search(str(line[5])):
                print(str(line[5]))
                supp_info.append('\n')
                supp_info.append(str(line[5]))
                continue
            if (max_y - min_y) * 0.95 > l_space > 0.5 * l_height:
                # capture figure captions
                if re.match(r"^fig", str(line[5]), re.I):
                    fig_caps.append('\n')
                    fig_caps.append(str(line[5]))
                    continue
                # capture footers
                elif lines[i][2] < min_y + max_y * 0.015:
                    footers.append('\n')
                    footers.append(str(line[5]))
                    continue
                else:
                    # Start a new paragraph unless the previous line was
                    # already filed as a figure caption or header.
                    string = str(lines[i - 1][5])
                    if not (any(string in s for s in fig_caps)
                            or any(string in s for s in headers)):
                        new_lines.append('\n')
            new_lines.append(str(line[5]))

        with open(outfile, 'w') as f:
            f.write(' '.join(new_lines))
            f.write('\n\nFigures')
            f.write(' '.join(fig_caps))
            f.write('\n\nTables')
            f.write('\n'.join(table_data))
            f.write('\n\nHeaders')
            f.write(' '.join(headers))
            f.write('\n\nFooters')
            f.write(' '.join(footers))
            f.write('\n\nSupporting Info')
            f.write(' '.join(supp_info))

    print('Done')
    return
Example #52
0
def main(fname, k, v):
    """Convert one PDF using a single pdf2txt-style option.

    Args:
        fname: Path of the PDF file to read.
        k: Option flag such as ``'-o'`` or ``'-t'``; flags that are not
            recognised leave all defaults in place.
        v: Value belonging to *k* (ignored by flags that take none).

    Returns:
        ``None`` after conversion, or whatever ``usage()`` returns when the
        output type is not one of ``text``, ``xml``, ``html`` or ``tag``.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    if k == '-d':
        debug += 1
    elif k == '-p':
        pagenos.update(int(x) - 1 for x in v.split(','))
    elif k == '-m':
        maxpages = int(v)
    elif k == '-P':
        password = v
    elif k == '-o':
        outfile = v
    elif k == '-C':
        caching = False
    elif k == '-n':
        laparams = None
    elif k == '-A':
        laparams.all_texts = True
    elif k == '-V':
        laparams.detect_vertical = True
    elif k == '-M':
        laparams.char_margin = float(v)
    elif k == '-L':
        laparams.line_margin = float(v)
    elif k == '-W':
        laparams.word_margin = float(v)
    elif k == '-F':
        laparams.boxes_flow = float(v)
    elif k == '-Y':
        layoutmode = v
    elif k == '-O':
        imagewriter = ImageWriter(v)
    elif k == '-R':
        rotation = int(v)
    elif k == '-S':
        stripcontrol = True
    elif k == '-t':
        outtype = v
    elif k == '-c':
        codec = v
    elif k == '-s':
        scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        # NOTE(review): ``usage`` is not defined inside this function;
        # presumably a module-level helper -- confirm it exists.
        return usage()

    # ``open`` works on Python 2 and 3; the ``file`` builtin is 2-only.
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos,
                                              maxpages=maxpages, password=password,
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Leave the interpreter's stdout open for whoever runs after us.
        if outfp is not sys.stdout:
            outfp.close()
    return
Example #53
0
def main(argv):
    """Batch-convert every PDF matching the given wildcard patterns.

    Each matching ``<name>.pdf`` is written to ``<name>.<ext>`` beside it,
    where the extension follows the output type (``tag`` by default).

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name shown in
            the usage message, the positional arguments are glob patterns.

    Returns:
        ``100`` after printing the usage message (bad option, no pattern or
        unknown output type); ``None`` after a successful run.

    ``-o`` never names the file that is written; it is only consulted to
    infer the output type when ``-t`` is given as an empty string.
    """
    import getopt
    import os

    def usage():
        # Single-argument print() calls behave the same on Python 2 and 3.
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
              ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
              ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
              ' [-t text|html|xml|tag] [-c codec] [-s scale]'
              ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'tag'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # The '-o' file itself is deliberately not opened: the original truncated
    # it and then never wrote to or closed it.

    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    if outtype not in extensions:
        # Previously an unknown type raised KeyError on the lookup below.
        return usage()
    ext = '.' + extensions[outtype]

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' + outtype + ' format')
        for pdf in matches:
            # splitext instead of pdf[0:-4]: correct for any suffix length.
            target = os.path.splitext(pdf)[0] + ext
            print(target)
            # ``open`` works on Python 2 and 3; ``file`` is 2-only.
            with open(target, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                           layoutmode=layoutmode, laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp, pagenos,
                                                      maxpages=maxpages, password=password,
                                                      caching=caching, check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    # Close the device before its output file is closed.
                    device.close()

        print('Done')
    return
Example #54
0
def main(argv):
    """Convert one or more PDF files to text, XML, HTML or tag output.

    ``argv`` is a full argument vector: ``argv[0]`` is only used as the
    program name in the usage line, ``argv[1:]`` is parsed with ``getopt``.
    Every non-option argument is opened as a PDF and rendered page by page
    through one converter into a single output stream (the ``-o`` file, or
    ``sys.stdout`` when ``-o`` is absent).

    When ``-t`` is not given the output type is ``"text"``, unless the
    ``-o`` file name ends in ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Returns:
        100, after printing the usage line, when option parsing fails, no
        input file is given, or the output type is unknown; ``None`` once
        every input has been processed.
    """
    import getopt

    def usage():
        print(
            "usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]"
            " [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]"
            " [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation] [-S]"
            " [-t text|html|xml|tag] [-c codec] [-s scale]"
            " file ..." % argv[0]
        )
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], "dp:m:P:o:CnAVM:L:W:F:Y:O:R:St:c:s:")
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # input option
    password = b""
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = "normal"
    codec = "utf-8"
    scale = 1
    caching = True
    laparams = LAParams()
    # "-n" is only recorded here and applied after the loop: dropping
    # laparams immediately made any later layout flag (e.g. "-n -A") fail
    # with AttributeError on None.
    disable_layout = False
    for (k, v) in opts:
        if k == "-d":
            logging.getLogger().setLevel(logging.DEBUG)
        elif k == "-p":
            # Each listed page number is stored minus one.
            pagenos.update(int(x) - 1 for x in v.split(","))
        elif k == "-m":
            maxpages = int(v)
        elif k == "-P":
            password = v
        elif k == "-o":
            outfile = v
        elif k == "-C":
            caching = False
        elif k == "-n":
            disable_layout = True
        elif k == "-A":
            laparams.all_texts = True
        elif k == "-V":
            laparams.detect_vertical = True
        elif k == "-M":
            laparams.char_margin = float(v)
        elif k == "-L":
            laparams.line_margin = float(v)
        elif k == "-W":
            laparams.word_margin = float(v)
        elif k == "-F":
            laparams.boxes_flow = float(v)
        elif k == "-Y":
            layoutmode = v
        elif k == "-O":
            imagewriter = ImageWriter(v)
        elif k == "-R":
            rotation = int(v)
        elif k == "-S":
            stripcontrol = True
        elif k == "-t":
            outtype = v
        elif k == "-c":
            codec = v
        elif k == "-s":
            scale = float(v)
    if disable_layout:
        laparams = None
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = "text"
        if outfile:
            if outfile.endswith((".htm", ".html")):
                outtype = "html"
            elif outfile.endswith(".xml"):
                outtype = "xml"
            elif outfile.endswith(".tag"):
                outtype = "tag"
    # Reject an unknown type *before* the output file is created/truncated,
    # so a bad "-t" neither clobbers the target nor leaks an open handle.
    if outtype not in ("text", "xml", "html", "tag"):
        return usage()
    if outfile:
        outfp = open(outfile, "wb")
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False  # never close the interpreter's stdout
        if outfp.encoding is not None:
            codec = None
    try:
        if outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)
        elif outtype == "xml":
            device = XMLConverter(
                rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter, stripcontrol=stripcontrol
            )
        elif outtype == "html":
            device = HTMLConverter(
                rsrcmgr,
                outfp,
                codec=codec,
                scale=scale,
                layoutmode=layoutmode,
                laparams=laparams,
                imagewriter=imagewriter,
            )
        else:  # "tag" -- the only remaining validated value
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        for fname in args:
            # The context manager closes the input even if a page fails.
            with open(fname, "rb") as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(
                    fp, pagenos, maxpages=maxpages, password=password, caching=caching, check_extractable=True
                ):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
Example #55
0
def main(argv):
    """Run ``CourseRegisterParser`` over the PDF files named in ``argv``.

    ``argv[0]`` is only used as the program name in the usage line;
    ``argv[1:]`` is parsed with ``getopt``. Every non-option argument is
    opened as a PDF and passed to ``process_pdf`` with one shared device
    that writes to the ``-o`` file, or to ``sys.stdout`` when ``-o`` is
    absent.

    Returns:
        100, after printing the usage line, when option parsing fails or no
        input file is given; ``None`` once every input has been processed.
    """
    import getopt
    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] '
               '[-n] [-A] [-D writing_mode] [-M char_margin] [-L line_margin] [-W word_margin] '
               '[-c codec] file ...' % argv[0])
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:nAD:M:L:W:c:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    codec = 'utf-8'
    laparams = LAParams()
    # '-n' is recorded and applied after the loop; clearing laparams
    # immediately made a later layout flag ('-n -A') raise AttributeError.
    disable_layout = False
    for (k, v) in opts:
        if k == '-d': debug += 1
        # Each listed page number is stored minus one.
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': disable_layout = True
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-c': codec = v
    if disable_layout:
        laparams = None
    # The debug level is pushed onto the classes themselves.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()

    # open() instead of the file() builtin, which no longer exists in
    # Python 3; the two are equivalent on Python 2.
    if outfile:
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False  # never close the interpreter's stdout

    try:
        device = CourseRegisterParser(rsrcmgr, outfp, codec=codec, laparams=laparams)
        for fname in args:
            # The context manager closes the input even if parsing fails.
            with open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
        device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
Example #56
0
def convert_pdf(path, outtype='txt', opts=None):
    """Convert the PDF at ``path`` and write the result to a sibling file.

    The output path defaults to ``path`` with its last three characters
    replaced by ``outtype`` (``report.pdf`` -> ``report.txt``) and the
    image/output directory defaults to the directory part of ``path``.

    Args:
        path: Path of the input PDF.
        outtype: ``'txt'``, ``'xml'``, ``'html'`` or ``'tag'``. A falsy
            value selects ``'txt'`` unless the output file name ends in
            ``.htm``/``.html``, ``.xml`` or ``.tag``.
        opts: Optional getopt-style overrides, given either as an iterable
            of ``(flag, value)`` pairs or as a ``{flag: value}`` mapping.
            Recognised flags: ``-d -p -m -P -o -n -A -D -M -L -W -O -t -c
            -s``. Note that ``-t`` changes the converter but not the
            already-computed default output file name.

    Raises:
        ValueError: if the resulting output type is not one of the four
            supported values. (The original code called an undefined
            ``usage()`` here, which surfaced as ``NameError``.)
    """
    outfile = path[:-3] + outtype
    outdir = '/'.join(path.split('/')[:-1])

    # The old default was a dict, yet the loop below unpacks (flag, value)
    # pairs; iterating a dict yields bare keys, so mapping input was
    # silently mis-parsed. Normalise both shapes to pairs.
    if opts is None:
        opts = ()
    elif hasattr(opts, 'items'):
        opts = opts.items()

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()
    # '-n' is recorded and applied after the loop; clearing laparams
    # immediately made a later layout flag raise AttributeError on None.
    disable_layout = False
    for (k, v) in opts:
        if k == '-d': debug += 1
        # Each listed page number is stored minus one.
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-n': disable_layout = True
        elif k == '-A': laparams.all_texts = True
        elif k == '-D': laparams.writing_mode = v
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if disable_layout:
        laparams = None
    # The debug level is pushed onto the classes themselves.
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager()
    if not outtype:
        outtype = 'txt'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Validate before the output file is created/truncated.
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unsupported output type: %r' % (outtype,))
    # open() instead of the file() builtin, which no longer exists in
    # Python 3; the two are equivalent on Python 2.
    if outfile:
        outfp = open(outfile, 'w')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False  # never close the interpreter's stdout
    try:
        if outtype == 'txt':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale, laparams=laparams, outdir=outdir)
        else:  # 'tag' -- the only remaining validated value
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        # The context manager closes the input even if parsing fails.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password)
        device.close()
    finally:
        if close_outfp:
            outfp.close()
    return
Example #57
0
def main(argv):
    """Convert one or more PDF files to text, XML, HTML or tag output.

    ``argv[0]`` is only used as the program name in the usage line;
    ``argv[1:]`` is parsed with ``getopt``. Every non-option argument is
    opened as a PDF and passed to ``process_pdf`` with one shared converter
    that writes to the ``-o`` file (opened as text using the ``-c`` codec,
    encoding errors ignored) or to ``sys.stdout`` when ``-o`` is absent.

    When ``-t`` is not given the output type is ``'text'``, unless the
    ``-o`` file name ends in ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Returns:
        100, after printing the usage line, when option parsing fails, no
        input file is given, or the output type is unknown; ``None`` once
        every input has been processed.
    """
    def usage():
        print(('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0]))
        return 100
    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is recorded and applied after the loop; clearing laparams
    # immediately made a later layout flag ('-n -A') raise AttributeError.
    disable_layout = False
    for (k, v) in opts:
        if k == '-d': debug = True
        # Each listed page number is stored minus one.
        elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': disable_layout = True
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if disable_layout:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type *before* the output file is created/truncated,
    # so a bad '-t' neither clobbers the target nor leaks an open handle.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode,
                laparams=laparams, outdir=outdir, debug=debug)
        else:  # 'tag' -- the only remaining validated value
            device = TagExtractor(rsrcmgr, outfp)
        for fname in args:
            # The context manager closes the input even if parsing fails.
            with io.open(fname, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                            caching=caching, check_extractable=True)
        device.close()
    finally:
        if close_outfp:
            outfp.close()
Example #58
0
def main(argv):
    """Convert one or more PDF files to text, XML, HTML or tag output.

    ``argv[0]`` is only used as the program name in the usage line;
    ``argv[1:]`` is parsed with ``getopt``. Every non-option argument is
    opened as a PDF and passed to ``process_pdf`` with one shared converter
    that writes to the ``-o`` file (opened as text using the ``-c`` codec,
    encoding errors ignored) or to ``sys.stdout`` when ``-o`` is absent.

    When ``-t`` is not given the output type is ``'text'``, unless the
    ``-o`` file name ends in ``.htm``/``.html``, ``.xml`` or ``.tag``.

    Returns:
        100, after printing the usage line, when option parsing fails, no
        input file is given, or the output type is unknown; ``None`` once
        every input has been processed.
    """
    def usage():
        print((
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
            '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
            '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...'
            % argv[0]))
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    debug = False
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # '-n' is remembered and honoured once all options are read. Setting
    # laparams to None inside the loop broke any layout flag that followed
    # it on the command line (AttributeError on None).
    skip_layout_analysis = False
    for (k, v) in opts:
        if k == '-d': debug = True
        # Each listed page number is stored minus one.
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': skip_layout_analysis = True
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if skip_layout_analysis:
        laparams = None

    if debug:
        set_debug_logging()
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Check the type first: opening the output below truncates an existing
    # file, which must not happen for a request that is going to be refused.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    if outfile:
        outfp = io.open(outfile, 'wt', encoding=codec, errors='ignore')
        close_outfp = True
    else:
        outfp = sys.stdout
        close_outfp = False
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   outdir=outdir,
                                   debug=debug)
        else:  # 'tag' -- the only remaining validated value
            device = TagExtractor(rsrcmgr, outfp)
        for fname in args:
            # Close each input as soon as it is done, even on failure.
            with io.open(fname, 'rb') as fp:
                process_pdf(rsrcmgr,
                            device,
                            fp,
                            pagenos,
                            maxpages=maxpages,
                            password=password,
                            caching=caching,
                            check_extractable=True)
        device.close()
    finally:
        if close_outfp:
            outfp.close()
Example #59
0
    def document(self):
        """Return the ``Document`` for ``self._pdfDocument``, parsing on first use.

        The parsed result is cached in ``self._document``; while that
        attribute is truthy the cached object is returned and the PDF is not
        opened again.

        Raises:
            pdfminer.pdfparser.PDFTextExtractionNotAllowed: if the PDF's
                ``is_extractable`` flag is false.
        """
        def mergeSameParagraphLines(lines):
            """Concatenate consecutive lines into paragraphs.

            A line closes the current paragraph when it ends with ``.``,
            ``?`` or ``!`` or is shorter than 60 characters. Lines are
            joined without any separator.
            """
            def isEndOfParagraph(line):
                return line[-1:] in ('.', '?', '!') or len(line) < 60

            paragraphs = []
            pending = []

            for line in lines:
                pending.append(line)
                if isEndOfParagraph(line):
                    paragraphs.append(''.join(pending))
                    pending = []

            # Flush a trailing paragraph that never hit a terminator.
            trailing = ''.join(pending)
            if trailing != '':
                paragraphs.append(trailing)

            return paragraphs

        if not self._document:
            # The parser reads from the file on demand, so it has to stay
            # open for the whole extraction; the context manager makes sure
            # it is closed afterwards (previously the handle was leaked).
            with open(self._pdfDocument, 'rb') as pdfFile:
                pdfParser = PDFParser(pdfFile)
                document = PDFDocument()

                pdfParser.set_document(document)
                document.set_parser(pdfParser)
                document.initialize()

                if not document.is_extractable:
                    raise pdfminer.pdfparser.PDFTextExtractionNotAllowed

                resourceManger = PDFResourceManager()

                # NOTE(review): the debug level is hard-coded to 1 and is
                # assigned on the classes themselves, not on instances.
                debug = 1
                PDFDocument.debug = debug
                PDFParser.debug = debug
                PDFResourceManager.debug = debug
                PDFPageInterpreter.debug = debug
                PDFDevice.debug = debug

                pdfContent = StringIO()
                try:
                    laparams = LAParams()
                    laparams.all_texts = True
                    laparams.detect_vertical = True

                    device = TextConverter(resourceManger,
                                           pdfContent,
                                           codec='utf-8',
                                           laparams=laparams)
                    interpreter = PDFPageInterpreter(resourceManger, device)
                    for page in document.get_pages():
                        interpreter.process_page(page)
                    content = mergeSameParagraphLines(
                        pdfContent.getvalue().split('\n'))

                    # NOTE(review): the outline is collected but not handed
                    # to Document below (both extra arguments are None).
                    toc = []
                    try:
                        for (level, title, destination, a,
                             se) in document.get_outlines():
                            toc.append((level, title))
                    except Exception:
                        # Best effort: a failure while reading the outline
                        # is ignored, but KeyboardInterrupt/SystemExit are
                        # no longer swallowed as with the bare ``except``.
                        pass
                finally:
                    pdfContent.close()

            self._document = Document().initWithDocumentInfo(
                content, None, None)

        return self._document