Esempi in Python per HTMLConverter, esempi in Python per pdfminer.converter.HTMLConverter

Esempio n. 1

0

Mostra file

File: extract_pdfs.py Progetto: AllieDeford/radremedy

def process_pdf(in_path, out_path):
    """
    Processes a PDF and extracts its contents to HTML.

    Both files are closed even when a page fails to parse, so a malformed
    PDF no longer leaks the open handles.

    Args:
        in_path: The full path to the source PDF file.
        out_path: The full path to the destination HTML file.
    """
    # An empty set applies no page filter.
    page_numbers = set()

    # Get source/destination file handles; the with-block closes both.
    with open(in_path, 'rb') as in_file, open(out_path, 'w') as out_file:
        # Set up the resource manager, device, and interpreter
        res_mgr = PDFResourceManager()
        device = HTMLConverter(res_mgr, out_file, codec='utf-8',
                               laparams=LAParams(), imagewriter=None)
        try:
            interpreter = PDFPageInterpreter(res_mgr, device)
            for page in PDFPage.get_pages(in_file, page_numbers,
                                          maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # The device must be closed while out_file is still open.
            device.close()
    return

Esempio n. 2

0

Mostra file

File: scraper.py Progetto: tcrwt/whatsforcaff2

def pdf_to_html(scraped_pdf_data): 
    """Convert PDF data held in memory into an HTML string.

    Args:
        scraped_pdf_data: The PDF content; it is written into an in-memory
            buffer that is handed to pdfminer.

    Returns:
        The contents of the output buffer after the converter is closed.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams

    import StringIO

    # Input buffer, rewound so pdfminer reads from the beginning.
    pdf_buffer = StringIO.StringIO()
    pdf_buffer.write(scraped_pdf_data)
    pdf_buffer.seek(0)
    html_buffer = StringIO.StringIO()

    resource_manager = PDFResourceManager()
    converter = HTMLConverter(
        resource_manager,
        html_buffer,
        layoutmode='normal',
        scale=2,
        laparams=LAParams(
            char_margin=0.5,
            line_margin=0.5,
            word_margin=0.3,
            boxes_flow=0,
        ),
    )
    process_pdf(resource_manager, converter, pdf_buffer)
    converter.close()

    html = html_buffer.getvalue()
    html_buffer.close()
    pdf_buffer.close()
    return html

Esempio n. 3

0

Mostra file

File: Utilitarios.py Progetto: ClaudioSiervi/ETL_PDF

    def pdf_para_html(self, path):
        """Convert the PDF at ``path`` to HTML and return it as a string.

        Args:
            path: Path of the PDF file to read.

        Returns:
            The HTML accumulated in the in-memory output buffer.
        """
        from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
        from pdfminer.converter import HTMLConverter
        from pdfminer.layout import LAParams
        from pdfminer.pdfpage import PDFPage
        from cStringIO import StringIO

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The with-block closes the PDF even if a page fails to parse.
            with open(path, 'rb') as fp:
                # maxpages=0 is for all pages; the empty set applies no filter.
                for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                              password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            device.close()
            # Named ``html`` rather than ``str`` to avoid shadowing the builtin.
            html = retstr.getvalue()
        finally:
            retstr.close()
        return html

Esempio n. 4

0

Mostra file

File: openFile.py Progetto: afrocircus/LVFX-pipeline

def convertPDF(fname, pages=None):
    """Convert a PDF file to HTML and return the markup as a string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers passed to
            ``PDFPage.get_pages``; a falsy value applies no page filter.

    Returns:
        The contents of the in-memory output buffer.
    """
    pagenos = set(pages) if pages else set()
    caching = True
    rotation = 0
    outfp = StringIO()

    rsrcmgr = PDFResourceManager(caching=caching)
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', scale=1,
                           layoutmode='normal', laparams=LAParams(),
                           imagewriter=None)
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the PDF even if a page fails to parse.
        with open(fname, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=0, password='',
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        device.close()
        return outfp.getvalue()
    finally:
        outfp.close()

Esempio n. 5

0

Mostra file

File: ipp_menu.py Progetto: timtammittee/ipp_menu

def parse_html(file_name):
    """Convert the PDF ``file_name`` to HTML and return the collected text.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        The ``text`` attribute of the ``TextReciver`` object the converter
        wrote into.
    """
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    rotation = 0
    caching = True

    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                           imagewriter=None)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF even if a page fails to parse.
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()
    return outfp.text

Esempio n. 6

0

Mostra file

File: pdfminer_wrapper.py Progetto: neoyukito/Confopy

 def to_html(self, fp):
     """Run the PDF in ``fp`` through an HTMLConverter and return the HTML.

     The converter is configured from ``self.options`` (codec, scale,
     layoutmode, laparams) and writes into an in-memory buffer.
     """
     html_buffer = StringIO.StringIO()
     converter = HTMLConverter(
         self.resmgr,
         html_buffer,
         codec=self.options.codec,
         scale=self.options.scale,
         layoutmode=self.options.layoutmode,
         laparams=self.options.laparams,
         outdir=None,
     )
     self._process(fp, converter)
     converter.close()
     markup = html_buffer.getvalue()
     html_buffer.close()
     return markup

Esempio n. 7

0

Mostra file

File: pdfminer_transform.py Progetto: devcon14/cmis-capture

    def transform_file(self, pdfpath):
        """Convert a PDF to HTML and wrap the result in a stub document.

        Args:
            pdfpath: Path of the PDF file to convert.

        Returns:
            A dict with the HTML and its text under ``"markup"``, the text
            length under ``"__text_size"`` and a timestamp. If any step
            raises, a dict carrying the error message and a
            ``"__text_size"`` of -1 is returned instead.
        """
        try:
            self.LOGGER.debug(pdfpath)
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            codec = 'utf-8'

            device = HTMLConverter(rsrcmgr, retstr, codec=codec, laparams=self.laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The with-block closes the PDF even when parsing raises and
            # control falls through to the except branch below.
            with open(pdfpath, 'rb') as fp:
                # NOTE check_extractable seems to allow overriding text extraction locks
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=False):
                    interpreter.process_page(page)
            device.close()
            html = retstr.getvalue()
            # otherwise html is str at this point, not unicode
            html = html.decode('utf8')
            retstr.close()
            soup = BeautifulSoup(html)
            text_size = len(soup.text)
            stub_data = {
                "markup": {
                    "innerHTML": unicode(html),
                    "innerText": unicode(soup.text)
                },
                "workflow": {
                    "is_stub": True
                },
                "__text_size": text_size,
                # __fields are ignored by kibana
                "timestamp": datetime.now()
            }
        except Exception as e:
            # Best effort: report the failure in the stub instead of raising.
            stub_data = {
                "error": str(e),
                "workflow": {
                    "is_stub": True
                },
                "__text_size": -1
            }
        return stub_data

Esempio n. 8

0

Mostra file

File: pdfhandler.py Progetto: OAPDF/oapdftools

	def __init__(self):
		"""Set the default options and create the text and HTML converters."""
		# Debug flags.
		self.setdebug(0)

		# Input options; by default only the first page is selected.
		self.pagenos = set([0])
		self.password = ''
		self.maxpages = 0
		self.rotation = 0

		# Output options.
		self.outfp = stdmodel()
		self.codec = 'utf-8'
		self.scale = 1
		self.layoutmode = 'normal'
		self.laparams = LAParams()
		self.imagewriter = None
		self.pageno = 1
		self.showpageno = True

		# The resource manager lets shared resources such as fonts and images
		# be reused so large objects are not allocated multiple times.
		# Caching is switched off: the default True causes some problems.
		self.caching = False
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Main converters for the PDF file; both write into self.outfp.
		shared_options = dict(
			codec=self.codec,
			laparams=self.laparams,
			imagewriter=self.imagewriter,
		)
		self.device = TextConverter(self.rsrcmgr, self.outfp, **shared_options)
		self.htmldevice = HTMLConverter(
			self.rsrcmgr,
			self.outfp,
			scale=self.scale,
			layoutmode=self.layoutmode,
			**shared_options
		)

Esempio n. 9

0

Mostra file

File: scraper.py Progetto: tcrwt/whatsforcaff2

def convert_pdf(path):
    """Convert the PDF at ``path`` to HTML and return it as a string.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the in-memory output buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        # The with-block closes the PDF even if processing raises.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()
        # Returned directly instead of via a local named ``str``, which
        # shadowed the builtin.
        return retstr.getvalue()
    finally:
        retstr.close()

Esempio n. 10

0

Mostra file

File: ipp_menu.py Progetto: timtammittee/ipp_menu

def extract_price_from_pdf(file_name):
    """Extract the prices found in a PDF, ordered by vertical position.

    The PDF is converted to HTML; every line that carries position
    information and ends with a price is collected together with the
    price-only lines that follow it.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) sorted by their position; empty when no
        price line is found.
    """
    pagenos = set()
    rotation = 0
    rsrcmgr = PDFResourceManager(caching=True)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                           imagewriter=None)

    # Read the file; the with-block closes it even if a page fails to parse.
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      caching=True,
                                      check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    # Find all lines that end with a price and include position
    # information. Also find all following lines that include prices
    # but no new location (shorter 100 characters)
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)
    price_pattern = re.compile(r'[0-9]{1,2}\.[0-9]{1,2}')

    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')

        # Extract the position information from the string
        pos_string = re.findall(r'(.*top:)([0-9]+)(px)', line_group[0])[0]
        ypos = int(pos_string[1])

        # Iterate over all lines and extract the price. Increment the
        # position slightly for each new line
        for i, price_text in enumerate(line_group):
            # Searching the reversed text yields the LAST price on the line.
            reversed_hits = price_pattern.findall(price_text[::-1])
            if not reversed_hits:
                continue
            price = float(reversed_hits[0][::-1])
            ypos = ypos + i
            pos_list.append((ypos, price))

    # Order by position and hand the prices back to the caller.
    pos_list.sort()
    return tuple(price for _, price in pos_list)

Esempio n. 11

0

Mostra file

File: main.py Progetto: staten12/pdftohtml

def convert_pdf_to_html(path):
    """Convert the PDF at ``path`` to HTML and return it as a string.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The contents of the in-memory output buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the PDF even if a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        device.close()
        # Returned directly instead of via a local named ``str``, which
        # shadowed the builtin.
        return retstr.getvalue()
    finally:
        retstr.close()

Esempio n. 12

0

Mostra file

File: ipp_menu.py Progetto: andrejvoss/ipp_menu

def extract_price_from_pdf(file_name):
    """Extract the prices found in a PDF, ordered by vertical position.

    Args:
        file_name: Path of the PDF file to read.

    Returns:
        A tuple of prices (floats) sorted by their position. An empty tuple
        is returned when no price line is found (previously this case
        raised ValueError while unpacking an empty ``zip``).
    """
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    rotation = 0
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outfp = TextReciver()
    device = HTMLConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                           imagewriter=None)

    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The with-block closes the PDF even if a page fails to parse.
    with open(file_name, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos,
                                      maxpages=maxpages, password=password,
                                      caching=caching, check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    device.close()

    # A line with "left" positioning that ends in a price, followed by any
    # number of continuation lines that also end in a price.
    matches = re.finditer(r'(.*left.*[0-9]{1,2}\.[0-9]{1,2} )'
                          r'(\n<br>.{0,100}[0-9]{1,2}\.[0-9]{1,2} *)*',
                          outfp.text)
    price_pattern = re.compile(r'[0-9]{1,2}\.[0-9]{1,2}')

    pos_list = []
    for m in matches:
        line_group = m.group().split('\n')
        # The greedy prefix selects the last "top:<n>px" on the first line.
        ypos = int(re.findall(r'.*top:([0-9]+)px', line_group[0])[0])
        for i, line in enumerate(line_group):
            if line:
                # Searching the reversed text yields the LAST price on the line.
                p = float(price_pattern.findall(line[::-1])[0][::-1])
                ypos = ypos + i
                pos_list.append((ypos, p))
    pos_list.sort()

    return tuple(p for _, p in pos_list)

Esempio n. 13

0

Mostra file

File: Extract_PdfMiner.py Progetto: KshitizSethia/AcroDisam

 def get_html(self, path):  # Pulls html from PDF instead of plain text
     """Convert a PDF to HTML and return the markup as a string.

     Args:
         path: Path of the PDF file; ".pdf" is appended when missing.

     Returns:
         The contents of the in-memory output buffer.
     """
     if not path.endswith(".pdf"):
         path = path + ".pdf"
     rsrcmgr = PDFResourceManager()
     retstr = StringIO()
     device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
     try:
         interpreter = PDFPageInterpreter(rsrcmgr, device)
         # The with-block closes the PDF even if a page fails to parse.
         with open(path, 'rb') as fp:
             for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                           caching=True, check_extractable=True):
                 interpreter.process_page(page)
         device.close()
         return retstr.getvalue()
     finally:
         retstr.close()

Esempio n. 14

0

Mostra file

File: pdf.py Progetto: alabarga/SocialLearning

def convert_pdf_to_html(url):
    """Download ``url`` and convert it to HTML if the server reports a PDF.

    Args:
        url: Address of the document to fetch.

    Returns:
        The HTML as a string, or None when the Content-Type header does not
        contain 'application/pdf' (including when the header is absent).
    """
    from StringIO import StringIO

    # Inspect the content type with a HEAD request before downloading.
    head = requests.head(url)
    # .get() avoids a KeyError when the server sends no Content-Type header.
    if 'application/pdf' not in head.headers.get("content-type", ""):
        return None

    response = requests.get(url)

    # Cast to StringIO object
    memory_file = StringIO(response.content)
    retstr = StringIO()
    try:
        # Create a PDF parser object associated with the StringIO object
        parser = PDFParser(memory_file)

        # Create a PDF document object that stores the document structure
        document = PDFDocument(parser)

        rsrcmgr = PDFResourceManager()
        device = HTMLConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.create_pages(document):
            interpreter.process_page(page)

        device.close()
        return retstr.getvalue()
    finally:
        retstr.close()
        memory_file.close()

Esempio n. 15

0

Mostra file

File: pdfscrap.py Progetto: msdark/textParser

class PDF2Txt:
	"""Convert a PDF file to a text, XML or HTML file."""

	def __init__(self,pdffile,outfile,output_type='text'):
		"""Store the paths and output type and create the resource manager.

		pdffile: path of the PDF to read.
		outfile: path of the file to write.
		output_type: 'text', 'xml' or 'html'.
		"""
		# Switch off the debug flags on the pdfminer classes.
		PDFDocument.debug = 0
		PDFParser.debug = 0
		CMapDB.debug = 0
		PDFResourceManager.debug = 0
		PDFPageInterpreter.debug = 0
		PDFDevice.debug = 0
		self.rsrcmgr = PDFResourceManager(caching=True)
		self.outtype = output_type
		self.outfile = outfile
		self.pdffile = pdffile

	def convert(self):
		"""Write the converted PDF to ``self.outfile``.

		Prints a message and exits with status -1 when the output type is
		not supported.
		"""
		# Validate before opening the output so an unsupported type does not
		# leave a truncated output file and an open handle behind.
		if self.outtype not in ('text', 'xml', 'html'):
			print('Formato de salida no soportado')
			sys.exit(-1)

		with open(self.outfile, 'w') as outfp:
			if self.outtype == 'text':
				self.device = TextConverter(self.rsrcmgr, outfp, codec='utf-8',
					laparams=LAParams(), imagewriter=None)
			elif self.outtype == 'xml':
				self.device = XMLConverter(self.rsrcmgr, outfp, codec='utf-8',
					laparams=LAParams(), imagewriter=None)
			else:
				self.device = HTMLConverter(self.rsrcmgr, outfp, codec='utf-8',
					scale=1, layoutmode='normal', laparams=LAParams(),
					imagewriter=None)
			try:
				interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
				# The with-block closes the PDF even if a page fails to parse.
				with open(self.pdffile, 'rb') as fp:
					for page in PDFPage.get_pages(fp, set(), caching=True,
							check_extractable=True):
						page.rotate = (page.rotate) % 360
						interpreter.process_page(page)
			finally:
				# The device must be closed while outfp is still open.
				self.device.close()
		print("Archivo %s creado en base a %s" % (self.outfile, self.pdffile))

Esempio n. 16

0

Mostra file

File: pdfhandler.py Progetto: OAPDF/oapdftools

	def reset(self,html=False):
		"""Create a fresh resource manager and rebuild one converter.

		html: when true the HTML converter is closed and replaced,
		otherwise the text converter is.
		"""
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		if not html:
			# Replace the main text converter.
			self.device.close()
			self.device = TextConverter(
				self.rsrcmgr,
				self.outfp,
				codec=self.codec,
				laparams=self.laparams,
				imagewriter=self.imagewriter,
			)
			return

		# Replace the HTML converter.
		self.htmldevice.close()
		self.htmldevice = HTMLConverter(
			self.rsrcmgr,
			self.outfp,
			codec=self.codec,
			scale=self.scale,
			layoutmode=self.layoutmode,
			laparams=self.laparams,
			imagewriter=self.imagewriter,
		)

Esempio n. 17

0

Mostra file

File: pdf2txt.py Progetto: samalws/Questionator3000-ft.-Doofy

def translate(output, args):
    """Convert the PDF at ``args`` and write the result to ``output``.

    The output format follows the extension of ``output``: ``.htm`` and
    ``.html`` give HTML, ``.xml`` gives XML, ``.tag`` gives tag output and
    anything else gives plain text.

    Args:
        output: Path of the file to write; when falsy the result is written
            to ``sys.stdout``.
        args: Path of the PDF file to read.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    outfile = output
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    # Derive the output type from the file extension.
    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams,
                                   imagewriter=imagewriter, debug=debug)
        else:
            # Only 'tag' remains: outtype is always one of the four values
            # assigned above.
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # The with-block closes the PDF even if a page fails to parse.
        with open(args, 'rb') as fp:
            for page in PDFPage.get_pages(fp, pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
        device.close()
    finally:
        # Never close the process-wide stdout stream.
        if outfp is not sys.stdout:
            outfp.close()
    return

Esempio n. 18

0

Mostra file

class PDFHandler(object):
	'''A PDF Handle class to read contains
	Now also support file object/StringIO object(won't close after process)'''
	def __init__(self):
		'''Set the default options and create the text and HTML converters'''
		# debug option
		self.setdebug(0)
		#only first page
		self.pagenos=set([0])
		self.pageno = 1
		self.outfp = stdmodel()
		self.codec = 'utf-8'
		self.showpageno = True
		self.scale = 1
		self.password = ''
		self.maxpages = 0
		self.rotation = 0
		self.imagewriter = None
		self.laparams = LAParams()
		self.layoutmode = 'normal'
		# ResourceManager facilitates reuse of shared resources such as fonts and images so that
		# large objects are not allocated multiple times.
		#### This will cause some problem when set to default True.
		self.caching = False
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Important Main converter for pdf file
		self.device = TextConverter(self.rsrcmgr, self.outfp, codec=self.codec,
			laparams=self.laparams, imagewriter=self.imagewriter)

		self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp, codec=self.codec, scale=self.scale,
			layoutmode=self.layoutmode, laparams=self.laparams,
			imagewriter=self.imagewriter)

	def reset(self,html=False):
		'''Reset can avoid wrong judge.
		Creates a new resource manager and rebuilds the HTML converter
		(html true) or the text converter (html false).'''
		self.rsrcmgr = PDFResourceManager(caching=self.caching)

		# Important Main converter for pdf file
		if (html):
			self.htmldevice.close()
			self.htmldevice = HTMLConverter(self.rsrcmgr, self.outfp, codec=self.codec, scale=self.scale,
				layoutmode=self.layoutmode, laparams=self.laparams,
				imagewriter=self.imagewriter)
		else:
			self.device.close()
			self.device = TextConverter(self.rsrcmgr, self.outfp, codec=self.codec,
				laparams=self.laparams, imagewriter=self.imagewriter)

	def setdebug(self,value):
		'''Set Debug Information. Especially when init'''
		# debug option; honour the requested value (it used to be forced to 0)
		self.debug = value
		PDFResourceManager.debug = self.debug
		PDFPageInterpreter.debug = self.debug

	def _open_source(self,fname,fobj):
		'''Return fobj when given, otherwise open fname for binary reading'''
		if fobj:
			return fobj
		return open(fname, 'rb')

	def _release_source(self,fp,fobj):
		'''Rewind a caller-supplied file object, or close a file opened here'''
		if fobj:
			fp.seek(0)
		else:
			fp.close()

	def _extract(self,fname,pagenos,html,fobj):
		'''Run the selected pages through a converter and return the output.
		Returns "" when anything goes wrong while reading the PDF'''
		fp = self._open_source(fname, fobj)
		try:
			if (html):
				interpreter = PDFPageInterpreter(self.rsrcmgr, self.htmldevice)
			else:
				interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)

			for page in PDFPage.get_pages(fp, pagenos, maxpages=self.maxpages,
				password=self.password, caching=self.caching, check_extractable=False):

				page.rotate = (page.rotate+self.rotation) % 360
				interpreter.process_page(page)
			return self.outfp.get()
		except Exception:
			# Best effort: unreadable PDFs yield an empty string.
			return ""
		finally:
			# Always clear the shared output buffer for the next call.
			self.outfp.reset()
			self._release_source(fp, fobj)

	def GetPageNumber(self,fname,fobj=None):
		'''Get total page number of PDF. Returns 0 on error'''
		fp = self._open_source(fname, fobj)
		try:
			pageno=0
			for page in PDFPage.get_pages(fp, set(), maxpages=0,
				password=self.password, caching=self.caching, check_extractable=False):
				pageno+=1
			return pageno
		except Exception as e:
			print(e)
			print("Error Reading PDF page number..")
			return 0
		finally:
			self._release_source(fp, fobj)

	def FastCheck(self,fname,fobj=None):
		'''Fast check whether has page one'''
		fp = self._open_source(fname, fobj)
		try:
			for page in PDFPage.get_pages(fp, set([0]), maxpages=1,
				password=self.password, caching=self.caching, check_extractable=False):
				break
			return True
		except Exception:
			print("%s %s" % ("Error Reading PDF page number..", fname))
			return False
		finally:
			self._release_source(fp, fobj)

	def GetSinglePage(self,fname,pageno=1,html=False,fobj=None):
		'''Get Single Page contents of PDF, return string
		Default first page'''
		return self._extract(fname, set([pageno-1]), html, fobj)

	def GetPages(self,fname,pagenos=(1,),html=False,fobj=None):
		'''Get Several Page contents of PDF, return string
		Default first page'''
		return self._extract(fname, set([i-1 for i in pagenos]), html, fobj)

	def GetAllPages(self,fname,html=False,fobj=None):
		'''Get All Page contents of PDF, return string'''
		return self._extract(fname, set(), html, fobj)

Esempio n. 19

0

Mostra file

File: rename_pdf_3k.py Progetto: zhangjungh/work

def pdf_gettext(filepath, reserve):
	"""Extract text from the first and last pages of a PDF.

	Each of the two pages is rendered to HTML, dumped to 'firstout.html' /
	'lastout.html' and handed to ``html_textparser``, which fills the
	returned lists.

	Args:
		filepath: Path of the PDF file to read.
		reserve: Not used by this function.

	Returns:
		A ``(first, last)`` tuple of lists. ``last`` stays empty when the
		document has fewer than two pages.

	Raises:
		Exception: when the document does not allow text extraction.
	"""
	import tempfile

	# input option
	password = ''
	# output option
	layoutmode = 'normal'
	codec = 'utf-8'
	scale = 1
	caching = True
	laparams = LAParams()
	firstout = 'firstout.html'
	lastout = 'lastout.html'
	firstpage = None
	lastpage = None
	first = []
	last = []

	rsrcmgr = PDFResourceManager(caching=caching)

	# The with-block removes the temporary file even if conversion fails.
	with tempfile.TemporaryFile(mode='w+t', encoding=codec) as outfp:
		device = HTMLConverter(rsrcmgr, outfp, scale=scale, layoutmode=layoutmode, laparams=laparams)
		try:
			with open(filepath, 'rb') as fp:
				parser = PDFParser(fp)
				doc = PDFDocument(caching=caching)
				parser.set_document(doc)
				doc.set_parser(parser)
				doc.initialize(password)
				if not doc.is_extractable:
					raise Exception('Text extraction is not allowed: %s' % filepath)

				interpreter = PDFPageInterpreter(rsrcmgr, device)
				# Remember the first page and whichever page comes last.
				for page in doc.get_pages():
					if not firstpage:
						firstpage = page
					else:
						lastpage = page

				if firstpage:
					interpreter.process_page(firstpage)
					with open(firstout, 'w', encoding=codec) as f:
						outfp.seek(0)
						f.write(outfp.read())
					html_textparser(firstout, first)
				if lastpage:
					# truncate() does not move the stream position, so rewind
					# first; otherwise the next write lands past the end and
					# the dump starts with NUL padding.
					outfp.seek(0)
					outfp.truncate()
					interpreter.process_page(lastpage)
					with open(lastout, 'w', encoding=codec) as f:
						outfp.seek(0)
						f.write(outfp.read())
					html_textparser(lastout, last)
		finally:
			# Close the device while outfp is still open.
			device.close()

	return first, last

Esempio n. 20

0

Mostra file

File: readPdf.py Progetto: jerry-shijieli/Data-Science-Notes

def readPDF2HTML(pdfFile, opts=None):
    """Convert an open PDF file object to an HTML string.

    Args:
        pdfFile: File-like object whose ``read()`` returns the PDF data.
        opts: Optional pdf2txt-style options, either a dict or an iterable
            of ``(flag, value)`` pairs. Honoured flags: -p (comma-separated
            1-based page numbers), -m (max pages), -P (password), -n (no
            layout analysis), -A, -V, -M, -L, -W, -F (layout parameters),
            -Y (layout mode), -c (codec), -s (scale). The flags -d, -o, -O
            and -t are accepted and ignored.

    Returns:
        The HTML produced by the converter.

    Raises:
        PDFTextExtractionNotAllowed: when the document forbids extraction.
    """
    # Defaults, defined before the option loop so every flag has a target.
    password = ''
    pagenos = set()
    maxpages = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    laparams = LAParams()

    if opts is None:
        pairs = []
    elif isinstance(opts, dict):
        # Iterating a dict directly would yield keys only.
        pairs = opts.items()
    else:
        pairs = opts

    for (k, v) in pairs:
        if k == '-p':
            pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m':
            maxpages = int(v)
        elif k == '-P':
            password = v
        elif k == '-n':
            laparams = None
        elif k == '-A':
            laparams.all_texts = True
        elif k == '-V':
            laparams.detect_vertical = True
        elif k == '-M':
            laparams.char_margin = float(v)
        elif k == '-L':
            laparams.line_margin = float(v)
        elif k == '-W':
            laparams.word_margin = float(v)
        elif k == '-F':
            laparams.boxes_flow = float(v)
        elif k == '-Y':
            layoutmode = v
        elif k == '-c':
            codec = v
        elif k == '-s':
            scale = float(v)

    # open a PDF file
    fp = StringIO(pdfFile.read())
    retstr = StringIO()
    try:
        # create a PDF parser object associated with the file object
        parser = PDFParser(fp)
        # create a PDF document that allows text extraction
        document = PDFDocument(parser, password)
        # check if the document allows text extraction
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # create a PDF resource manager object that stores shared resources
        rsrcmgr = PDFResourceManager()
        # create a PDF device object
        device = HTMLConverter(rsrcmgr, retstr, codec=codec, scale=scale,
                               layoutmode=layoutmode, laparams=laparams)
        # create a PDF interpreter object
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # process each selected page contained in the document
        for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages,
                                      password=password):
            interpreter.process_page(page)
        # Close the device before reading the buffer so that anything it
        # writes on close is part of the returned content.
        device.close()
        return retstr.getvalue()
    finally:
        # close streams
        fp.close()
        retstr.close()

Esempio n. 21

0

Mostra file

File: stmtreader 2012-09-01.py Progetto: mdhatmaker/Trading-python

def convertPDF(outfile,pdfFile):
    """Convert the PDF ``pdfFile`` and write the result to ``outfile``.

    The output format follows the extension of ``outfile``: ``.htm`` and
    ``.html`` give HTML, ``.xml`` gives XML, ``.tag`` gives tag output and
    anything else gives plain text.

    Args:
        outfile: Path of the file to write; when falsy the result is written
            to ``sys.stdout``.
        pdfFile: Path of the PDF file to read.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()

    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)

    # Derive the output type from the file extension.
    outtype = 'text'
    if outfile:
        if outfile.endswith(('.htm', '.html')):
            outtype = 'html'
        elif outfile.endswith('.xml'):
            outtype = 'xml'
        elif outfile.endswith('.tag'):
            outtype = 'tag'

    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            # Only 'tag' remains: outtype is always one of the four values
            # assigned above, so ``device`` is never left unbound.
            device = TagExtractor(rsrcmgr, outfp, codec=codec)

        # The with-block closes the PDF even if processing raises.
        with open(pdfFile, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                        caching=caching, check_extractable=True)
        device.close()
    finally:
        # Never close the process-wide stdout stream.
        if outfp is not sys.stdout:
            outfp.close()
    return

Esempio n. 22

0

Mostra file

File: pdf2txt.py Progetto: wanshot/trdist

def main(argv):
    """Convert the PDF files named on the command line with pdfminer.

    Options are parsed getopt-style from ``argv``; one output device (text,
    XML, HTML or tag extractor) is created and every input file is fed
    through ``process_pdf`` into it.

    Args:
        argv: Full argument vector. ``argv[0]`` is only used as the program
            name in the usage message.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.
    """
    import getopt

    def usage():
        print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output] [-C] '
               '[-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin] [-F boxes_flow] '
               '[-Y layout_mode] [-O output_dir] [-t text|html|xml|tag] [-c codec] [-s scale] file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:], 'dp:m:P:o:CnAVM:L:W:F:Y:O:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    outdir = None
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop: dropping laparams immediately would make
    # any later -A/-V/-M/-L/-W/-F fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': outdir = v
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: infer the type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    outfp = open(outfile, 'w') if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, outdir=outdir)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr, outfp, codec=codec, scale=scale,
                                   layoutmode=layoutmode, laparams=laparams, outdir=outdir)
        else:
            device = TagExtractor(rsrcmgr, outfp, codec=codec)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
                                caching=caching, check_extractable=True)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return

Esempio n. 23

0

Mostra file

File: myPDF2txt.py Progetto: rizkiagungpermana/Converter-pdf-files-to-.txt-or-.html

import sys, getopt
 
#converts pdf, returns its text content as a string
def convert(case, fname, pages=None):
    """Convert a PDF file to text or HTML and return the converted content.

    Args:
        case: Output format selector, either ``'text'`` or ``'HTML'``.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When falsy, an
            empty set is passed to ``PDFPage.get_pages``.

    Returns:
        The content of the in-memory buffer, read before the converter is
        closed: ``io.StringIO.getvalue()`` for ``'text'`` and
        ``io.BytesIO.getvalue()`` for ``'HTML'``.

    Raises:
        ValueError: If ``case`` is neither ``'text'`` nor ``'HTML'``.
    """
    # Validate the selector first; previously an unknown value surfaced later
    # as an UnboundLocalError on `converter`.
    if case == 'text':
        output = io.StringIO()
        converter_cls = TextConverter
    elif case == 'HTML':
        output = io.BytesIO()
        converter_cls = HTMLConverter
    else:
        raise ValueError("case must be 'text' or 'HTML', got %r" % (case,))

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    converter = converter_cls(manager, output, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums, caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Same order as before: take the buffer content before closing.
        convertedPDF = output.getvalue()
    finally:
        converter.close()
        output.close()
    return convertedPDF
 
def convert_pdf_to_txt(path_to_file):
    # NOTE(review): this listing is cut off after the two statements below.
    # The visible part only creates a resource manager and an in-memory
    # buffer; path_to_file is not used yet and nothing is returned.
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()

Esempio n. 24

0

Mostra file

def main(argv):
  """Convert the PDF files named on the command line with pdfminer.

  Parses getopt-style options from ``argv``, initialises the CMap database,
  creates one output device (text, SGML, HTML or tag extractor) and feeds
  every input file through ``process_pdf``.

  Args:
    argv: Full argument vector. ``argv[0]`` is only used as the program
      name in the usage message.

  Returns:
    100 (after printing the usage text) when the options cannot be parsed,
    no input file is given, or the output type is unknown; otherwise None.
  """
  import getopt
  def usage():
    print ('usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-c codec] '
           '[-C cmapdir] [-D direction] [-M char_margin] [-L line_margin] [-W word_margin] '
           '[-s scale] [-t text|html|sgml|tag] [-o output] file ...' % argv[0])
    return 100
  try:
    # 's:' was missing from the spec, so the -s branch below could never run.
    (opts, args) = getopt.getopt(argv[1:], 'dp:P:c:D:M:L:W:t:o:C:m:s:')
  except getopt.GetoptError:
    return usage()
  if not args: return usage()
  # debug option
  debug = 0
  # path option
  cmapdir = find_cmap_path()
  # input option
  password = ''
  pagenos = set()
  maxpages = 0
  # output option
  outfile = None
  outtype = None
  codec = 'utf-8'
  scale = 1
  laparams = LAParams()
  for (k, v) in opts:
    if k == '-d': debug += 1
    elif k == '-C': cmapdir = v
    elif k == '-P': password = v
    elif k == '-p': pagenos.update( int(x)-1 for x in v.split(',') )
    elif k == '-m': maxpages = int(v)
    elif k == '-t': outtype = v
    elif k == '-c': codec = v
    elif k == '-o': outfile = v
    elif k == '-s': scale = float(v)
    elif k == '-D': laparams.direction = v
    elif k == '-M': laparams.char_margin = float(v)
    elif k == '-L': laparams.line_margin = float(v)
    elif k == '-W': laparams.word_margin = float(v)
  #
  CMapDB.debug = debug
  PDFResourceManager.debug = debug
  PDFDocument.debug = debug
  PDFParser.debug = debug
  PDFPageInterpreter.debug = debug
  PDFDevice.debug = debug
  #
  CMapDB.initialize(cmapdir)
  rsrc = PDFResourceManager()
  if not outtype:
    # No explicit -t: infer the type from the output file extension.
    outtype = 'text'
    if outfile:
      if outfile.endswith(('.htm', '.html')):
        outtype = 'html'
      elif outfile.endswith('.sgml'):
        outtype = 'sgml'
      elif outfile.endswith('.tag'):
        outtype = 'tag'
  # Reject an unknown type before the output file is created/truncated.
  if outtype not in ('text', 'sgml', 'html', 'tag'):
    return usage()
  outfp = open(outfile, 'w') if outfile else sys.stdout
  try:
    if outtype == 'text':
      device = TextConverter(rsrc, outfp, codec=codec, laparams=laparams)
    elif outtype == 'sgml':
      device = SGMLConverter(rsrc, outfp, codec=codec, laparams=laparams)
    elif outtype == 'html':
      device = HTMLConverter(rsrc, outfp, codec=codec, scale=scale, laparams=laparams)
    else:
      device = TagExtractor(rsrc, outfp, codec=codec)
    try:
      for fname in args:
        with open(fname, 'rb') as fp:
          process_pdf(rsrc, device, fp, pagenos, maxpages=maxpages, password=password)
    finally:
      device.close()
  finally:
    # The output file used to be left open; stdout is only flushed.
    if outfp is sys.stdout:
      outfp.flush()
    else:
      outfp.close()
  return

Esempio n. 25

0

Mostra file

File: PdfWordStat.py Progetto: linxiaohui/CodeRepoPy

def ConvertPdf(pdfpath, outfp, opts=None):
    """Convert one PDF file and write the result to an already-open stream.

    Args:
        pdfpath: Path of the PDF file to read.
        outfp: Writable file-like object handed to the pdfminer output
            device. This function does not call ``outfp.close()`` itself.
        opts: Optional conversion options using the pdf2txt flag names,
            given either as a mapping (``{'-t': 'html'}``) or as an iterable
            of ``(flag, value)`` pairs such as ``getopt`` returns. ``-o`` is
            accepted but ignored because the output stream is ``outfp``.

    Returns:
        True once every selected page has been processed.

    Raises:
        ValueError: If the requested output type is not one of
            ``txt``/``text``, ``xml``, ``html`` or ``tag``.
    """
    from pdfminer.pdfdocument import PDFDocument
    from pdfminer.pdfparser import PDFParser
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfdevice import PDFDevice, TagExtractor
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import XMLConverter, HTMLConverter, TextConverter
    from pdfminer.cmapdb import CMapDB
    from pdfminer.layout import LAParams
    from pdfminer.image import ImageWriter

    # Iterating a dict yields only its keys, so a mapping has to be expanded
    # into (flag, value) pairs explicitly.
    if opts is None:
        option_pairs = []
    elif hasattr(opts, 'items'):
        option_pairs = list(opts.items())
    else:
        option_pairs = list(opts)

    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so that layout flags seen after it do not
    # fail with AttributeError on None.
    use_layout = True
    for (k, v) in option_pairs:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': pass  # output always goes to outfp
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_layout:
        laparams = None

    # 'txt' stays the default; 'text' is accepted as an alias.
    if not outtype or outtype == 'text':
        outtype = 'txt'
    if outtype not in ('txt', 'xml', 'html', 'tag'):
        raise ValueError('unknown output type: %r' % (outtype,))
    #
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFDocument.debug = debug
    PDFParser.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    # Forward -C to the resource manager as well as to get_pages.
    rsrcmgr = PDFResourceManager(caching=caching)
    if outtype == 'txt':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    else:
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    try:
        with open(pdfpath, 'rb') as fp:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp,
                                          pagenos,
                                          maxpages=maxpages,
                                          password=password,
                                          caching=caching,
                                          check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)
    finally:
        device.close()

    return True

Esempio n. 26

0

Mostra file

File: paperminer.py Progetto: leon0707/conf-panda

def decode_pdf(filename):
    """Render the first page of a stored paper to HTML and analyze it.

    The module-level parsing state (section markers, font trackers, title,
    authors, abstract and keywords) is reset. One page of the PDF is then
    converted to HTML in memory, and each ``<br>``-separated piece is passed
    to ``analyze``.

    Args:
        filename: File name of the PDF, appended to
            ``basedir + "/static/demos/paperminer/papers/"``.

    Returns:
        A list ``[title, authors, abstract, keywords]`` where title, abstract
        and keywords are decoded from UTF-8 and authors is the module-level
        ``authors`` value.
    """
    global current_section
    global pre_section
    global pre_font_family
    global pre_font_size
    global title
    global authors
    global abstract
    global keywords

    current_section = ""
    pre_section = TAG_BEGIN
    pre_font_family = ""
    pre_font_size = ""
    title = ""
    authors = set()
    abstract = ""
    keywords = ""

    path = basedir + "/static/demos/paperminer/papers/" + filename
    # layout parameters
    laparams = LAParams()
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    outtype = 'html'
    codec = 'utf-8'
    # only process the first page; get_pages is limited through maxpages
    max_page = 1

    out = StringIO()
    try:
        # parse PDF to HTML
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   out,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=None)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  out,
                                  codec=codec,
                                  laparams=laparams,
                                  imagewriter=None)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   out,
                                   codec=codec,
                                   laparams=laparams,
                                   imagewriter=None)
        try:
            with open(path, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              set(),
                                              maxpages=max_page,
                                              password="",
                                              caching=caching,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            device.close()
        # str_value is the first PDF page in HTML (read after device.close(),
        # as before)
        str_value = out.getvalue()
    finally:
        out.close()

    # loop through each line in HTML
    for line in str_value.split('<br>'):
        analyze(line)
    result = [
        title.decode('utf-8'), authors,
        abstract.decode('utf-8'),
        keywords.decode('utf-8')
    ]

    return result

Esempio n. 27

0

Mostra file

def main(argv):
    """Batch-convert PDF files matched by one or more wildcard patterns.

    Every positional argument is expanded with ``glob.glob``. Each matching
    file is converted with pdfminer and written next to the source file,
    with the extension chosen by the output type (html -> .htm, tag -> .tag,
    text -> .txt, xml -> .xml). The default output type is ``tag``.

    Args:
        argv: Full argument vector. ``argv[0]`` is only used as the program
            name in the usage message.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no pattern is given, or the output type is unknown;
        otherwise None.
    """
    import getopt
    import os

    def usage():
        print('Syntax:\npdf2htm.exe SourcePDF\n where the parameter is either a file name or\na wildcard spec like\n*.pdf\nEnclose it with quotes if it contains a space\n\nAdditional options are supported with named command line parameters as follows:')
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = 'tag'
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so that layout flags seen after it do not
    # fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # Only reachable with an empty -t value; -o then merely hints the type.
        outtype = 'tag'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # The -o file is no longer opened: it was shadowed by the per-PDF output
    # below and never written to or closed.

    extensions = {'html': 'htm', 'tag': 'tag', 'text': 'txt', 'xml': 'xml'}
    # Validate once up front; an unknown type used to raise KeyError on the
    # extension lookup before the usage() branch could be reached.
    if outtype not in extensions:
        return usage()
    ext = '.' + extensions[outtype]

    for fname in args:
        matches = glob.glob(fname)
        print('Converting ' + str(len(matches)) + ' from ' + fname + ' to ' +
              outtype + ' format')
        for pdf in matches:
            # splitext copes with extensions that are not exactly 4 characters
            outfile = os.path.splitext(pdf)[0] + ext
            print(outfile)
            with open(outfile, 'wb') as outfp:
                if outtype == 'text':
                    device = TextConverter(rsrcmgr,
                                           outfp,
                                           codec=codec,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                elif outtype == 'xml':
                    device = XMLConverter(rsrcmgr,
                                          outfp,
                                          codec=codec,
                                          laparams=laparams,
                                          imagewriter=imagewriter)
                elif outtype == 'html':
                    device = HTMLConverter(rsrcmgr,
                                           outfp,
                                           codec=codec,
                                           scale=scale,
                                           layoutmode=layoutmode,
                                           laparams=laparams,
                                           imagewriter=imagewriter)
                else:
                    device = TagExtractor(rsrcmgr, outfp, codec=codec)
                device.showpageno = False
                try:
                    with open(pdf, 'rb') as fp:
                        interpreter = PDFPageInterpreter(rsrcmgr, device)
                        for page in PDFPage.get_pages(fp,
                                                      pagenos,
                                                      maxpages=maxpages,
                                                      password=password,
                                                      caching=caching,
                                                      check_extractable=True):
                            page.rotate = (page.rotate + rotation) % 360
                            interpreter.process_page(page)
                finally:
                    device.close()

        print('Done')
    return

Esempio n. 28

0

Mostra file

File: my_pdf2txt.py Progetto: Vadim88226/cv-parser

def main(argv):
    """Convert the PDF files named on the command line with pdfminer.

    Options are parsed getopt-style from ``argv``; one output device (text,
    XML, HTML or tag extractor) is created and every page of every input
    file is rendered into it, rotated by the ``-R`` amount.

    Args:
        argv: Full argument vector. ``argv[0]`` is only used as the program
            name in the usage message.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown;
        otherwise None.
    """
    import getopt

    def usage():
        print(
            f'usage: {argv[0]} [-P password] [-o output] [-t text|html|xml|tag]'
            ' [-O output_dir] [-c encoding] [-s scale] [-R rotation]'
            ' [-Y normal|loose|exact] [-p pagenos] [-m maxpages]'
            ' [-S] [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin]'
            ' [-W word_margin] [-F boxes_flow] [-d] input.pdf ...')
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dP:o:t:O:c:s:R:Y:p:m:SCnAVM:W:L:F:')
    except getopt.GetoptError:
        return usage()
    if not args:
        return usage()
    # debug option
    debug = 0
    # input option
    password = b''
    pagenos = set()
    maxpages = 0
    # output option
    outfile = None
    outtype = None
    imagewriter = None
    rotation = 0
    stripcontrol = False
    layoutmode = 'normal'
    encoding = 'utf-8'
    scale = 1
    caching = True
    laparams = LAParams()
    # -n is applied after the loop so that layout flags seen after it do not
    # fail with AttributeError on None.
    use_layout = True
    for (k, v) in opts:
        if k == '-d': debug += 1
        elif k == '-P': password = v.encode('ascii')
        elif k == '-o': outfile = v
        elif k == '-t': outtype = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-c': encoding = v
        elif k == '-s': scale = float(v)
        elif k == '-R': rotation = int(v)
        elif k == '-Y': layoutmode = v
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-S': stripcontrol = True
        elif k == '-C': caching = False
        elif k == '-n': use_layout = False
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
    if not use_layout:
        laparams = None
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # No explicit -t: infer the type from the output file extension.
        outtype = 'text'
        if outfile:
            if outfile.endswith(('.htm', '.html')):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    # Reject an unknown type before the output file is created/truncated.
    if outtype not in ('text', 'xml', 'html', 'tag'):
        return usage()
    outfp = open(outfile, 'w', encoding=encoding) if outfile else sys.stdout
    try:
        if outtype == 'text':
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   laparams=laparams,
                                   imagewriter=imagewriter)
        elif outtype == 'xml':
            device = XMLConverter(rsrcmgr,
                                  outfp,
                                  laparams=laparams,
                                  imagewriter=imagewriter,
                                  stripcontrol=stripcontrol)
        elif outtype == 'html':
            device = HTMLConverter(rsrcmgr,
                                   outfp,
                                   scale=scale,
                                   layoutmode=layoutmode,
                                   laparams=laparams,
                                   imagewriter=imagewriter,
                                   debug=debug)
        else:
            device = TagExtractor(rsrcmgr, outfp)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    for page in PDFPage.get_pages(fp,
                                                  pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    finally:
        # Never close the interpreter's stdout; only flush it.
        if outfp is sys.stdout:
            outfp.flush()
        else:
            outfp.close()
    return

Esempio n. 29

0

Mostra file

File: preprocessing.py Progetto: HAPPY-CHANDRU-RAJU/resume-analyser

    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words


def convert_to_html(case, fname, pages=None):
    """Convert a PDF file to HTML and return the generated bytes.

    Args:
        case: Output format selector; only ``'HTML'`` is supported.
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers to extract. When falsy, an
            empty set is passed to ``PDFPage.get_pages``.

    Returns:
        The content of the in-memory ``io.BytesIO`` buffer, read before the
        converter is closed. (The value used to be computed and discarded.)

    Raises:
        ValueError: If ``case`` is not ``'HTML'``.
    """
    # Fail early with a clear error; any other value used to surface later
    # as an UnboundLocalError on `converter`.
    if case != 'HTML':
        raise ValueError("case must be 'HTML', got %r" % (case,))

    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()
    output = io.BytesIO()
    converter = HTMLConverter(manager, output, laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(manager, converter)
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile,
                                          pagenums,
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        # Same order as before: take the buffer content before closing.
        convertedPDF = output.getvalue()
    finally:
        converter.close()
        output.close()
    return convertedPDF

Esempio n. 30

0

Mostra file

File: braille.py Progetto: Prawnja167/PDF2Braille

def convert(argv):
    """Convert PDFs to an intermediate file, then translate that file via ``brl``.

    Options are parsed getopt-style from ``argv``. Every positional argument
    is opened as ``'pdfs/' + fname`` and rendered through a pdfminer device
    into the output file. Afterwards ``inputs/<name>.txt`` is read back,
    passed through ``brl.translate`` and ``brl.toUnicodeSymbols``, and the
    result is written to ``results/<name>-Braille.txt``. ``<name>`` is
    ``sys.argv[1]`` with spaces removed and its last four characters dropped.

    Args:
        argv: Full argument vector. ``argv[0]`` is only used as the program
            name in the usage message.

    Returns:
        100 (after printing the usage text) when the options cannot be
        parsed, no input file is given, or the output type is unknown.
        No other return statement is visible in this excerpt.
    """
    def usage():
        # Print the supported options and return the error code 100.
        print(
            'usage: %s [-d] [-p pagenos] [-m maxpages] [-P password] [-o output]'
            ' [-C] [-n] [-A] [-V] [-M char_margin] [-L line_margin] [-W word_margin]'
            ' [-F boxes_flow] [-Y layout_mode] [-O output_dir] [-R rotation]'
            ' [-t text|html|xml|tag] [-c codec] [-s scale]'
            ' file ...' % argv[0])
        return 100

    try:
        (opts, args) = getopt.getopt(argv[1:],
                                     'dp:m:P:o:CnAVM:L:W:F:Y:O:R:t:c:s:')
    except getopt.GetoptError:
        return usage()
    if not args: return usage()
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    # NOTE(review): the default path is built from sys.argv[1], not from the
    # argv parameter; if argv[1] is an option flag the name is derived from
    # that flag - confirm this is intended.
    outfile = 'inputs/' + sys.argv[1].replace(' ', '')[:-4] + '.txt'
    outtype = None
    imagewriter = None
    rotation = 0
    layoutmode = 'normal'
    codec = 'utf-8'
    pageno = 1  # not used in the visible code
    scale = 1
    caching = True
    showpageno = True  # not used in the visible code
    laparams = LAParams()
    for (k, v) in opts:
        if k == '-d': debug += 1
        # page numbers are given 1-based on the command line and stored 0-based
        elif k == '-p': pagenos.update(int(x) - 1 for x in v.split(','))
        elif k == '-m': maxpages = int(v)
        elif k == '-P': password = v
        elif k == '-o': outfile = v
        elif k == '-C': caching = False
        # NOTE(review): after -n, laparams is None, so a later -A/-V/-M/-L/-W/-F
        # would raise AttributeError.
        elif k == '-n': laparams = None
        elif k == '-A': laparams.all_texts = True
        elif k == '-V': laparams.detect_vertical = True
        elif k == '-M': laparams.char_margin = float(v)
        elif k == '-L': laparams.line_margin = float(v)
        elif k == '-W': laparams.word_margin = float(v)
        elif k == '-F': laparams.boxes_flow = float(v)
        elif k == '-Y': layoutmode = v
        elif k == '-O': imagewriter = ImageWriter(v)
        elif k == '-R': rotation = int(v)
        elif k == '-t': outtype = v
        elif k == '-c': codec = v
        elif k == '-s': scale = float(v)
    # propagate the debug level to the pdfminer classes
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    if not outtype:
        # no explicit -t: infer the type from the output file extension
        outtype = 'text'
        if outfile:
            if outfile.endswith('.htm') or outfile.endswith('.html'):
                outtype = 'html'
            elif outfile.endswith('.xml'):
                outtype = 'xml'
            elif outfile.endswith('.tag'):
                outtype = 'tag'
    if outfile:
        outfp = file(outfile, 'w')
    else:
        outfp = sys.stdout
    if outtype == 'text':
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'xml':
        device = XMLConverter(rsrcmgr,
                              outfp,
                              codec=codec,
                              laparams=laparams,
                              imagewriter=imagewriter)
    elif outtype == 'html':
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               scale=scale,
                               layoutmode=layoutmode,
                               laparams=laparams,
                               imagewriter=imagewriter)
    elif outtype == 'tag':
        device = TagExtractor(rsrcmgr, outfp, codec=codec)
    else:
        # NOTE(review): outfp has already been opened at this point and is
        # not closed on this path.
        return usage()
    for fname in args:
        # input files are looked up relative to the 'pdfs/' directory
        fp = file('pdfs/' + fname, 'rb')
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in PDFPage.get_pages(fp,
                                      pagenos,
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True):
            # apply the extra -R rotation on top of the page's own rotation
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
        fp.close()
    device.close()
    outfp.close()

    # Read the converted file back.
    # NOTE(review): this always reopens the default inputs/ path, even when
    # -o redirected the conversion output elsewhere.
    y = open("inputs/" + sys.argv[1].replace(' ', '')[:-4] + '.txt', "r")
    output = brl.translate(y.read())
    # Convert into Grade 2 Braille unicode (per the original comment; brl is
    # defined outside this excerpt).
    x = brl.toUnicodeSymbols(output, flatten=True)
    # Save to the results folder in .txt format.
    text_file = open(
        "results/" + sys.argv[1].replace(' ', '')[:-4] + "-Braille.txt", "w")
    text_file.write(x.encode(codec))
    text_file.close()