Beispiel #1
0
    def pdf2txt(self, lowerBorder=-1, upperBorder=-1):
        """
        Return the plain text of the document, decoded from UTF-8.

        Page selection (numbers are handed to ``PDFPage.get_pages`` as-is):
        - both borders -1: every page is processed;
        - lowerBorder > -1 and upperBorder == -1: only that single page;
        - lowerBorder > -1 and upperBorder == "max": every page whose
          enumeration index is >= lowerBorder;
        - both > -1 with lowerBorder <= upperBorder: the inclusive range.

        Raises ValueError("illegal parameter passed") for any other
        combination.
        """
        # Validate the arguments before acquiring anything, so an illegal
        # combination can no longer leave an open file handle behind.
        to_end = upperBorder == "max"
        if (lowerBorder == -1 and upperBorder == -1) or (lowerBorder > -1 and to_end):
            pagenos = set()
        elif lowerBorder > -1 and upperBorder == -1:
            # extract only a single page
            pagenos = set(range(lowerBorder, lowerBorder + 1))
        elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
            raise ValueError("illegal parameter passed")
        else:
            pagenos = set(range(lowerBorder, upperBorder + 1))

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        fp = open(self.filename, 'rb')
        try:
            pages = PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for pageno, page in enumerate(pages):
                # In "max" mode pagenos is empty, so skip the leading pages here.
                if to_end and pageno < lowerBorder:
                    continue
                interpreter.process_page(page)
        finally:
            # Release the handles even when a page fails to parse.
            fp.close()
            device.close()
        s = retstr.getvalue()
        retstr.close()
        return s.decode('utf-8')
Beispiel #2
0
def convert(url, pages=None):
    """
    Download the PDF at ``url`` and return the text extracted from it.

    url   -- string URL of the PDF (asserted to be a ``basestring``).
    pages -- optional list of page numbers handed to ``PDFPage.get_pages``;
             ``None`` passes an empty set.

    Returns the contents of the output buffer after all pages were processed.
    """
    assert isinstance(url, basestring)
    assert pages is None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rscmng, device)

    # Read the whole response into memory and release the connection at once.
    web_page = urllib2.urlopen(urllib2.Request(url))
    try:
        fp = StringIO(web_page.read())
    finally:
        web_page.close()

    try:
        pdf_pages = PDFPage.get_pages(
            fp,
            set(pages if pages is not None else []),
            maxpages=0,
            password='',
            caching=True,
            check_extractable=True
        )
        for page in pdf_pages:
            interpreter.process_page(page)
        result = retstr.getvalue()
    finally:
        # Runs on failure too, so a broken PDF does not leak the buffers.
        fp.close()
        device.close()
        retstr.close()

    return result
Beispiel #3
0
    def parse(self, path):
        """
        Extract the text of the PDF file at ``path`` and wrap it in a ``Sample``.

        Raises NotImplementedError when ``path`` is a directory.
        Returns ``Sample(path, None, <extracted text>)``.
        """
        # Directory
        if os.path.isdir(path):
            raise NotImplementedError()

        out = StringIO.StringIO()
        rsrc = PDFResourceManager()
        laparams = LAParams()
        laparams.char_margin = 2.0
        laparams.line_margin = 2.0
        laparams.word_margin = 0.0
        device = TextConverter(rsrc, out, codec='utf-8', laparams=laparams)

        # File: PDFs are binary data, so open in 'rb'. The original opened in
        # text mode and never closed the handle.
        fp = open(path, 'rb')
        try:
            doc = PDFDocument()
            parser = PDFParser(fp)
            parser.set_document(doc)
            doc.set_parser(parser)
            doc.initialize()
            interpreter = PDFPageInterpreter(rsrc, device)
            for page in doc.get_pages():
                interpreter.process_page(page)
            device.close()
            sample = Sample(path, None, out.getvalue())
        finally:
            fp.close()
            out.close()
        return sample
Beispiel #4
0
def convert(fname, pages=None):
    """
    Return the text extracted from the PDF file ``fname``.

    pages -- optional iterable of page numbers handed to
             ``PDFPage.get_pages``; a falsy value passes an empty set.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    infile = open(fname, 'rb')
    try:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    finally:
        # Close the input even if a page cannot be processed.
        infile.close()
        converter.close()
    text = output.getvalue()
    output.close()
    return text
Beispiel #5
0
def get_pdf_text(path):
    """ Reads a pdf file and returns a dict of the text where the
        index represents the page number.
        http://stackoverflow.com/a/20905381

        The dict is the ``pages`` object handed to ``TextConverter``; this
        function never writes to it itself.
        NOTE(review): presumably a project-specific TextConverter fills it --
        confirm that the converter in use accepts a ``pages`` keyword.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # change to utf-8 if the text comes out garbled
    codec = 'ascii'
    #codec = 'utf-8'
    pages = {}
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams(),
                           showpageno=True, pages=pages)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, 'rb')
    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        # Release everything even when a page fails to parse.
        fp.close()
        device.close()
        retstr.close()
    return pages
    def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file

        Returns the extracted text, or "" when anything goes wrong; in that
        case the failure is reported through ``self.logger.error``.
        """
        fp = None
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            fp = open(path, 'rb')
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
            device.close()
            retstr.close()
            return text
        except Exception as e:
            # Log before returning: the original placed this call after the
            # return statement, so it could never run.
            self.logger.error(
                "Failed to PDF to text: " + str(e))
            return ""
        finally:
            if fp is not None:
                fp.close()
def convert_pdf_to_txt(path, output):
    """
    Extract the text of the PDF at ``path``, write it to the file ``output``
    (opened in 'wb' mode) and return it.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, 'rb')
    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the handles even when a page fails to parse.
        fp.close()
        device.close()
        retstr.close()

    # The with-block closes the output file even if the write fails.
    with open(output, 'wb') as f:
        f.write(text)
    return text
Beispiel #8
0
def convert_pdf_to_txt(path):
    """
    Return the text extracted from the PDF file at ``path``.

    Taken from Stack Overflow; see
    http://www.unixuser.org/~euske/python/pdfminer/programming.html for a tutorial and
    https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    laparams = LAParams()

    fp = open(path, 'rb')
    try:
        # Read text from pages
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Named "text" so the builtin str is no longer shadowed.
            text = retstr.getvalue()
        finally:
            device.close()
    finally:
        fp.close()
        retstr.close()

    return text
Beispiel #9
0
	def getTexts(self):
		"""
		Extract the text of the PDF ``self.fname`` into ``self.text``.

		The text is routed through the scratch file 'temppdf.txt' in the
		current working directory, which is removed again afterwards.

		Returns the string "ok" on success; on any failure the caught
		exception object is returned instead of being raised.
		"""
		tmp_name = 'temppdf.txt'
		try:
			try:
				caching = True
				rsrcmgr = PDFResourceManager(caching=caching)
				outfp = open(tmp_name, 'w')
				try:
					device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
					fp = open(self.fname, 'rb')
					try:
						process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='', caching=caching, check_extractable=True)
					finally:
						fp.close()
					device.close()
				finally:
					outfp.close()
				infp = open(tmp_name, 'rb')
				try:
					text = infp.read()
				finally:
					infp.close()
			finally:
				# Never leave the scratch file behind, not even on failure.
				if os.path.exists(tmp_name):
					os.remove(tmp_name)
			self.text = text
			return "ok"
		except Exception as e:
			return e
Beispiel #10
0
def convert_pdf_to_txt(path):
    """
    Return the text extracted from the PDF file at ``path``.

    ``maxpages=120`` is passed to ``PDFPage.get_pages``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, 'rb')
    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=120, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        # Closing stays best-effort, as in the original, but no longer uses a
        # bare except that would also swallow KeyboardInterrupt/SystemExit.
        for closer in (fp.close, device.close):
            try:
                closer()
            except Exception:
                pass

    text = retstr.getvalue()
    retstr.close()
    return text
def convert_pdf(path='provide path here', format='text', codec='utf-8'):
    """
    Extract bullet points from a PDF and save them as pretty-printed JSON
    inside a Word document.

    path   -- path of the PDF file to read.
    format -- only 'text' is supported; anything else raises ValueError.
    codec  -- codec given to ``TextConverter`` and used to decode its output.

    Side effects: writes 'json_data.docx' to the current directory and, where
    ``os.startfile`` exists, opens it. Always returns ''.
    """
    if format != 'text':
        raise ValueError('Please provide the format to extract')

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    maxpages = 500 #mention the maximum pages here (Note: Large number of pages will decrease the performance.)
    try:
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=maxpages, caching=True, check_extractable=True):
                interpreter.process_page(page)
        # Decode with the codec the converter was given, not the default one.
        text = retstr.getvalue().decode(codec)
    finally:
        device.close()
        retstr.close()

    bulletins_data = re.findall('•([^•]+)*', text)
    json_dict = {'bulletins': list(bulletins_data)}
    final_data = json.dumps(json_dict, indent=4, sort_keys=True) #creates a pretty json with the data extracted

    document = Document()  # creates a new document
    document.add_heading('Bulletins data in the PDF')
    document.add_paragraph(final_data)
    document.save('json_data.docx')  # saves it to the filesystem
    # os.startfile is not available on every platform; skip it where missing.
    if hasattr(os, 'startfile'):
        os.startfile("json_data.docx")  # will open the file
    return ''
Beispiel #12
0
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner even though more low-level but pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf

    """
    # initializing objects
    res_manager = PDFResourceManager()
    strio = StringIO()
    device = TextConverter(res_manager, strio, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(res_manager, device)
    # opening a pdf file with 'rb' mode for reading binary files
    pdf_file = open(pdf, 'rb')
    try:
        for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        # finishing up -- also when a page could not be processed
        pdf_file.close()
        device.close()
    text = strio.getvalue()
    strio.close()
    return text
Beispiel #13
0
    def run(path):
        """
        Parse the PDF at ``path`` into a ``Book`` holding one ``Page`` per page.

        Every page's ``text`` is the part of the shared output buffer written
        while that page was processed, without its final character.
        Prints the path at the start and the elapsed ``time.clock()`` time at
        the end.
        """
        print("Calling parser :%s" % path)

        t0 = time.clock()

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        book = Book()

        fp = open(path, 'rb')
        try:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                          check_extractable=True):
                page_tmp = Page()
                # Remember where this page's output starts in the buffer.
                begin_page = len(retstr.getvalue())
                interpreter.process_page(page)
                page_tmp.text = retstr.getvalue()[begin_page:-1]
                book.pages.append(page_tmp)
        finally:
            # Release the handles even when a page fails to parse.
            fp.close()
            device.close()
            retstr.close()
        print("Parsing in: %s" % (time.clock() - t0,))
        return book
Beispiel #14
0
def get_text(path):
    """
    Return the text for the document named ``path`` (given without extension).

    If ``path + '.txt'`` exists its contents are returned directly. Otherwise
    the text is extracted from ``path + '.pdf'``, passed to
    ``write_text(txt_path, text)`` and returned.
    """
    txt_path = path + '.txt'

    if os.path.isfile(txt_path):
        # The with-block closes the cached file; the original leaked it.
        with open(txt_path) as cached:
            return cached.read()

    path = path + '.pdf'
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, 'rb')
    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()
    # Named "text" so the builtin str is no longer shadowed.
    text = retstr.getvalue()
    retstr.close()

    write_text(txt_path, text)

    return text
Beispiel #15
0
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library

    path -- path of the PDF file.
    Returns the contents of the output buffer after all pages were processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    file_handle = open(path, "rb")
    try:
        for page in PDFPage.get_pages(
            file_handle, set(), maxpages=0, password="", caching=True, check_extractable=True
        ):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the handles even when a page fails to parse.
        file_handle.close()
        device.close()
        retstr.close()
    return text
Beispiel #16
0
def pdf_from_url_to_txt(url, maxpages=0):
    """
    Download the PDF at ``url`` and return the text extracted from it.

    maxpages -- forwarded unchanged to ``PDFPage.get_pages``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    # Open the url provided as an argument to the function and read the
    # content; the response is closed as soon as the body has been read.
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        f = response.read()
    finally:
        response.close()
    # Wrap the downloaded bytes in a file-like object
    fp = StringIO(f)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=maxpages,
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()
    string = retstr.getvalue()
    retstr.close()
    return string
Beispiel #17
0
def pdf_to_text(pdfname):
    """
    Return the text extracted from the PDF file ``pdfname``.

    The converter is created with the 'ascii' codec. All pdfminer names are
    imported locally inside the function.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    # codec = 'utf-8'
    codec = 'ascii'
    device = TextConverter(rsrcmgr, sio, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text
    fp = open(pdfname, 'rb')
    try:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
    finally:
        # Close the input even if a page cannot be processed.
        fp.close()

    # Get text from StringIO
    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text
Beispiel #18
0
def edit_file(fname, pages=None):
    """
    Render an HTML form whose textarea holds the contents of an uploaded file.

    fname -- file name inside ``app.config['UPLOAD_FOLDER']``.
    pages -- optional iterable of page numbers for PDF files.

    Redirects to the 'index' endpoint when 'log_in' is not in the session.
    Files whose extension is 'pdf' are converted to text first; every other
    file is inserted as read. Returns the HTML page as a string.
    """
    # Both helpers are standard library and exist on Python 2 and 3.
    from xml.sax.saxutils import escape, quoteattr

    if 'log_in' not in session.keys():
        return redirect(url_for('index'))
    #f = send_from_directory(app.config['UPLOAD_FOLDER'],fname)
    filename = fname
    # NOTE(review): fname is joined into UPLOAD_FOLDER unchecked -- confirm the
    # route cannot deliver '..' path segments.
    fname = os.path.join(app.config['UPLOAD_FOLDER'], fname)
    # splitext looks at the last dot of the final path component only. The
    # original split('.')[1] broke on names with several dots, on dots in the
    # upload folder path, and raised IndexError without any dot.
    exten = os.path.splitext(fname)[1][1:]
    print(exten)
    if exten != 'pdf':
        with open(fname, 'rb') as raw:
            text = raw.read()
    else:
        pagenums = set(pages) if pages else set()

        output = StringIO()
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        interpreter = PDFPageInterpreter(manager, converter)

        infile = open(fname, 'rb')
        try:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        finally:
            infile.close()
            converter.close()
        text = output.getvalue()
        # The original wrote "output.close" without parentheses (a no-op).
        output.close()
    print(filename)
    # File contents and file name come from uploads, so escape them before
    # embedding them in markup. quoteattr also adds the surrounding quotes.
    return ('<!doctype html><title>Edit File</title><h1>Upload new File</h1><form action="/save" method=post><p><textarea name="contents" rows=30 cols = 150 autofocus>'
            + escape(text)
            + '</textarea><br /><input type=hidden name=filename value='
            + quoteattr(str(filename))
            + '> <input type=submit value=Upload></form></html>')
def convert_pdf_to_txt(path):
    """
    Extract the text of the PDF at ``path`` and write it next to the PDF,
    using the same name with the extension replaced by ".txt".

    Prints the output file name. Returns None.
    """
    temp = os.path.splitext(path)

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, "rb")
    try:
        for page in PDFPage.get_pages(
            fp, set(), maxpages=0, password="", caching=True, check_extractable=True
        ):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the handles even when a page fails to parse.
        fp.close()
        device.close()
        retstr.close()

    outputFile = temp[0] + ".txt"
    print(outputFile)

    # The with-block closes the output file even if the write fails.
    with open(outputFile, "w") as ff:
        ff.write(text)
def pdf2txt(path):
    '''
    Converts a given PDF to plain text in UTF8.

    Returns the extracted text, or None when any error occurs while reading
    or converting the file.
    '''
    fp = None
    try:
        rsrcMgr = PDFResourceManager()
        retStr = StringIO()
        device = TextConverter(rsrcMgr, retStr, codec='utf-8', laparams=LAParams())
        fp = open(path, 'rb')
        interpreter = PDFPageInterpreter(rsrcMgr, device)
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
        device.close()
        text = retStr.getvalue()
        retStr.close()
        return text
    except Exception:
        # "except Exception" keeps the best-effort contract without also
        # swallowing KeyboardInterrupt/SystemExit as the bare except did.
        return None
    finally:
        if fp is not None:
            fp.close()
Beispiel #21
0
def pdf_to_txt(path):
    """converts pdf into a string
    @param path: path to the file
    @type path: string

    @return: pdf content with all form-feed characters ('\\x0c') removed
    @rtype: string"""

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, 'rb')
    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        # Release the handles even when a page fails to parse.
        fp.close()
        device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.replace('\x0c', '')
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """

    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    fp = open(pdf_filename, 'rb')
    try:
        for page in PDFPage.get_pages(fp, set()):
            interpreter.process_page(page)
    finally:
        # Close the input even if a page cannot be processed.
        fp.close()

    device.close()
    extracted_text = return_string.getvalue()
    return_string.close()

    return extracted_text
Beispiel #23
0
def pdfconvert(infullpath, file, outfullpath, pages=None):         #Handle PDF
    """
    Write the text of a PDF to a .txt file and render the PDF to a .jpg.

    infullpath  -- path of the PDF to read.
    file        -- accepted for interface compatibility; not used.
    outfullpath -- output path; its extension is replaced by '.txt' for the
                   text and by '.jpg' for the image.
    pages       -- optional iterable of page numbers; falsy passes an empty set.

    The image is produced by running the external ``convert`` command.
    Returns None.
    """
    import subprocess

    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    pdffile = open(infullpath, 'rb')
    try:
        for page in PDFPage.get_pages(pdffile, pagenums):
            interpreter.process_page(page)
    finally:
        pdffile.close()
        converter.close()

    base = os.path.splitext(outfullpath)[0]
    jpgfile = base + '.jpg'
    txtfile = base + '.txt'
    # NOTE(review): the original called string.replace(txtfile, ...) three
    # times but discarded every result, so the name was never changed. Those
    # no-op calls are dropped and the effective file name stays the same.
    text = output.getvalue()
    # The original wrote "output.close" without parentheses (a no-op).
    output.close()
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list keeps paths containing quotes or shell metacharacters
    # from breaking, or injecting into, the command line.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # "convert" could not be started; os.system only returned a non-zero
        # status in that case, so the step stays best-effort.
        pass
Beispiel #24
0
def pdf_to_txt(fichero_pdf,fichero_txt):
    """
    Extract the text of the PDF ``fichero_pdf`` into the file ``fichero_txt``.

    The output file is opened in 'w' mode. Returns 1.
    """
    # Settings for reading the pdf
    password = ''
    pagenos = set()
    maxpages = 0

    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()

    # Set up the resource manager
    rsrcmgr = PDFResourceManager(caching=caching)

    # Create the output file and bind it to the converting device
    outfp = open(fichero_txt, 'w')
    try:
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams, imagewriter=imagewriter)
        try:
            # Interpret every page of the pdf through the device
            fp = open(fichero_pdf, 'rb')
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,
                                              caching=caching, check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
            finally:
                fp.close()
        finally:
            device.close()
    finally:
        # Close the output even when conversion fails
        outfp.close()

    return 1
Beispiel #25
0
    def __convert(self, ifile, ofile=None):
        """
        Extract the text of the PDF ``ifile``.

        When ``ofile`` is None the text is collected in memory and returned;
        otherwise it is written to ``ofile`` (opened in 'wb') and None is
        returned. Settings are read from self.caching, codec, laparams,
        imagewriter, pagenos, maxpages, password and rotation.

        PDFException and MemoryError raised while processing pages are
        printed, not propagated; whatever was extracted so far is kept.
        """
        fp = open(ifile, 'rb')
        try:
            if ofile is None:
                outfp = StringIO.StringIO()
            else:
                outfp = open(ofile, 'wb')
            try:
                rsrcmgr = PDFResourceManager(caching=self.caching)
                device = TextConverter(rsrcmgr, outfp, codec=self.codec, laparams=self.laparams,
                                       imagewriter=self.imagewriter)

                interpreter = PDFPageInterpreter(rsrcmgr, device)
                try:
                    for page in PDFPage.get_pages(fp, self.pagenos,
                                                  maxpages=self.maxpages, password=self.password,
                                                  caching=self.caching, check_extractable=True):
                        page.rotate = (page.rotate + self.rotation) % 360
                        interpreter.process_page(page)
                except (PDFException, MemoryError) as e:
                    print("Could not extract text {0}".format(e))
                device.close()
                retval = None
                if ofile is None:
                    retval = outfp.getvalue()
                return retval
            finally:
                # Runs for unexpected exceptions too, so neither handle leaks.
                outfp.close()
        finally:
            fp.close()
Beispiel #26
0
def pdf_to_txt(path, lowerBorder=-1, upperBorder=-1):
    """
    Return the text of the PDF at ``path``, decoded from UTF-8.

    With both borders left at -1 an empty page set is passed to
    ``PDFPage.get_pages``. Otherwise both borders must be given with
    lowerBorder <= upperBorder, and the inclusive range of page numbers is
    passed. Any other combination raises
    ValueError("illegal parameter passed").
    """
    # Validate before opening the file so a bad argument cannot leak a handle.
    if lowerBorder == -1 and upperBorder == -1:
        pagenos = set()
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, 'rb')
    try:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()
        device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.decode('utf-8')
Beispiel #27
0
def pdfconvert(infullpath, file, infolder, pages=None):         #Handle PDF
    """
    Write the text of a PDF into the corpus folder and render it to a .jpg.

    infullpath -- path of the PDF to read.
    file       -- base name used for both output files.
    infolder   -- prefix for the image path (``infolder + file + '.jpg'``).
    pages      -- optional iterable of page numbers; falsy passes an empty set.

    The text goes to ``corpuspath + corpusfolder + '/' + file + '.txt'`` (both
    names are module-level values). The image is produced by running the
    external ``convert`` command. Returns the image path.
    """
    import subprocess

    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    pdffile = open(infullpath, 'rb')
    try:
        for page in PDFPage.get_pages(pdffile, pagenums):
            interpreter.process_page(page)
    finally:
        pdffile.close()
        converter.close()

    jpgfile = infolder + str(file) + '.jpg'
    txtfile = corpuspath + corpusfolder + '/' + file + '.txt'

    text = output.getvalue()
    # The original wrote "output.close" without parentheses (a no-op).
    output.close()
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list keeps paths containing quotes or shell metacharacters
    # from breaking, or injecting into, the command line.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # "convert" could not be started; os.system only returned a non-zero
        # status in that case, so the step stays best-effort.
        pass

    return jpgfile
Beispiel #28
0
def pdf_to_text(pdf):
    """
    Extract the text of a PDF, print it and return it.

    pdf -- a path (``str``), which is opened in 'rb' mode, or an already
           opened file-like object, which is used directly.

    The source is closed afterwards in both cases. The return value is the
    raw content of the ``BytesIO`` output buffer.
    """
    layout = LAParams()
    use_cache = True
    extra_rotation = 0

    manager = PDFResourceManager(caching=use_cache)
    buf = BytesIO()
    converter = TextConverter(manager, buf, codec='utf-8', laparams=layout)

    if isinstance(pdf, str):
        source = open(pdf, 'rb')
    else:
        source = pdf

    engine = PDFPageInterpreter(manager, converter)
    page_iter = PDFPage.get_pages(source, set(),
                                  maxpages=0,
                                  caching=use_cache, check_extractable=True)
    for pdf_page in page_iter:
        pdf_page.rotate = (pdf_page.rotate + extra_rotation) % 360
        engine.process_page(pdf_page)

    source.close()
    converter.close()
    result = buf.getvalue()
    print(result)
    return result
def convert_pdf_to_txt(path):
    """
    This function converts a .pdf file to text
    @path: file path to .pdf document

    Returns the contents of the output buffer after all pages were processed.

    from: http://stackoverflow.com/questions/26494211/
    extracting-text-from-a-pdf-file-using-pdfminer-in-python/26495057#26495057

    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = open(path, 'rb')
    try:
        for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # Release the handles even when a page fails to parse.
        fp.close()
        device.close()
        retstr.close()
    return text
Beispiel #30
0
def convert_pdf_to_txt(path):
    """
    Return the text of the PDF file at ``path``.

    Uses the parser/document pairing API: a ``PDFParser`` and a
    ``PDFDocument`` are linked to each other, and the document is initialised
    with an empty password before its pages are processed.
    """
    manager = PDFResourceManager()
    sink = StringIO()
    layout = LAParams()
    converter = TextConverter(manager, sink, laparams=layout)

    with open(path, 'rb') as handle:
        pdf_parser = PDFParser(handle)
        document = PDFDocument(caching=True)

        # Parser and document must know about each other before initialize().
        pdf_parser.set_document(document)
        document.set_parser(pdf_parser)
        document.initialize('')

        engine = PDFPageInterpreter(manager, converter)
        for pdf_page in document.get_pages():
            engine.process_page(pdf_page)
        extracted = sink.getvalue()

    converter.close()
    sink.close()

    return extracted
def extract_text_from_pdf(pdf_path):
    '''
    Read the PDF file at pdf_path and return its text as a string.

    Returns None when the extracted text is empty.
    '''
    manager = PDFResourceManager()
    text_buffer = io.StringIO()
    device = TextConverter(manager, text_buffer)
    interpreter = PDFPageInterpreter(manager, device)

    with open(pdf_path, 'rb') as pdf_file:
        pdf_pages = PDFPage.get_pages(pdf_file, caching=True,
                                      check_extractable=True)
        for pdf_page in pdf_pages:
            interpreter.process_page(pdf_page)

        contents = text_buffer.getvalue()

    # release the converter and the in-memory buffer
    device.close()
    text_buffer.close()

    return contents if contents else None
Beispiel #32
0
def convert_pdf_to_txt(r, max_pages=3):
    """
    Extract text from the PDF carried by the response object ``r``.

    r         -- object providing ``status_code``, ``encoding`` and
                 ``content_big()``.
    max_pages -- forwarded as ``maxpages`` to ``PDFPage.get_pages``.

    Returns None (after logging) when ``r.status_code`` is not 200, otherwise
    the raw content of the ``BytesIO`` output buffer. A missing
    ``r.encoding`` is set to "utf-8" as a side effect.
    """
    # Check the response first, so no converter objects are built for a
    # response that will be rejected anyway.
    if r.status_code != 200:
        logger.info(u"error: status code {} in convert_pdf_to_txt".format(
            r.status_code))
        return None

    if not r.encoding:
        r.encoding = "utf-8"

    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    fp = StringIO(r.content_big())
    try:
        pages = PDFPage.get_pages(fp,
                                  set(),
                                  maxpages=max_pages,
                                  password="",
                                  caching=True,
                                  check_extractable=True)
        for page in pages:
            interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        # The original never closed the input buffer.
        fp.close()
        device.close()
        retstr.close()
    # logger.info(text)
    return text
Beispiel #33
0
def parse_pdf(fname, outfile):
    """
    Extract the text of the PDF ``fname`` into the text file ``outfile``.

    The output file is opened in 'w' mode with UTF-8 encoding. Returns None.
    """
    caching = True
    rotation = 0

    rsrcmgr = PDFResourceManager(caching=caching)
    # Both files are managed by with-blocks so neither leaks when a page
    # fails to parse.
    with open(outfile, 'w', encoding='utf-8') as outfp:
        device = TextConverter(rsrcmgr,
                               outfp,
                               laparams=LAParams(),
                               imagewriter=None)
        try:
            with open(fname, 'rb') as fp:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                for page in PDFPage.get_pages(fp,
                                              set(),
                                              maxpages=0,
                                              password=b'',
                                              caching=caching,
                                              check_extractable=True):
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
    return
Beispiel #34
0
def convert_pdf_to_txt(path):
    """
    Return the text extracted from the PDF file at ``path``.

    The text is collected in an ``io.StringIO`` buffer.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # The with-block closes the PDF even when a page fails to parse.
        with open(path, 'rb') as filepath:
            for page in PDFPage.get_pages(filepath,
                                          set(),
                                          maxpages=0,
                                          password='',
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()
    return text
Beispiel #35
0
 def convert_pdf_to_text(self, fp=None):
     """
     Return the text extracted from the PDF file whose path is ``self.a``.

     fp -- accepted for interface compatibility but ignored: the file at
           ``self.a`` is always opened and read.
           NOTE(review): confirm whether callers expect a supplied ``fp`` to
           be used instead.
     """
     rsrcmgr = PDFResourceManager()
     retstr = StringIO()
     device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
     interpreter = PDFPageInterpreter(rsrcmgr, device)

     fp = open(self.a, 'rb')
     try:
         for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True, check_extractable=True):
             interpreter.process_page(page)
         text = retstr.getvalue()
     finally:
         # Release the handles even when a page fails to parse.
         fp.close()
         device.close()
         retstr.close()

     return text
def convert_pdf_to_txt(file):
    """Extract the text of a PDF and save it as a ``.txt`` file.

    The text file is created in the current working directory and is named
    after the PDF, with its extension replaced by ``.txt``.

    Args:
        file: Path of the PDF file to convert.

    Returns:
        The name of the text file that was written.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even when a page fails to parse.
    with open(file, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()

    # Build the output name from the stem whatever the extension looks like;
    # previously anything but a lowercase ".pdf" left the name unassigned and
    # the function crashed when it tried to open the output file.
    stem = os.path.splitext(os.path.basename(file))[0]
    txtFileName = stem + ".txt"

    with open(txtFileName, "w+") as f:
        f.write(text)
    return txtFileName
Beispiel #37
0
def PDF_TO_TEXT(path):
    """Extract the text of every page of the PDF stored at ``path``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer once every page was processed.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Read through the pages; the file is closed even if one of them fails.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()
    return text
def extract_text_from_pdf(pdf_path):
    """
    Lazily extract plain text from a .pdf file, one page at a time.

    :param pdf_path: path to the PDF file to be extracted
    :return: generator producing the extracted text of each page in turn
    """
    with open(pdf_path, 'rb') as pdf_handle:
        pages = PDFPage.get_pages(pdf_handle, caching=True,
                                  check_extractable=True)
        for pdf_page in pages:
            # Every page gets its own manager, buffer and converter.
            manager = PDFResourceManager()
            buffer = io.StringIO()
            text_device = TextConverter(manager, buffer)
            PDFPageInterpreter(manager, text_device).process_page(pdf_page)

            yield buffer.getvalue()

            # Reached only when the consumer asks for the next page.
            text_device.close()
            buffer.close()
def convert(fname, pages=None):
    """Extract text from a PDF file, optionally limited to some pages.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers. When omitted or empty, an
            empty set is handed to ``PDFPage.get_pages``.

    Returns:
        The text accumulated in the output buffer.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # open() replaces the Python 2-only file() builtin, and the context
    # manager closes the handle even if a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    # The call parentheses were missing before, so the buffer stayed open.
    output.close()
    return text
def pdf_to_text(fname, pages=None):
    """Extract text from a PDF file and return it split into lines.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers. When omitted or empty, an
            empty set is handed to ``PDFPage.get_pages``.

    Returns:
        A list with the extracted text split on newline characters.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The context manager closes the PDF even when a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()
    # split() already builds a fresh list; no element-by-element copy needed.
    return text.split('\n')
def pdfparser(filename):
    """Extract the text of every page of a PDF into a byte string.

    Args:
        filename: Path of the PDF file to read.

    Returns:
        The bytes written to the in-memory output buffer by the converter.
    """
    manager = PDFResourceManager()
    output = io.BytesIO()
    converter = TextConverter(manager, output, codec='utf-8',
                              laparams=LAParams())

    # Create a PDF interpreter object.
    interpreter = PDFPageInterpreter(manager, converter)

    # Process each page contained in the document; the context manager
    # closes the file even if one of the pages fails to parse.
    with open(filename, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()

    return text
def convert(fname, pages=None):
    """Extract text from a PDF file and also save it to ``Resume.txt``.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers. When omitted or empty, an
            empty set is handed to ``PDFPage.get_pages``.

    Returns:
        The extracted text. The same text is written, UTF-8 encoded, to
        ``Resume.txt`` in the current working directory.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The context manager closes the PDF even when a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    # The call parentheses were missing before, so the buffer stayed open.
    output.close()

    with open("Resume.txt", "w", encoding="utf-8") as text_file:
        text_file.write(text)
    return text
def convert_pdf_to_txt(path):
    """Extract the text of every page of the PDF stored at ``path``.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer once every page was processed.
    """
    pdf_rsc_manager = PDFResourceManager()
    str_io = StringIO()
    device = TextConverter(pdf_rsc_manager, str_io, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(pdf_rsc_manager, device)

    # The context manager closes the PDF even when a page fails to parse.
    with open(path, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    text = str_io.getvalue()
    device.close()
    str_io.close()
    return text
Beispiel #44
0
def readPDF(pdffile):
    """Extract the text of a PDF and join its non-empty lines with ``<p>``.

    The joined string is also printed to standard output.

    Args:
        pdffile: Path of the PDF file to read.

    Returns:
        The non-empty lines of the extracted text, separated by ``<p>``.
    """
    with open(pdffile, 'rb') as fd:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        process_pdf(rsrcmgr, device, fd)
        device.close()
        content = retstr.getvalue()
        retstr.close()

    # Filter into a new list: removing items from the list being iterated
    # skipped the element after each removal, so runs of blank lines survived.
    lines = [line for line in str(content).split('\n') if line != '']
    # Join directly with the final separator; going through a "===" placeholder
    # also rewrote any "===" that happened to occur in the PDF text itself.
    strs = "<p>".join(lines)
    print(strs)
    return strs


# pdffile='d:/33.pdf'
# readPDF(pdffile)
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF into an in-memory byte buffer.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The contents of the ``BytesIO`` buffer the converter wrote to.
    """
    resourceManager = PDFResourceManager()
    returnstream = BytesIO()
    device = TextConverter(resourceManager, returnstream, codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(resourceManager, device)

    # The context manager closes the PDF even when a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = returnstream.getvalue()
    device.close()
    returnstream.close()
    return text
Beispiel #46
0
    def _convert_pdf_to_txt(self, pdf_path, page_list, codec='utf-8', password="",
                            maxpages=0, caching=True):
        """
        Extract text from a PDF file, optionally restricted to some pages.

        Args:
            pdf_path (str): path of the PDF to be processed
            page_list: indices into the list of loaded pages, processed in
                the order given; ``None`` processes every loaded page
            codec: passed to ``TextConverter``
            password, maxpages, caching: passed to ``PDFPage.get_pages``

        Returns:
            tuple: ``(orig_text, content)`` where ``orig_text`` is the raw
            extracted text and ``content`` is the list of its lines with
            surrounding whitespace stripped and blank lines dropped.
        """
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        device = TextConverter(rsrcmgr, retstr, codec=codec,
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        try:
            with open(pdf_path, 'rb') as fp:
                pages_objs_list = list(
                    PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=maxpages,
                                      password=password,
                                      caching=caching,
                                      check_extractable=True))
                # Identity test: ``!= None`` defers to the argument's own
                # comparison operator, which may not return a plain bool.
                if page_list is not None:
                    pages_objs_list = [pages_objs_list[i] for i in page_list]
                for page in pages_objs_list:
                    interpreter.process_page(page)

            orig_text = retstr.getvalue()
        finally:
            # Release the converter and buffer on failure as well.
            device.close()
            retstr.close()

        content = [line.strip() for line in orig_text.split('\n')
                   if line.strip()]
        return orig_text, content
Beispiel #47
0
    def getPages(self, infile):
        """Convert a PDF file into a text file with the same base path.

        The output path is ``infile`` with its extension replaced by
        ``.txt``; it is printed before the conversion starts.

        Args:
            infile: Path of the PDF file to convert.

        Returns:
            None. The extracted text is written to the output file.
        """
        import os  # local import so the block does not rely on file-level imports

        # Swap only the extension. Replacing every "pdf" substring also
        # rewrote directory names, and left e.g. "x.PDF" unchanged so the
        # output path equalled the input and truncated it.
        outfile = os.path.splitext(infile)[0] + '.txt'
        print(outfile)

        debug = 0
        caching = True
        PDFResourceManager.debug = debug
        PDFPageInterpreter.debug = debug

        rsrcmgr = PDFResourceManager(caching=caching)
        # open() replaces the Python 2-only file() builtin; both handles are
        # now closed even when a page fails to parse.
        with open(outfile, 'w') as outfp:
            # Text converter writing straight into the output file.
            device = TextConverter(rsrcmgr,
                                   outfp,
                                   codec='utf-8',  # output encoding
                                   laparams=LAParams(),
                                   imagewriter=None)
            try:
                # Process the content of every page of the document.
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                with open(infile, 'rb') as fp:
                    for page in PDFPage.get_pages(fp,
                                                  set(),
                                                  maxpages=0,
                                                  password='',
                                                  caching=caching,
                                                  check_extractable=True):
                        interpreter.process_page(page)
            finally:
                device.close()
        return
Beispiel #48
0
def pdf2txt():
    """Print the first 2000 characters of text from the PDF ``filename``.

    ``filename`` is read from the enclosing module scope. At most the first
    10 pages are processed. Names whose extension is not "pdf" (compared
    case-insensitively) are skipped without printing anything.
    """
    if filename.split('.')[-1].lower() != "pdf":
        # Previously this fell through to print() with ``text`` unbound.
        return

    pdfout = StringIO()
    pdfrm = PDFResourceManager()
    converter = TextConverter(pdfrm, pdfout, laparams=LAParams())
    interpreter = PDFPageInterpreter(pdfrm, converter)

    # The context manager closes the PDF even when a page fails to parse.
    with open(filename, 'rb') as infile:
        for index, page in enumerate(PDFPage.get_pages(infile)):
            # Only read the first 10 pages.
            if index > 9:
                break
            interpreter.process_page(page)
    converter.close()
    text = pdfout.getvalue()
    # The call parentheses were missing before, so the buffer stayed open.
    pdfout.close()

    print(text[0:2000])
Beispiel #49
0
def convert_pdf_to_txt(path, page_no=0):
    """Extract the text of a single page of a PDF file.

    Args:
        path: Path of the PDF file.
        page_no: Page number passed to ``PDFPage.get_pages`` as the only
            entry of ``pagenos``. Defaults to 0.

    Returns:
        The extracted text; an empty string when no page was produced.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even when the page fails to parse.
    with open(path, 'rb') as fp:
        # The page selection is fixed when get_pages() is called, so the old
        # ``page_no += 1`` inside the loop had no effect and was dropped.
        for page in PDFPage.get_pages(fp,
                                      pagenos=[page_no],
                                      check_extractable=True):
            interpreter.process_page(page)

    # Reading the buffer once after the loop yields the same text as
    # re-reading it after every page.
    text = retstr.getvalue()
    device.close()
    retstr.close()

    return text
Beispiel #50
0
def convert_pdf_to_txt(path):
    """Return the text of every page of the PDF stored at *path*."""
    manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(manager, buffer, codec='utf-8',
                              laparams=LAParams())
    page_interpreter = PDFPageInterpreter(manager, converter)

    with open(path, 'rb') as pdf_file:
        pages = PDFPage.get_pages(pdf_file, caching=True,
                                  check_extractable=True)
        for pdf_page in pages:
            page_interpreter.process_page(pdf_page)

    # Read the buffer before releasing the converter and the buffer itself.
    extracted = buffer.getvalue()
    converter.close()
    buffer.close()
    return extracted
Beispiel #51
0
def extractPDFText(pdfFilePath):
    """Return the text of the PDF, each page's text followed by one space."""
    with open(pdfFilePath, 'rb') as pdf_handle:
        page_texts = []
        pages = PDFPage.get_pages(pdf_handle, caching=True,
                                  check_extractable=True)
        for pdf_page in pages:
            # Every page gets its own manager, buffer and converter.
            manager = PDFResourceManager()
            buffer = io.StringIO()
            device = TextConverter(manager, buffer)
            PDFPageInterpreter(manager, device).process_page(pdf_page)

            page_texts.append(buffer.getvalue())

            device.close()
            buffer.close()

        return "".join(chunk + " " for chunk in page_texts)
Beispiel #52
0
def extract_text_by_page(pdf_path):
    '''
    Read a PDF document page by page using the pdfminer library.
    :param pdf_path: path of the PDF file to read
    :return: a generator that yields the text of one page at a time
    '''
    with open(pdf_path, 'rb') as pdf_handle:
        page_iter = PDFPage.get_pages(pdf_handle,
                                      caching=True,
                                      check_extractable=True)
        for current_page in page_iter:
            # A fresh manager, buffer and converter are built for each page.
            manager = PDFResourceManager()
            page_buffer = io.StringIO()
            device = TextConverter(manager, page_buffer)
            PDFPageInterpreter(manager, device).process_page(current_page)

            yield page_buffer.getvalue()

            # Reached only when the consumer asks for the next page.
            device.close()
            page_buffer.close()
Beispiel #53
0
def convert_pdf_to_text(path):
    """Extract the text of a PDF, logging instead of raising on failure.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text, or ``None`` when an exception occurred while the
        pages were being processed (the exception type and path are logged).

    Raises:
        Whatever ``open`` raises when ``path`` cannot be opened; only errors
        during page processing are caught.
    """
    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager,
                           return_string,
                           codec='utf-8',
                           laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)
    filename = os.path.basename(path)

    # The file, converter and buffer are now released on the error path too.
    with open(path, 'rb') as file_path:
        try:
            log.info(f'Converting {filename}')
            # check_extractable=False is kept exactly as in the original.
            for page in PDFPage.get_pages(file_path,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=False):
                interpreter.process_page(page)

            return return_string.getvalue()
        except Exception as ex:
            log.error(f'Exception of type {type(ex).__name__} thrown on: {path}')
            # Make the best-effort "no result" explicit for callers.
            return None
        finally:
            device.close()
            return_string.close()
def processPDF(file_path):
    """Extract the text of a PDF, reporting failures instead of raising.

    Each page object and the final text are printed to standard output.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A two-item list ``[error, file_string]``. ``error`` is ``None`` on
        success and the exception's message otherwise; ``file_string`` is
        the extracted text, or ``None`` if it was never produced.
    """
    #TODO: This method is generally too slow to be useful. Needs a rewrite (Add PDF as an allowed format in the SQL
    #TODO: query when done)

    error = None
    file_string = None
    try:
        # open() replaces the Python 2-only file() builtin; the context
        # manager closes the handle even when a page fails to parse.
        with open(file_path, 'rb') as fp:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                                   laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                print(page)
                interpreter.process_page(page)

            file_string = retstr.getvalue()

            print(file_string)

            device.close()
            retstr.close()
    except Exception as e:
        # ``e.message`` does not exist on Python 3 exceptions; str(e) works
        # everywhere and keeps the errno text for I/O errors.
        error = str(e)

    return [error, file_string]
Beispiel #55
0
def convert(fname, pages=None):
    """Extract text from a PDF file, optionally limited to some pages.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers. When omitted or empty, an
            empty set is handed to ``PDFPage.get_pages``.

    Returns:
        The text accumulated in the output buffer.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # open() replaces the Python 2-only file() builtin, and the context
    # manager closes the handle even if a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    # The call parentheses were missing before, so the buffer stayed open.
    output.close()
    return text


#print(convert("sample3.pdf"))
Beispiel #56
0
def pdf2doc(pdfname):
    """Extract the text of a PDF as one whitespace-free string.

    Args:
        pdfname: Path of the PDF file. An empty string, or a path that
            cannot be opened, yields an empty result.

    Returns:
        The extracted text with all whitespace removed, or ``''`` when no
        file name was given or the file could not be opened.
    """
    # No PDF file name given: return an empty string and stop.
    if pdfname == '':
        return ''

    # Open the PDF to process; give up quietly if it cannot be opened.
    # Only open()-related failures are caught, so KeyboardInterrupt and
    # SystemExit are no longer swallowed as the bare ``except`` did.
    try:
        fp = open(pdfname, 'rb')
    except (IOError, OSError, TypeError, ValueError):
        return ''

    # The context manager closes the file even when a page fails to parse.
    with fp:
        # Resource manager instance.
        rsrcmgr = PDFResourceManager()
        # Output destination.
        outfp = StringIO()
        # Layout parameters; lay vertical writing out horizontally.
        laparams = LAParams()
        laparams.detect_vertical = True
        # Initialise the device.
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=laparams)
        # Text-extraction interpreter.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Read the target pages and extract their text (maxpages=0 means all
        # pages). An empty password replaces None, which is not a string.
        for page in PDFPage.get_pages(fp,
                                      pagenos=None,
                                      maxpages=0,
                                      password='',
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
        # Read back everything that was extracted.
        ret = outfp.getvalue()
        # Clean up.
        device.close()
        outfp.close()

    # Strip spaces and line breaks and return the text as one chunk.
    return re.sub(r"\s| ", '', ret)
Beispiel #57
0
def convert_pdf_to_txt(path):
    """Extract the text of every page of a PDF file.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The text collected from all pages, as a single string.
    """
    rsrcmgr = PDFResourceManager()
    retstr = io.StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The context manager closes the PDF even when a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=0,
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    device.close()
    string = retstr.getvalue()
    retstr.close()
    return string
Beispiel #58
0
def main(fname, output_f):
    """Convert the PDF ``fname`` to text and write it to ``output_f``.

    Args:
        fname: Path of the PDF file to read.
        output_f: Path of the text file to create or overwrite.
    """
    # debug option
    debug = 0
    # input option
    password = ''
    pagenos = set()
    maxpages = 0
    # output option
    imagewriter = None
    rotation = 0
    codec = 'utf-8'
    caching = True
    laparams = LAParams()
    #
    PDFDocument.debug = debug
    PDFParser.debug = debug
    CMapDB.debug = debug
    PDFPageInterpreter.debug = debug
    #
    rsrcmgr = PDFResourceManager(caching=caching)
    # Both files are managed by ``with`` so they are closed even when a page
    # fails to parse; the device is closed before the output file is.
    with open(output_f, 'w') as outfp:
        device = TextConverter(rsrcmgr,
                               outfp,
                               codec=codec,
                               laparams=laparams,
                               imagewriter=imagewriter)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            with open(fname, 'rb') as fp:
                for page in PDFPage.get_pages(fp,
                                              pagenos,
                                              maxpages=maxpages,
                                              password=password,
                                              caching=caching,
                                              check_extractable=True):
                    # Apply the extra rotation, wrapping at a full turn.
                    page.rotate = (page.rotate + rotation) % 360
                    interpreter.process_page(page)
        finally:
            device.close()
Beispiel #59
0
    def getChapterContents(self, chapterLink):
        """Return the text of a chapter of the PDF at ``self.filePath``.

        Args:
            chapterLink: ``'ENTIRE_BOOK'``, a single page number such as
                ``'5'``, or an inclusive page range such as ``'3-7'``.

        Returns:
            The extracted text, or an empty string if reading the PDF
            failed (the failure is logged with its traceback).

        Raises:
            ValueError: if a page number in ``chapterLink`` is not an integer.
        """
        log("PdfEBook: Getting chapter contents for %s" % chapterLink)

        # Create the set of pages that we want
        pagesRequired = set()

        # If we want the entire book, then use an empty set
        if chapterLink != 'ENTIRE_BOOK':
            # Check if the pages are a range of pages
            if '-' not in chapterLink:
                pagesRequired.add(int(chapterLink))
            else:
                pageRange = chapterLink.split('-')
                startPage = int(pageRange[0])
                endPage = int(pageRange[1])
                pagesRequired.update(range(startPage, endPage + 1))

        chapterContent = ""

        try:
            output = StringIO()
            manager = PDFResourceManager()
            # An HTMLConverter with the same arguments was the commented-out
            # alternative here.
            converter = TextConverter(manager, output, laparams=LAParams(), showpageno=False)
            interpreter = PDFPageInterpreter(manager, converter)

            # open() replaces the Python 2-only file() builtin; the context
            # manager closes the handle even when a page fails to parse.
            with open(self.filePath, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagesRequired):
                    interpreter.process_page(page)
            converter.close()
            chapterContent = output.getvalue()
            # The call parentheses were missing before.
            output.close()
        except Exception:
            # Best effort: log and fall through with whatever was collected,
            # without swallowing KeyboardInterrupt/SystemExit.
            log("PdfEBook: Failed to read contents for %s in pdf %s with error: %s" % (chapterLink, self.filePath, traceback.format_exc()), xbmc.LOGERROR)

        return chapterContent
Beispiel #60
0
def pdf_to_text(pdfname):
    """Extract the text of every page of the PDF stored at ``pdfname``.

    Args:
        pdfname: Filesystem path of the PDF to read.

    Returns:
        The contents of the output buffer once every page was processed.
    """
    rsrcmgr = PDFResourceManager()  # used to handle interpreter and device
    output = StringIO()  # destination of interpreter processing
    laparams = LAParams()  # params layout
    device = TextConverter(rsrcmgr, output, codec='utf-8', laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text; the context manager closes the file even if a page fails.
    with open(pdfname, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    # Get text from StringIO
    text = output.getvalue()

    # Cleanup
    device.close()
    output.close()

    return text