コード例 #1
0
ファイル: helpers.py プロジェクト: jramosdc/JobBot
def convert(url, pages=None):
    """Download the PDF at ``url`` and return its extracted text.

    Args:
        url: Address of the PDF; must be a string (``basestring``).
        pages: Optional list of page numbers passed to
            ``PDFPage.get_pages``; ``None`` becomes an empty set.

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``'utf-8'``.

    Raises:
        AssertionError: If ``url`` is not a string, or ``pages`` is neither
            ``None`` nor a list.
    """
    assert isinstance(url, basestring)
    assert pages is None or isinstance(pages, list)

    rscmng = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rscmng, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rscmng, device)

    try:
        # Buffer the whole download in memory, then release the connection
        # immediately instead of holding it open while parsing.
        web_page = urllib2.urlopen(urllib2.Request(url))
        try:
            fp = StringIO(web_page.read())
        finally:
            web_page.close()

        try:
            pdf_pages = PDFPage.get_pages(
                fp,
                set(pages if pages is not None else []),
                maxpages=0,
                password='',
                caching=True,
                check_extractable=True
            )
            for page in pdf_pages:
                interpreter.process_page(page)
            return retstr.getvalue()
        finally:
            fp.close()
    finally:
        # Runs on success and on failure, so nothing leaks when parsing raises.
        device.close()
        retstr.close()
コード例 #2
0
ファイル: Parse.py プロジェクト: flahemade/TO52
    def run(path):
        """Parse the PDF at ``path`` into a ``Book`` with one ``Page`` per PDF page.

        Args:
            path: Filesystem path of the PDF to parse.

        Returns:
            A ``Book`` whose ``pages`` list holds a ``Page`` for every PDF
            page, each with its ``text`` attribute set.
        """
        print("Calling parser :%s" % path)

        t0 = time.clock()

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        book = Book()
        try:
            with open(path, 'rb') as fp:
                # Offset in the shared buffer where the current page starts.
                begin_page = 0
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                              caching=True, check_extractable=True):
                    page_tmp = Page()
                    interpreter.process_page(page)
                    so_far = retstr.getvalue()
                    # The slice ends at -1, i.e. the last character written for
                    # this page is dropped (presumably a page separator emitted
                    # by the converter -- TODO confirm).
                    page_tmp.text = so_far[begin_page:-1]
                    begin_page = len(so_far)
                    book.pages.append(page_tmp)
        finally:
            device.close()
            retstr.close()
        print("Parsing in: %s" % (time.clock() - t0))
        return book
コード例 #3
0
    def Parse(self):
        """Load the raw PDF data (from cache when possible) and build the result.

        A pickle cache named ``<pdf basename>.cache`` inside ``parseCacheDir``
        is used when it exists, is non-empty and is newer than the PDF.
        Otherwise the PDF is analysed (``maxpages=1``) and the cache is
        rewritten.

        Returns:
            The tuple ``(self.effectiveFrom, self.__getResult())``.
        """
        # First check whether a cache exists and whether it is recent enough.
        if not os.path.exists(parseCacheDir):
            os.makedirs(parseCacheDir)
        cacheFile = os.path.join(parseCacheDir, os.path.basename(self.pdfFileName) + '.cache')
        foundCache = (os.path.isfile(cacheFile) and
                      os.path.getsize(cacheFile) > 0 and
                      os.path.getmtime(cacheFile) > os.path.getmtime(self.pdfFileName))
        if foundCache:
            # NOTE(review): pickle.load can execute arbitrary code; this is only
            # safe while the cache directory is writable by trusted users only.
            with open(cacheFile, 'rb') as fp:
                self.RawData = pickle.load(fp)
        else:
            with open(self.pdfFileName, 'rb') as fp:
                for page in PDFPage.get_pages(fp, None, maxpages=1):
                    rsrcmgr = PDFResourceManager()
                    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    interpreter.process_page(page)
                    layout = device.get_result()
                    self.__readobj(layout._objs)
                    # layout.bbox[3] is passed as the reference for the Y flip.
                    for category in self.RawData.values():
                        self.__reverseYaxis(category, layout.bbox[3])
                    with open(cacheFile, 'wb') as cacheFp:
                        pickle.dump(self.RawData, cacheFp)

        self.__calculateBoundary()
        self.__assignCharsAndLinesToCell()
        self.__processCells()
        return (self.effectiveFrom, self.__getResult())
コード例 #4
0
ファイル: iocp.py プロジェクト: born2c0de/ioc_parser
    def parse_pdf_pdfminer(self, f, fpath):
        """Extract text page by page from ``f`` and hand each page to ``parse_page``.

        Args:
            f: Open PDF file object given to ``PDFPage.get_pages``.
            fpath: Path of the PDF, forwarded to the handler and ``parse_page``.

        Any exception other than ``KeyboardInterrupt``/``SystemExit`` is
        reported through ``self.handler.print_error`` instead of propagating.
        """
        try:
            layout_params = LAParams()
            layout_params.all_texts = True
            resources = PDFResourceManager()

            if self.dedup:
                self.dedup_store = set()

            self.handler.print_header(fpath)
            all_pages = PDFPage.get_pages(f, set(), check_extractable=True)
            # Page numbers handed to parse_page start at 1.
            for page_num, page in enumerate(all_pages, 1):
                # A fresh buffer and converter per page keeps page texts separate.
                page_buffer = StringIO()
                converter = TextConverter(resources, page_buffer, laparams=layout_params)
                PDFPageInterpreter(resources, converter).process_page(page)
                data = page_buffer.getvalue()
                page_buffer.close()

                self.parse_page(fpath, data, page_num)
            self.handler.print_footer(fpath)
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            self.handler.print_error(fpath, e)
コード例 #5
0
ファイル: util.py プロジェクト: ivoysey/scottbot-int-ex
def pdf_to_text(pdfname):
    """Extract the text of the PDF file ``pdfname`` with PDFMiner.

    Args:
        pdfname: Path of the PDF file to read.

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``'ascii'``.
    """
    from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
    from pdfminer.pdfpage import PDFPage
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams

    from cStringIO import StringIO

    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    sio = StringIO()
    # 'utf-8' was left as a commented-out alternative by the original author.
    device = TextConverter(rsrcmgr, sio, codec='ascii', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Extract text; the with-block closes the file even if a page fails.
    with open(pdfname, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)

    text = sio.getvalue()

    # Cleanup
    device.close()
    sio.close()

    return text
コード例 #6
0
ファイル: buildcorpus.py プロジェクト: jojokarlin/tandemdh
def pdfconvert(infullpath, file, outfullpath, pages=None):         # Handle PDF
    """Write a PDF's text to a ``.txt`` file and render the PDF to ``.jpg``.

    Args:
        infullpath: Path of the PDF to read.
        file: Not used by this function; kept so existing callers still work.
        outfullpath: Output path; its extension is replaced by ``.txt`` for
            the text and ``.jpg`` for the image.
        pages: Optional iterable of page numbers; a falsy value selects the
            empty set that is passed to ``PDFPage.get_pages``.
    """
    import subprocess

    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    with open(infullpath, 'rb') as pdffile:
        for page in PDFPage.get_pages(pdffile, pagenums):
            interpreter.process_page(page)
    converter.close()

    base = os.path.splitext(outfullpath)[0]
    jpgfile = base + '.jpg'
    txtfile = base + '.txt'
    # NOTE(review): the original called string.replace(txtfile, ...) three
    # times and discarded every result, so the name was never sanitised.
    # The calls are dropped rather than "fixed" so output paths do not move.
    text = output.getvalue()
    output.close()  # the original wrote `output.close` without calling it
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list avoids shell quoting problems with unusual file names.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # os.system never raised when ImageMagick was missing; stay best-effort.
        pass
コード例 #7
0
ファイル: scraper.py プロジェクト: lkundrak/bftest
def pdf2xml(infile):
    '''
    Convert the PDF behind the open file handle ``infile`` to an XML string.

    pdfminer performs the conversion; afterwards ``infile`` is closed and
    every newline is stripped from the XML before it is returned.
    '''

    xml_buffer = StringIO()

    # The char_margin value was determined empirically.
    layout = LAParams()
    layout.char_margin = 0.4

    # Setup follows pdfminer's pdf2txt.py
    resources = PDFResourceManager(caching=False)
    converter = XMLConverter(resources, xml_buffer, codec='utf-8', laparams=layout)
    page_interpreter = PDFPageInterpreter(resources, converter)

    if not page_api:
        # Path taken when the module-level ``page_api`` flag is falsy.
        process_pdf(resources, converter, infile, set())
    else:
        for pdf_page in PDFPage.get_pages(infile, set()):
            page_interpreter.process_page(pdf_page)

    infile.close()
    xml_text = xml_buffer.getvalue()
    return xml_text.replace("\n", "")
コード例 #8
0
def extract_text_from_pdf(pdf_filename):
    """
    Function to extract the text from pdf documents using pdfminer

    Parameters:
    -----------
    pdf_filename -- string
        File name of the pdf document as string

    Returns:
    --------
    extracted_text -- string
        Text extracted from pdf as string
    """

    resource_manager = PDFResourceManager()
    return_string = StringIO()
    device = TextConverter(resource_manager, return_string, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(resource_manager, device)

    # The with-block closes the PDF even when a page fails to parse.
    with open(pdf_filename, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set()):
            interpreter.process_page(page)

    device.close()
    extracted_text = return_string.getvalue()
    return_string.close()

    return extracted_text
コード例 #9
0
ファイル: Pdf.py プロジェクト: orangeoval/scotus
    def extract_text(self):
        """Extract the text of ``self.local_file`` and store it in ``self.text``.

        The PDF is read fully into memory, converted with a ``TextConverter``
        (codec ``'utf-8'``) and the resulting bytes are decoded as UTF-8.
        """
        # Read everything up front so the file handle is closed before parsing
        # (the original `file(...).read()` never closed it explicitly).
        with open(self.local_file, 'rb') as pdf_file:
            pdf_stream = io.BytesIO(pdf_file.read())

        resource_manager = PDFResourceManager(caching=True)
        output_stream = io.BytesIO()

        device = TextConverter(
            resource_manager,
            output_stream,
            codec='utf-8',
            laparams=LAParams(),
        )

        interpreter = PDFPageInterpreter(
            resource_manager,
            device,
        )

        pages = PDFPage.get_pages(
            pdf_stream,
            set(),
            maxpages=0,
            caching=True,
            check_extractable=True,
        )

        for page in pages:
            interpreter.process_page(page)

        self.text = output_stream.getvalue().decode('utf8')
コード例 #10
0
ファイル: Tandem0.2.py プロジェクト: jojokarlin/tandemdh
def pdfconvert(infullpath, file, infolder, pages=None):         # Handle PDF
    """Write a PDF's text into the corpus folder and render the PDF to ``.jpg``.

    Args:
        infullpath: Path of the PDF to read.
        file: Base name used for both output files.
        infolder: Prefix (concatenated as-is) for the ``.jpg`` output.
        pages: Optional iterable of page numbers; a falsy value selects the
            empty set that is passed to ``PDFPage.get_pages``.

    Returns:
        The path of the ``.jpg`` file handed to ImageMagick's ``convert``.
    """
    import subprocess

    pagenums = set(pages) if pages else set()
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)
    with open(infullpath, 'rb') as pdffile:
        for page in PDFPage.get_pages(pdffile, pagenums):
            interpreter.process_page(page)
    converter.close()

    jpgfile = infolder + str(file) + '.jpg'
    # corpuspath and corpusfolder are module-level names defined elsewhere.
    txtfile = corpuspath + corpusfolder + '/' + file + '.txt'

    text = output.getvalue()
    output.close()  # the original wrote `output.close` without calling it
    with open(txtfile, 'w') as temp:
        temp.write(text)

    # An argument list avoids shell quoting problems with unusual file names.
    try:
        subprocess.call(['convert', infullpath, jpgfile])
    except OSError:
        # os.system never raised when ImageMagick was missing; stay best-effort.
        pass

    return jpgfile
コード例 #11
0
ファイル: PDF_scraping.py プロジェクト: LoganWalls/pane
def pdf_from_url_to_txt(url, maxpages=0):
    """Download the PDF at ``url`` and return its extracted text.

    Args:
        url: Address of the PDF to download.
        maxpages: Forwarded to ``PDFPage.get_pages`` as ``maxpages``.

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``'utf-8'``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Download the whole document, then close the connection right away
    # (the original never closed the urlopen response).
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        fp = StringIO(response.read())
    finally:
        response.close()

    try:
        for page in PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=maxpages,
                                      password="",
                                      caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)
    finally:
        fp.close()
    device.close()
    # Named `text` rather than `string` to avoid shadowing the stdlib module.
    text = retstr.getvalue()
    retstr.close()
    return text
コード例 #12
0
ファイル: indexMaker.py プロジェクト: young/pdf-index-maker
def get_pdf_text(path):
    """ Reads a pdf file and returns a dict of the text where the
        index represents the page number.
        http://stackoverflow.com/a/20905381

        NOTE(review): the dict is only passed to ``TextConverter`` through the
        ``pages=`` keyword; presumably a customised converter fills it --
        confirm against the TextConverter in use.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # change to utf-8 if the text comes out garbled
    codec = 'ascii'
    pages = {}
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=LAParams(),
                           showpageno=True, pages=pages)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The with-block closes the PDF even when a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    device.close()
    retstr.close()
    return pages
コード例 #13
0
ファイル: application.py プロジェクト: NamiKuro/Readsy
def convert_pdf_to_txt(path):
    """Return the text of up to the first 120 pages of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``'utf-8'``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=120, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    # Cleanup stays best-effort as in the original, but no longer swallows
    # KeyboardInterrupt/SystemExit, and the result no longer shadows `str`.
    try:
        device.close()
        text = retstr.getvalue()
        retstr.close()
    except Exception:
        text = retstr.getvalue()

    return text
コード例 #14
0
ファイル: pdf_to_txt.py プロジェクト: heldergg/dre
def convert_pdf_to_txt(path):
    """
    Converts PDF to text using the pdfminer library

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``"utf-8"``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF even when a page fails to parse.
    with open(path, "rb") as file_handle:
        for page in PDFPage.get_pages(
            file_handle, set(), maxpages=0, password="", caching=True, check_extractable=True
        ):
            interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()
    return text
コード例 #15
0
ファイル: pdf_scanner.py プロジェクト: george-ayris/teach
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path``.

    Adapted from Stack Overflow; see also
    http://www.unixuser.org/~euske/python/pdfminer/programming.html and
    https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``'utf-8'``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()

    # Read text from pages
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    # Named `text` so the builtin `str` is not shadowed.
    text = retstr.getvalue()

    device.close()
    retstr.close()

    return text
コード例 #16
0
ファイル: model.py プロジェクト: muranava/nn_search2
def pdf_read(pdf):
    """
    Use PDFMiner to extract text from pdf file.
    <PDFMiner even though more low-level but pretty good tool to read pdfs>

    Args:
        *pdf* (str) -- path to pdf file

    Returns:
        *text* (str) -- a text extracted from pdf

    """
    # initializing objects
    res_manager = PDFResourceManager()
    strio = StringIO()
    device = TextConverter(res_manager, strio, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(res_manager, device)
    # 'rb' because a PDF is binary; the with-block closes it on any exit path
    with open(pdf, 'rb') as pdf_file:
        for page in PDFPage.get_pages(pdf_file, maxpages=0, password='',
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    # finishing up
    device.close()
    text = strio.getvalue()
    strio.close()
    return text
コード例 #17
0
ファイル: pdf_scanner.py プロジェクト: george-ayris/teach
def get_layout(path):
    """Return the analysed layout of every page of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        A list with one ``PDFPageAggregator.get_result()`` value per page,
        in page order.
    """
    rsrcmgr = PDFResourceManager()
    layout = []
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    # The with-block closes the PDF even when a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
            layout.append(device.get_result())
    device.close()

    return layout
コード例 #18
0
    def convert_pdf_to_txt(self, path):
        """
        A very simple conversion function
        which returns text for parsing from PDF.

        path = The path to the file

        Returns the extracted text, or an empty string when anything fails;
        the failure is reported through ``self.logger.error``.
        """
        try:
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            device = TextConverter(
                rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # The with-block closes the PDF even when a page fails to parse.
            with open(path, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), maxpages=0, password="", caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
            text = retstr.getvalue()
            device.close()
            retstr.close()
            return text
        except Exception as e:
            # Log before returning: the original returned first, which made
            # the logging call unreachable.
            self.logger.error(
                "Failed to PDF to text: " + str(e))
            return ""
コード例 #19
0
def convert_pdf_to_txt(path, output):
    """Extract the text of the PDF at ``path`` and also write it to ``output``.

    Args:
        path: Filesystem path of the PDF.
        output: Path of the file the text is written to (opened in ``'wb'``).

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``'utf-8'``.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-blocks close both files even when an error occurs.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()
    device.close()
    retstr.close()

    with open(output, 'wb') as f:
        f.write(text)
    return text
def pdf2txt(path):
    '''
    Converts a given PDF to plain text in UTF8.

    Returns the extracted text, or None when the conversion fails for any
    reason.
    '''

    try:
        rsrcMgr = PDFResourceManager()
        retStr = StringIO()
        device = TextConverter(rsrcMgr, retStr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcMgr, device)
        # The with-block closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        device.close()
        text = retStr.getvalue()
        retStr.close()

        return text
    except Exception:
        # Still best-effort, but Ctrl-C and SystemExit are no longer swallowed
        # as they were by the original bare `except:`.
        return None
コード例 #21
0
def convert_pdf_to_txt(path):
    """Extract the text of the PDF at ``path`` into a sibling ``.txt`` file.

    The output file name is ``path`` with its extension replaced by ``.txt``;
    that name is printed before the file is written.

    Args:
        path: Filesystem path of the PDF.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec="utf-8", laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF even when a page fails to parse.
    with open(path, "rb") as fp:
        for page in PDFPage.get_pages(
            fp, set(), maxpages=0, password="", caching=True, check_extractable=True
        ):
            interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()

    outputFile = os.path.splitext(path)[0] + ".txt"
    print(outputFile)

    with open(outputFile, "w") as ff:
        ff.write(text)
コード例 #22
0
def convert_pdf_to_text(pdf_path):
    """
    Given a path to a local PDF file, this function extracts text from it.

    Progress ("Page N") is printed while pages are processed. In the returned
    text every newline, tab, carriage return and form feed is replaced by a
    single space.
    """
    resource_manager = PDFResourceManager()
    output = StringIO.StringIO()
    laparams = LAParams(detect_vertical=True)
    device = TextConverter(
        resource_manager,
        output,
        codec='utf-8',
        laparams=laparams
    )
    interpreter = PDFPageInterpreter(resource_manager, device)

    # The original never closed the input file; the with-block does.
    with open(pdf_path, 'rb') as file_handler:
        for idx, page in enumerate(PDFPage.get_pages(file_handler)):
            print("Page " + str(idx + 1), end='\r')
            sys.stdout.flush()
            interpreter.process_page(page)
    print()

    data = output.getvalue()
    for whitespace in ('\n', '\t', '\r', '\x0c'):
        data = data.replace(whitespace, ' ')

    return data
コード例 #23
0
ファイル: functions.py プロジェクト: gitter-badger/RedSparrow
def pdf_to_text(pdf):
    """Extract the text of a PDF, print it, and return it.

    Args:
        pdf: Either a path (``str``), which is opened in binary mode, or an
            already open file object. The file object is closed in both cases.

    Returns:
        The content of the ``BytesIO`` buffer filled by a ``TextConverter``
        created with codec ``'utf-8'``.
    """
    caching = True
    rotation = 0  # extra rotation added to every page before processing

    rsrcmgr = PDFResourceManager(caching=caching)
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    fp = pdf
    if isinstance(pdf, str):
        fp = open(pdf, 'rb')
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        for page in PDFPage.get_pages(fp, set(),
                                      maxpages=0,
                                      caching=caching, check_extractable=True):
            page.rotate = (page.rotate + rotation) % 360
            interpreter.process_page(page)
    finally:
        # Close the input on failure as well as on success.
        fp.close()
    device.close()
    result = retstr.getvalue()
    print(result)
    return result
コード例 #24
0
ファイル: utils_for_tests.py プロジェクト: chaabni/SmartElect
def extract_pdf_page(filename, page_number_or_numbers):
    """Extract the requested pages of the PDF ``filename`` with PDFMiner and
    return them as XML (in utf-8 bytes).

    ``page_number_or_numbers`` is either one page number or an iterable of
    page numbers.
    """
    # Adapted from pdf2txt.py, which ships with PDFMiner. The equivalent
    # command line is:
    #    pdf2txt.py -p 1 -o expected.xml sample.pdf

    # Normalise a single page number into a one-element list.
    page_numbers = (page_number_or_numbers
                    if is_iterable(page_number_or_numbers)
                    else [page_number_or_numbers])

    xml_buffer = StringIO.StringIO()
    layout = LAParams()
    resources = PDFResourceManager()
    converter = XMLConverter(resources, xml_buffer, codec='utf-8', laparams=layout)

    with open(filename, 'rb') as pdf_file:
        page_interpreter = PDFPageInterpreter(resources, converter)
        for pdf_page in PDFPage.get_pages(pdf_file, page_numbers):
            page_interpreter.process_page(pdf_page)

    converter.close()

    result = xml_buffer.getvalue()
    xml_buffer.close()

    return result
コード例 #25
0
ファイル: pdf2txt.py プロジェクト: mvels/biseminar
    def __convert(self, ifile, ofile=None):
        """Convert the PDF ``ifile`` to text using this object's settings.

        Args:
            ifile: Path of the PDF to read.
            ofile: Optional output path. When given, the text is written
                there; when ``None`` it is collected in memory.

        Returns:
            The extracted text when ``ofile`` is ``None``, otherwise ``None``.
            A ``PDFException`` or ``MemoryError`` during extraction is printed
            and whatever was extracted so far is kept.
        """
        with open(ifile, 'rb') as fp:
            outfp = StringIO.StringIO() if ofile is None else open(ofile, 'wb')
            # try/finally so the output is closed on unexpected errors too.
            try:
                rsrcmgr = PDFResourceManager(caching=self.caching)
                device = TextConverter(rsrcmgr, outfp, codec=self.codec, laparams=self.laparams,
                                       imagewriter=self.imagewriter)

                interpreter = PDFPageInterpreter(rsrcmgr, device)
                try:
                    for page in PDFPage.get_pages(fp, self.pagenos,
                                                  maxpages=self.maxpages, password=self.password,
                                                  caching=self.caching, check_extractable=True):
                        page.rotate = (page.rotate + self.rotation) % 360
                        interpreter.process_page(page)
                except (PDFException, MemoryError) as e:
                    print("Could not extract text {0}".format(e))
                device.close()
                # The value is read before the finally clause closes the buffer.
                return outfp.getvalue() if ofile is None else None
            finally:
                outfp.close()
コード例 #26
0
def pdf_to_txt(fichero_pdf, fichero_txt):
    """Convert the PDF ``fichero_pdf`` to text written into ``fichero_txt``.

    Args:
        fichero_pdf: Path of the PDF to read.
        fichero_txt: Path of the text file to create (opened in ``'w'``).

    Returns:
        Always ``1``.
    """
    # Conversion settings
    rotation = 0
    caching = True

    # Set up the resource manager
    rsrcmgr = PDFResourceManager(caching=caching)

    # Create the output file and attach the converter device that fills it.
    # The with-blocks close both files even when a page fails to parse.
    with open(fichero_txt, 'w') as outfp:
        device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams(),
                               imagewriter=None)

        # Interpret every page of the PDF through the device
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        with open(fichero_pdf, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password='',
                                          caching=caching, check_extractable=True):
                page.rotate = (page.rotate + rotation) % 360
                interpreter.process_page(page)

        device.close()

    return 1
コード例 #27
0
ファイル: reader.py プロジェクト: hasiya/pdf_parse
def convert(fname, pages=None):
    """Return the text of the PDF file ``fname``.

    Args:
        fname: Filesystem path of the PDF.
        pages: Optional iterable of page numbers; a falsy value selects the
            empty set that is passed to ``PDFPage.get_pages``.

    Returns:
        The text collected by the ``TextConverter``.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # The with-block closes the PDF even when a page fails to parse.
    with open(fname, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums):
            interpreter.process_page(page)
    converter.close()
    text = output.getvalue()
    output.close()
    return text
コード例 #28
0
ファイル: pdfLib.py プロジェクト: n-witt/EconstorCorpus
    def pdf2txt(self, lowerBorder=-1, upperBorder=-1):
        """
        Returns the plain text of the document. If lowerBorder is an int number > -1, only
        page referring to this number will be returned. If lowerBorder and upperBorder are >-1
        and upperBorder > lowerBoder, the pages referring to that range will be returned.
        If upperBorder is the string "max", every page whose zero-based position is
        >= lowerBorder is returned.

        Raises ValueError when only upperBorder is given or lowerBorder > upperBorder.
        """
        # Resolve the page selection before opening the file, so the
        # ValueError path cannot leak an open file handle.
        if (lowerBorder == -1 and upperBorder == -1) or (lowerBorder > -1 and upperBorder == "max"):
            pagenos = set()
        elif lowerBorder > -1 and upperBorder == -1:
            # extract only a single page
            pagenos = set(range(lowerBorder, lowerBorder + 1))
        elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
            raise ValueError("illegal parameter passed")
        else:
            pagenos = set(range(lowerBorder, upperBorder + 1))

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        with open(self.filename, 'rb') as fp:
            for (pageno, page) in enumerate(PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                                              caching=True, check_extractable=True)):
                # In "max" mode all pages are iterated; skip those before the start.
                if pageno < lowerBorder and upperBorder == "max":
                    continue
                interpreter.process_page(page)
        device.close()
        s = retstr.getvalue()
        retstr.close()
        return s.decode('utf-8')
コード例 #29
0
ファイル: pdf2txt.py プロジェクト: n-witt/DataAnalyzer
def pdf_to_txt(path, lowerBorder=-1, upperBorder=-1):
    """Return the text of the PDF at ``path``, optionally limited to a page range.

    Args:
        path: Filesystem path of the PDF.
        lowerBorder: First page number of the range, or ``-1`` for no range.
        upperBorder: Last page number of the range (inclusive), or ``-1``.

    Returns:
        The extracted text decoded from UTF-8.

    Raises:
        ValueError: If only one border is given or ``lowerBorder`` exceeds
            ``upperBorder``.
    """
    # Validate before opening the file, so the ValueError path cannot leak an
    # open file handle.
    if lowerBorder == -1 and upperBorder == -1:
        pagenos = set()
    elif lowerBorder == -1 or upperBorder == -1 or lowerBorder > upperBorder:
        raise ValueError("illegal parameter passed")
    else:
        pagenos = set(range(lowerBorder, upperBorder + 1))

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, pagenos, maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)
    device.close()
    s = retstr.getvalue()
    retstr.close()
    return s.decode('utf-8')
コード例 #30
0
def convert_pdf_to_txt(path):
    """
    This function converts a .pdf file to text
    @path: file path to .pdf document

    from: http://stackoverflow.com/questions/26494211/
    extracting-text-from-a-pdf-file-using-pdfminer-in-python/26495057#26495057

    Returns the content written by a TextConverter created with codec 'utf-8'.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF even when a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0,
                                      password="", caching=True,
                                      check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()
    return text
コード例 #31
0
ファイル: test.py プロジェクト: cc13ny/System-Projects
def conv_pdf2txt(path, pages=None):
    """Return the text of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF.
        pages: Optional iterable of page numbers; a falsy value selects the
            empty set that is passed to ``PDFPage.get_pages``.

    Returns:
        The content written by a ``TextConverter`` created with codec
        ``'ascii'``.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    codec = 'ascii'  # or 'utf-8'
    device = TextConverter(manager, output, codec=codec, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, device)

    # The with-block closes the PDF even when a page fails to parse.
    with open(path, 'rb') as infile:
        for page in PDFPage.get_pages(infile, pagenums, caching=True, check_extractable=True):
            interpreter.process_page(page)

    txt = output.getvalue()

    device.close()
    output.close()
    return txt
コード例 #32
0
ファイル: utils.py プロジェクト: ShiriBernat/FlyFiles
def convert(fname, pages=None):
    """Return the text of the PDF file ``fname``.

    Args:
        fname: Filesystem path of the PDF.
        pages: Optional iterable of page numbers; a falsy value selects the
            empty set that is passed to ``PDFPage.get_pages``.

    Returns:
        The text collected by the ``TextConverter``.

    Raises:
        ValueError: If pdfminer fails while reading or processing the pages.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    # Opening stays outside the try so a missing file still raises its own
    # error, exactly as before.
    with open(fname, 'rb') as infile:
        try:
            for page in PDFPage.get_pages(infile, pagenums):
                interpreter.process_page(page)
        except Exception:
            # `except Exception` (not bare) so Ctrl-C is not turned into a
            # ValueError.
            raise ValueError('cannot convert pdf to text')
    converter.close()
    text = output.getvalue()
    output.close()  # the original wrote `output.close` without calling it
    return text
コード例 #33
0
def convert_pdf_to_txt(path):
    """Return the text of the PDF at ``path``.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        The text collected by a ``TextConverter`` created without an explicit
        codec.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    # No codec is passed here; the codec='utf-8' variant was disabled by the
    # original author.
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # The with-block closes the PDF even when a page fails to parse.
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True):
            interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()
    return text
コード例 #34
0
def get_text(path):
    """Extract the text of the PDF at ``path`` and split it on ``'. '``.

    Returns:
        A list of the pieces between ``'. '`` separators, or an empty list
        when no text was extracted.
    """
    resources = PDFResourceManager()
    text_buffer = io.StringIO()
    text_converter = TextConverter(resources, text_buffer)
    interpreter = PDFPageInterpreter(resources, text_converter)

    with open(path, 'rb') as pdf_file:
        pdf_pages = PDFPage.get_pages(pdf_file, caching=True,
                                      check_extractable=True)
        for pdf_page in pdf_pages:
            interpreter.process_page(pdf_page)

        extracted = text_buffer.getvalue()

    text_converter.close()
    text_buffer.close()

    # The list to return; it stays empty when nothing was extracted.
    if not extracted:
        return list()
    return extracted.split('. ')
コード例 #35
0
ファイル: app.py プロジェクト: rjeli/docfeed
def get_blurb():
    """Pick a random page of a random PDF under ``/pdfs`` and return a blurb.

    Returns:
        A ``(link, text)`` tuple. ``link`` is the PDF path with ``'pdfs'``
        replaced by ``'view'`` plus a ``#page=<index>`` fragment, where the
        index is the zero-based position of the chosen page. ``text`` is the
        first 100 characters extracted from that page. ``('', '')`` is
        returned when there are no PDFs or the chosen PDF has no pages.
    """
    pdfs = glob.glob('/pdfs/*')
    if not pdfs:
        sys.stderr.write('NO PDFS\n')
        return '', ''
    pdf = random.choice(pdfs)
    sys.stderr.write('pdf: %s\n' % pdf)
    with open(pdf, 'rb') as f:
        parser = PDFParser(f)
        document = PDFDocument(parser)
        assert document.is_extractable
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr,
                               retstr,
                               codec='utf-8',
                               laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        pages = list(PDFPage.get_pages(f))
        if not pages:
            sys.stderr.write('NO PAGES\n')
            return '', ''
        # randrange excludes len(pages); randint(0, len(pages)) is inclusive
        # and could select an index one past the last page.
        pnum = random.randrange(len(pages))
        interpreter.process_page(pages[pnum])
        txt = retstr.getvalue()
    return pdf.replace('pdfs', 'view') + '#page=' + str(pnum), txt[:100]
コード例 #36
0
ファイル: pdftotext.py プロジェクト: danish703/pdfparser
def convertPDFToText(path='test.pdf'):
    """Extract the text of every page of a PDF file.

    Args:
        path: PDF to read. Defaults to ``'test.pdf'``, the file name this
            function was originally hard-wired to.

    Returns:
        The text collected from all pages.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    try:
        # The context manager closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
コード例 #37
0
    def do_import(self, results, filepath):
        """Extract the text of the PDF at ``filepath`` and store it on ``results``.

        The text of each page is followed by a newline; the combined text is
        passed to ``results.investigation.update(import_text=...)``.
        """
        buff = StringIO()

        laparams = LAParams()
        laparams.all_texts = True
        rsrcmgr = PDFResourceManager()

        try:
            # The context manager closes the PDF even when a page fails.
            with open(filepath, 'rb') as fp:
                for page in PDFPage.get_pages(fp, set(), check_extractable=True):
                    device = TextConverter(rsrcmgr, buff, codec='utf-8', laparams=laparams)
                    try:
                        PDFPageInterpreter(rsrcmgr, device).process_page(page)
                    finally:
                        device.close()

                    buff.write("\n")

            results.investigation.update(import_text=buff.getvalue())
        finally:
            buff.close()
コード例 #38
0
def convert_pdf(target_fn):
    """Return the text of the PDF file ``target_fn``, decoded from UTF-8."""
    layout = LAParams()
    layout.all_texts = True
    layout.detect_vertical = True

    manager = PDFResourceManager(caching=True)
    buf = StringIO.StringIO()
    converter = TextConverter(manager, buf, codec='utf-8',
                              laparams=layout, imagewriter=None)
    page_runner = PDFPageInterpreter(manager, converter)

    with open(target_fn, 'rb') as pdf_file:
        for pdf_page in PDFPage.get_pages(pdf_file):
            page_runner.process_page(pdf_page)

    converter.close()

    # Rewind so the whole buffer is read back from the start.
    buf.seek(0)
    return buf.read().decode('utf-8')
コード例 #39
0
def pdfparser(path: str) -> list:
    """
    Parse a pdf file into a list of (x, y, text) tuples, one per text box.

    :param path: pdf file path
    :return: list of tuples ``(bbox[0], bbox[3], text)`` for every
        ``LTTextBox`` found on every page; the string ``'Incorrect file.'``
        when ``allowed_file(path)`` is false
    """
    if not allowed_file(path):
        return 'Incorrect file.'
    rsrcmgr = PDFResourceManager()
    device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    page_content = []
    # The context manager closes the PDF once all pages have been read
    # (the handle was previously left open).
    with open(path, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            interpreter.process_page(page)
            for lobj in device.get_result():
                if isinstance(lobj, LTTextBox):
                    page_content.append(
                        (lobj.bbox[0], lobj.bbox[3], lobj.get_text()))
    return page_content
コード例 #40
0
def pdf_to_text(pdfname):
    """Collect the text of numbered pages of a PDF into a DataFrame.

    Each page's text is reduced to ASCII letters, digits, spaces and
    newlines. A page is considered only when its last whitespace-separated
    token is a digit string longer than four characters. Numbering starts
    the first time such a token has the integer value 1 or 2; from then on
    every considered page is recorded and the running number is incremented.

    Args:
        pdfname: Path of the PDF file.

    Returns:
        A ``pandas.DataFrame`` with columns ``page`` and ``text``; empty when
        no page was recorded.
    """
    # PDFMiner boilerplate
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    i = 0          # running page label; stays 0 until a start page is seen
    flag = False   # True once the start page has been recognised
    rows = []

    with open(pdfname, 'rb') as fp:
        for page in PDFPage.get_pages(fp):
            sio = StringIO()
            device = TextConverter(rsrcmgr, sio, codec='utf-8', laparams=laparams)
            try:
                PDFPageInterpreter(rsrcmgr, device).process_page(page)
                text = re.sub('[^A-Za-z0-9 \n]+', '', sio.getvalue())
            finally:
                # Close per page: closing once after the loop failed with a
                # NameError when the PDF had no pages.
                device.close()
                sio.close()

            tokenized = text.split()
            if tokenized and tokenized[-1].isdigit() and len(tokenized[-1]) > 4:
                page_number = int(tokenized[-1])
                if not flag and page_number in (1, 2):
                    i = page_number
                    flag = True
                if i > 0:
                    rows.append({'page': i, 'text': text})
                    i += 1

    # Build the frame once; DataFrame.append is gone in pandas 2.0.
    return pd.DataFrame(rows, columns=['page', 'text'])
コード例 #41
0
def pdf_extractor3(path, vectors=False):
    """Extract the text of each page of a PDF, optionally with vectors.

    Args:
        path: Path of the PDF file.
        vectors: When true, also compute ``vectorizer(text, lang=detect(text))``
            for every page.

    Returns:
        ``(Classified, creator, paragraph_repo)`` or, when ``vectors`` is
        true, ``(Classified, creator, paragraph_repo, vector)``.
        ``Classified`` is always ``False`` and ``creator`` always
        ``"Unknown"``; ``paragraph_repo`` and ``vector`` map the 1-based page
        number (as a string) to that page's text / vector.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    creator = "Unknown"

    paragraph_repo = {}
    vector = {}
    Classified = False

    try:
        with open(path, 'rb') as fp:
            pages = PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                      caching=True, check_extractable=True)
            for current_page_number, page in enumerate(pages, start=1):
                interpreter.process_page(page)

                text = retstr.getvalue()
                # Rewind before truncating: truncate() leaves the stream
                # position untouched, so without the seek the next page is
                # written after a growing run of NUL padding.
                retstr.seek(0)
                retstr.truncate(0)
                text = re.sub(u'(\u0000)', "", text)
                key = str(current_page_number)
                paragraph_repo[key] = text
                if vectors:
                    vector[key] = vectorizer(text, lang=detect(text))
    finally:
        device.close()
        retstr.close()

    if vectors:
        return Classified, creator, paragraph_repo, vector
    return Classified, creator, paragraph_repo
コード例 #42
0
def get_page_analysis(infile, pageno, pscript5_mode):
    """Run ``TextPositionTracker`` over one page of ``infile``.

    When ``pscript5_mode`` is true, the ascent/descent/height getters of
    ``pdfminer.pdffont.PDFType3Font`` are patched for the duration of the
    page processing.

    Returns whatever ``TextPositionTracker.get_result()`` yields. Raises
    ``EncryptedPdfError`` when pdfminer raises
    ``PDFTextExtractionNotAllowed``.
    """
    resources = pdfminer.pdfinterp.PDFResourceManager(caching=True)
    # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395
    boxes_flow = 2 if pdfminer.__version__ < '20200402' else None
    layout_params = LAParams(
        all_texts=True, detect_vertical=True, boxes_flow=boxes_flow
    )
    tracker = TextPositionTracker(resources, laparams=layout_params)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resources, tracker)

    font_patch = None
    if pscript5_mode:
        overrides = dict(
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        font_patch = patch.multiple(
            'pdfminer.pdffont.PDFType3Font', spec=True, **overrides
        )
        font_patch.start()

    try:
        with Path(infile).open('rb') as pdf_file:
            selected = PDFPage.get_pages(pdf_file, pagenos=[pageno], maxpages=0)
            interpreter.process_page(next(selected))
    except PDFTextExtractionNotAllowed as e:
        raise EncryptedPdfError() from e
    finally:
        # Always undo the font patch, whether or not processing succeeded.
        if font_patch is not None:
            font_patch.stop()

    return tracker.get_result()
コード例 #43
0
ファイル: pdf.py プロジェクト: illc-uva/open-raadsinformatie
def convert(fname, pages=None):
    """Extract text from the PDF ``fname``, skipping pages with a huge media box.

    Args:
        fname: Path of the PDF file.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``; a falsy value passes an empty set.

    Returns:
        The text collected from every processed page.
    """
    pagenums = set(pages) if pages else set()

    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    interpreter = PDFPageInterpreter(manager, converter)

    try:
        # open() replaces the Python-2-only file() builtin, and the context
        # manager closes the PDF even when a page fails to parse.
        with open(fname, 'rb') as infile:
            for page in PDFPage.get_pages(infile, pagenums):
                # For some reason PDFMiner chokes on pages that have a single,
                # but detailed image in them. Work around by skipping pages
                # with a large media box
                mediabox = getattr(page, 'mediabox', [0, 0, 0, 0])
                try:
                    mediabox_pixels = mediabox[2] * mediabox[3]
                except IndexError:
                    mediabox_pixels = 0

                # Single-argument print(...) gives the same output under
                # Python 2 and Python 3.
                if mediabox_pixels <= settings.PDF_MAX_MEDIABOX_PIXELS:
                    print("Processing page %s" % (page, ))
                    interpreter.process_page(page)
                else:
                    print("Skipped page %s" % (page, ))

        converter.close()
        return output.getvalue()
    finally:
        output.close()
コード例 #44
0
def convert_pdf(input_file, format='text', codec='utf-8'):
    """Convert PDF file to text or html.

    Args:
        input_file (str): Input PDF file.
        format (str): Output format, ``'text'`` or ``'html'``.
        codec (str): Codec used to encode the converter output and to decode
            the returned string.

    Returns:
        str: Return text or html from PDF file.

    Raises:
        ValueError: If ``format`` is neither ``'text'`` nor ``'html'``.

    """
    # Validate first: an unknown format used to surface later as an
    # UnboundLocalError on ``converter``.
    if format not in ('text', 'html'):
        raise ValueError(
            "format must be 'text' or 'html', got %r" % (format,))

    manager = PDFResourceManager()
    output = BytesIO()
    converter_cls = TextConverter if format == 'text' else HTMLConverter
    converter = converter_cls(manager,
                              output,
                              codec=codec,
                              laparams=LAParams())

    try:
        try:
            with open(input_file, 'rb') as f1:
                interpreter = PDFPageInterpreter(manager, converter)
                for page in PDFPage.get_pages(f1, caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)
        finally:
            # Close the converter before reading the buffer, as before.
            converter.close()
        # Decode with the codec the converter was given, not the default.
        return output.getvalue().decode(codec)
    finally:
        output.close()
コード例 #45
0
def main(argv):
    """Convert the PDF named by ``argv[1]`` to text.

    The text is written to ``argv[1] + '.txt'``. Nothing is returned.
    """
    # Output file name; only a single document is handled, so only argv[1]
    # is used.
    outfile = argv[1] + '.txt'
    args = [argv[1]]

    debug = 0
    pagenos = set()
    password = ''
    maxpages = 0
    rotation = 0
    codec = 'utf-8'   # output encoding
    caching = True
    imagewriter = None
    laparams = LAParams()
    #
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug

    rsrcmgr = PDFResourceManager(caching=caching)
    # open() replaces the Python-2-only file() builtin; the context managers
    # close both files even when conversion fails.
    with open(outfile, 'w') as outfp:
        # PDF conversion
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams,
                               imagewriter=imagewriter)
        try:
            for fname in args:
                with open(fname, 'rb') as fp:
                    interpreter = PDFPageInterpreter(rsrcmgr, device)
                    # Process the content of every page of the document.
                    for page in PDFPage.get_pages(fp, pagenos,
                                                  maxpages=maxpages,
                                                  password=password,
                                                  caching=caching,
                                                  check_extractable=True):
                        page.rotate = (page.rotate + rotation) % 360
                        interpreter.process_page(page)
        finally:
            device.close()
    return
コード例 #46
0
def parse_pdf(path, print_lines=False):
    """Return the text of the PDF at ``path`` as a list of lines.

    When ``print_lines`` is true every line is also printed, preceded by
    its zero-based index.
    """
    manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(manager, buffer, codec='utf-8',
                              laparams=LAParams())
    with open(path, 'rb') as pdf_file:
        page_runner = PDFPageInterpreter(manager, converter)
        pages = PDFPage.get_pages(pdf_file,
                                  set(),
                                  maxpages=0,
                                  password="",
                                  caching=True,
                                  check_extractable=True)
        for pdf_page in pages:
            page_runner.process_page(pdf_page)
        extracted = buffer.getvalue()

    converter.close()
    buffer.close()

    lines = extracted.split("\n")
    if print_lines:  # print line numbers
        for index, line in enumerate(lines):
            print(index, line)

    return lines
コード例 #47
0
def Data(path):
    """Store the cleaned text of every page of each PDF listed in ``path``.

    For each file, every page is converted on its own, passed through
    ``clean`` and, when the result is non-empty and has more than ten
    whitespace-separated words, inserted with
    ``db_client.local.PDFData.insert_one``. The stored ``keyword`` is the
    lower-cased part of the path before its first underscore.

    Args:
        path: Iterable of PDF file paths.
    """
    for paths in path:
        # Count the pages on a short-lived handle so it is closed right away.
        with open(paths, 'rb') as counted:
            num_of_pages = PyPDF2.PdfFileReader(counted).getNumPages()

        keyword = paths.split('_')[0].lower()

        with open(paths, 'rb') as pdf_file:
            for i in range(num_of_pages):
                manager = PDFResourceManager()
                buffer = StringIO()
                converter = TextConverter(manager,
                                          buffer,
                                          codec='utf-8',
                                          laparams=LAParams())
                interpreter = PDFPageInterpreter(manager, converter)
                try:
                    for page in PDFPage.get_pages(pdf_file,
                                                  {i},
                                                  maxpages=0,
                                                  password="",
                                                  caching=True,
                                                  check_extractable=True):
                        interpreter.process_page(page)
                        text = clean(buffer.getvalue())
                        if text != '' and len(text.split()) > 10:
                            db_client.local.PDFData.insert_one({
                                'data': text,
                                'keyword': keyword
                            })
                finally:
                    converter.close()
                    buffer.close()
コード例 #48
0
def pdf2txt(path):
    """Extract the text of a PDF and save it next to the working directory.

    The text is written to ``<base name>.txt`` in the current working
    directory, where the base name is the PDF's file name without its
    extension. The file name, the text file name and the output path are
    printed along the way.

    Args:
        path: Path of the PDF file.

    Returns:
        The extracted text.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # open() replaces the Python-2-only file() builtin.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        text = retstr.getvalue()
    finally:
        device.close()
        retstr.close()

    # Single-argument print(...) gives the same output under Python 2 and 3.
    name = os.path.basename(path)
    print(name)
    # splitext handles any extension length, not only four characters.
    name_txt = os.path.splitext(name)[0] + '.txt'
    print(name_txt)
    out_path = os.path.join(os.path.normpath(os.getcwd()), name_txt)
    print(out_path)
    with open(os.path.normpath(out_path), 'w') as text_file:
        text_file.write(text)

    return text
コード例 #49
0
ファイル: autoarticle.py プロジェクト: ginger51011/hehe-maker
    def convert_pdf_to_txt(self):
        """Append the text of every PDF in ``self.listings`` to ``self.text``.

        Entries whose path does not end in ``.pdf`` are ignored. A PDF that
        fails to parse is reported on stdout and skipped.
        """
        rsrcmgr = PDFResourceManager()
        retstr = io.StringIO()
        laparams = LAParams()
        device = TextConverter(rsrcmgr, retstr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        password = ""
        maxpages = 0
        caching = True
        pagenos = set()

        for path in list(self.listings):
            try:
                if path.endswith(".pdf"):  # Checks if this is a pdf file
                    with open(path, 'rb') as fp:
                        for page in PDFPage.get_pages(fp,
                                                      pagenos,
                                                      maxpages=maxpages,
                                                      password=password,
                                                      caching=caching,
                                                      check_extractable=True):
                            interpreter.process_page(page)

                    self.text = self.text + " " + retstr.getvalue()
            except Exception:
                print(
                    "Error encountered when trying to parse PDF as text, skipping "
                    + path + "...")
            finally:
                # The buffer is shared by all files: empty it so the next PDF
                # does not re-append the text of earlier ones (or the partial
                # text of one that failed).
                retstr.seek(0)
                retstr.truncate(0)

        device.close()
        retstr.close()
コード例 #50
0
    def extract_text_from_pdf(path, filename):
        """Yield the extracted text of each page of the PDF at ``path + filename``."""
        with open(path+filename, 'rb') as pdf_file:
            pages = PDFPage.get_pages(pdf_file, caching=True, check_extractable=True)
            for pdf_page in pages:
                # Every page gets its own resource manager, buffer and
                # converter.
                manager = PDFResourceManager()
                buffer = io.StringIO()
                device = TextConverter(manager, buffer, codec='utf-8',
                                       laparams=LAParams())

                # Interpret the current page into the buffer.
                PDFPageInterpreter(manager, device).process_page(pdf_page)

                yield buffer.getvalue()

                # Handles are released when the consumer asks for the next
                # page.
                device.close()
                buffer.close()
コード例 #51
0
def pdf_to_txt(path: str) -> list:
    """Read a PDF file, parse it and return its text.

    Only the first page produced by ``PDFPage.get_pages`` is processed.

    Args:
        path (str): Path of the PDF file.

    Returns:
        list: The parsed text of the PDF, split on newlines.
    """

    manager = PDFResourceManager()
    buffer = StringIO()
    layout = LAParams()
    layout.detect_vertical = True  # enabling this extracts the text more cleanly
    converter = TextConverter(manager, buffer, 'utf-8', laparams=layout)
    pdf_file = open(path, 'rb')
    page_runner = PDFPageInterpreter(manager, converter)

    pages = PDFPage.get_pages(pdf_file, set(), maxpages=0, caching=True,
                              check_extractable=True)
    collected = ''
    first_page = next(iter(pages), None)
    if first_page is not None:
        page_runner.process_page(first_page)
        collected += buffer.getvalue()

    pdf_file.close()
    converter.close()
    buffer.close()

    return collected.split('\n')
コード例 #52
0
    def convert(self, fileName):
        """Extract the text of a PDF in the temp directory, then delete the PDF.

        Args:
            fileName: Name of the PDF, resolved against the ``temp`` directory
                located relative to this module's file.

        Returns:
            The text collected from all pages. The PDF is removed only after
            a successful conversion.
        """
        logging.info("PDFConverter.convert STARTS")
        resourceManager = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(resourceManager,
                               retstr,
                               codec='utf-8',
                               laparams=LAParams())

        filename = os.path.abspath(__file__ + '/../../../../../temp/' +
                                   fileName)
        interpreter = PDFPageInterpreter(resourceManager, device)

        try:
            # The context manager closes the PDF even when a page fails.
            with open(filename, "rb") as fp:
                for page in PDFPage.get_pages(fp,
                                              set(),
                                              maxpages=0,
                                              password="",
                                              caching=True,
                                              check_extractable=True):
                    interpreter.process_page(page)

            text = retstr.getvalue()
        finally:
            device.close()
            retstr.close()

        os.remove(filename)
        logging.info("PDFConverter.convert ENDS")
        return text
コード例 #53
0
def extract_from_pdf(file, file_path):
    """Return the text extracted from the PDF at ``file_path``.

    Args:
        file: Not used by this function; kept so existing callers keep
            working.
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages, or an empty string when nothing was extracted.
    """
    resource_manager = PDFResourceManager()
    fake_file_handle = io.StringIO()
    converter = TextConverter(resource_manager, fake_file_handle)
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    try:
        with open(file_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)

        return fake_file_handle.getvalue()
    finally:
        # close open handles
        converter.close()
        fake_file_handle.close()
コード例 #54
0
def extract_text_by_page(pdf_file, password='', page_numbers=None, maxpages=0,
                         caching=True, codec='utf-8', laparams=None):
    """
    Parse and return the text contained in each page of a PDF file. Taken from
    https://github.com/pdfminer/pdfminer.six/blob/master/pdfminer/high_level.py#L90-L123
    and adapted to return the text of each page separately as a dictionary obj.
    :param pdf_file: Either a file path or a file-like object for the PDF file
        to be worked on.
    :param password: For encrypted PDFs, the password to decrypt.
    :param page_numbers: List of zero-indexed page numbers to extract.
    :param maxpages: The maximum number of pages to parse
    :param caching: If resources should be cached
    :param codec: Text decoding codec
    :param laparams: An LAParams object from pdfminer.layout. If None, uses
        some default settings that often work well.
    :return: a dict containing the text from each page (keys = page numbers)
    """
    if laparams is None:
        laparams = LAParams()

    text_by_page = {}

    with open_filename(pdf_file, "rb") as fp:
        rsrcmgr = PDFResourceManager()
        pages_iterable = PDFPage.get_pages(fp, page_numbers, maxpages=maxpages, password=password, caching=caching)
        if page_numbers is None:
            tuples_iterable = enumerate(pages_iterable)
        else:
            # NOTE(review): get_pages is expected to yield each selected page
            # once, in document order, so pair the pages with the sorted
            # unique page numbers; zipping the caller's list directly
            # mislabels pages when it is unsorted or has duplicates.
            tuples_iterable = zip(sorted(set(page_numbers)), pages_iterable)
        for page_num, page in tuples_iterable:
            with StringIO() as output_string:
                device = TextConverter(rsrcmgr, output_string, codec=codec, laparams=laparams)
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                interpreter.process_page(page)
                text_by_page[page_num] = output_string.getvalue()
    return text_by_page
コード例 #55
0
 def get_xml_data(self):
     """Store an XML representation of the PDF and record its font metrics.

     The XML produced by ``XMLConverter`` for ``self.pdffile`` is written to
     ``self.xmlfile``. ``self.font_metrics`` is then rebuilt as a mapping
     from font name to a dict with that font's ``bbox`` and ``descent``;
     fonts lacking one of those attributes have their ``dir()`` printed
     instead.
     """
     rm = PDFResourceManager(caching=True,
                             font_correctors=self.font_correctors)
     rotation = 0
     # The context managers close both files even when conversion fails.
     with open(self.xmlfile, "wb") as outfp:
         device = XMLConverter(rm,
                               outfp,
                               codec="UTF-8",
                               laparams=LAParams(),
                               imagewriter=None)
         interpreter = PDFPageInterpreter(rm, device)
         try:
             with open(self.pdffile, "rb") as infile:
                 for page in PDFPage.get_pages(infile,
                                               set(),
                                               maxpages=0,
                                               password="",
                                               caching=True,
                                               check_extractable=True):
                     page.rotate = (page.rotate + rotation) % 360
                     interpreter.process_page(page)
                 self.font_metrics = {}
                 for font in list(rm._cached_fonts.values()):
                     try:
                         self.font_metrics[font.fontname] = {
                             "bbox": font.bbox,
                             "descent": font.descent
                         }
                     except AttributeError:
                         print((dir(font)))
         finally:
             # Close the device before the output file, as before.
             device.close()
コード例 #56
0
ファイル: utils.py プロジェクト: Rushikesh-1996/Rocket
def get_pdf_formatted_txt(path, pwd):
    """Extract the text from the PDF file.

    Parameters
    ----------
    path  :  string
        Filepath of the PDF.

    pwd  :  string
        Password for the encrypted PDF file.

    Returns
    -------
    text  :  string
        Extracted string from the PDF.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # The context manager closes the PDF even when a page fails to parse
        # (for example when the password is wrong).
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password=pwd,
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
        return retstr.getvalue()
    finally:
        device.close()
        retstr.close()
コード例 #57
0
ファイル: jslib.py プロジェクト: shreea/Tech-Knights
def convert_pdf_to_txt(path):
    """Return lower-cased text fragments and words found in a PDF.

    The extracted text is split on newlines and commas, and duplicate
    fragments are dropped. For every remaining fragment the result holds
    the fragment itself followed by its space-separated pieces, all
    lower-cased. Because the fragments pass through a ``set``, the order in
    which they appear in the result is not defined.

    Args:
        path: Path of the PDF file.

    Returns:
        A list of lower-cased strings.
    """
    rsrcmgr = PDFResourceManager()
    retstr = BytesIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    try:
        # The context manager closes the PDF even when a page fails to parse.
        with open(path, 'rb') as fp:
            for page in PDFPage.get_pages(fp,
                                          set(),
                                          maxpages=0,
                                          password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        raw = retstr.getvalue().decode()
    finally:
        device.close()
        retstr.close()

    fragments = set(re.split("\n|\n\n|,", raw))
    tokens = []
    for fragment in fragments:
        tokens.append(fragment)
        tokens.extend(fragment.split(' '))
    return [token.lower() for token in tokens]
コード例 #58
0
def pdf_to_text(pdfname):
    """Return the text extracted from every page of the PDF ``pdfname``."""
    # PDFMiner boilerplate
    manager = PDFResourceManager()
    buffer = StringIO()
    converter = TextConverter(manager, buffer, laparams=LAParams())
    page_runner = PDFPageInterpreter(manager, converter)

    # Feed every page of the document through the interpreter.
    pdf_file = open(pdfname, 'rb')
    for pdf_page in PDFPage.get_pages(pdf_file):
        page_runner.process_page(pdf_page)
    pdf_file.close()

    # Read the collected text before releasing the buffer.
    extracted = buffer.getvalue()
    converter.close()
    buffer.close()

    return extracted
コード例 #59
0
def request_pdf(url, case_id, court_name):
    """Download a PDF, save it, extract its text, save the text and return it.

    The PDF is stored under ``Data_Files/PDF_Files`` and the extracted text
    under ``Data_Files/Text_Files``, both named
    ``<court_name>_<slugified case_id>``.

    Args:
        url: Address the PDF is fetched from.
        case_id: Case identifier, used in file names and log messages.
        court_name: Court name, used as the file name prefix.

    Returns:
        The extracted text, or the string ``"NULL"`` when the response status
        is not 200, the response contains "no data found", or any exception
        is raised.
    """
    try:
        response = requests.request("GET", url, proxies=proxy_dict)
        if response.status_code != 200:
            logging.error("Failed to get text file for: " + str(case_id))
            return "NULL"

        if "no data found" in response.text.lower():
            logging.error("No data for: " + str(case_id))
            return "NULL"

        base_name = court_name + "_" + slugify(case_id)

        pdf_path = module_directory + "/../Data_Files/PDF_Files/" + base_name + ".pdf"
        # Close (and thereby flush) the PDF before it is read back below;
        # reading it while the write handle was still open could see a
        # truncated file.
        with open(pdf_path, "wb") as fw:
            fw.write(response.content)

        pdf_manager = PDFResourceManager()
        string_io = StringIO()
        pdf_to_text = TextConverter(pdf_manager, string_io, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(pdf_manager, pdf_to_text)
        with open(pdf_path, 'rb') as pdf_file:
            for page in PDFPage.get_pages(pdf_file):
                interpreter.process_page(page)
        text_data = str(string_io.getvalue())

        text_path = module_directory + "/../Data_Files/Text_Files/" + base_name + ".txt"
        with open(text_path, "w") as fw:
            fw.write(text_data)

        return text_data

    except Exception as e:
        logging.error("Failed to get pdf file for: " + str(case_id) + ". Error: %s", e)
        return "NULL"
コード例 #60
0
 def read_pdf(self, path):
     """Return the normalised, lower-cased text of the PDF at ``path``.

     Non-breaking spaces become ordinary spaces, every run of whitespace is
     collapsed to a single space, the character U+F0B7 is removed and the
     result is lower-cased.
     """
     rsrcmgr = PDFResourceManager()
     retstr = io.StringIO()
     device = TextConverter(rsrcmgr, retstr)
     interpreter = PDFPageInterpreter(rsrcmgr, device)
     try:
         # The context manager closes the PDF even when a page fails to parse.
         with open(path, 'rb') as fp:
             for page in PDFPage.get_pages(fp,
                                           set(),
                                           maxpages=0,
                                           password="",
                                           caching=True,
                                           check_extractable=True):
                 interpreter.process_page(page)
         text = retstr.getvalue()
     finally:
         device.close()
         retstr.close()
     text = " ".join(text.replace(u"\xa0", " ").strip().split())
     return text.replace('\uf0b7', '').lower()