def pdf2html(pdfPath, htmlPath):
    '''Convert the PDF at ``pdfPath`` into an HTML file at ``htmlPath``.

    Written after the approach used by pdfminer's ``pdf2txt`` tool.

    :param pdfPath: path of the PDF file to read.
    :param htmlPath: path of the HTML file to create (UTF-8, characters
        that cannot be encoded are dropped).
    :return: None; the result is the file written at ``htmlPath``.
    '''
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    # The input is opened first so that a missing PDF does not leave an
    # empty output file behind; ``with`` closes both files on any exit path.
    with io.open(pdfPath, 'rb') as fp, \
            io.open(htmlPath, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = HTMLConverter(rsrcmgr,
                               outfp,
                               scale=1,
                               layoutmode='normal',  # was misspelled 'noraml'
                               laparams=LAParams(),
                               outdir=None,
                               debug=False)
        try:
            process_pdf(rsrcmgr,
                        device,
                        fp,
                        set(),
                        maxpages=0,
                        password='',
                        caching=caching,
                        check_extractable=True)
        finally:
            # The converter must be closed before the output file so it can
            # finish its output (presumably the closing HTML markup --
            # confirm against the installed pdfminer version).
            device.close()
def convertPDF(pdf_path, codec='ascii'):
    """
    Take the path to a PDF and return the text inside it as a string.

    pdf_path: string indicating path to a .pdf file. Can also be a URL
              starting with 'http', in which case the file is first
              downloaded to 'temp.pdf' in the current directory.
    codec: can be 'ascii', 'utf-8', ...
    returns string of the pdf, as it comes out raw from PDFMiner
    """
    if pdf_path.startswith('http'):
        # Parenthesised single-argument print behaves the same whether or
        # not print is a statement.
        print('first downloading %s ...' % (pdf_path,))
        urllib.urlretrieve(pdf_path, 'temp.pdf')
        pdf_path = 'temp.pdf'

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec=codec,
                               laparams=LAParams())
        try:
            # ``open`` replaces the ``file`` builtin; ``with`` guarantees the
            # handle is released even when pdfminer raises.
            with open(pdf_path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Local renamed from ``str`` so the builtin is no longer shadowed.
        return retstr.getvalue()
    finally:
        retstr.close()
Example #3
0
def read_pdf(pdf):
    """Print the heading lines of selected units found in a PDF.

    The PDF is converted to text, and every line starting with a form feed
    followed by ``UNIT `` is counted as a heading. A heading is printed when
    its running number is one of the selected units.

    :param pdf: open file object for the PDF, passed to ``process_pdf``.
    """
    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(manager, buf, laparams=LAParams())
    process_pdf(manager, converter, pdf)
    converter.close()
    text = buf.getvalue()
    buf.close()

    wanted = (1, 2, 3, 5, 7, 8, 9, 11, 12, 13)
    marker = '\x0cUNIT '
    # Collect all lines, then keep only the heading lines in document order.
    headings = (row for row in str(text).split("\n") if row.startswith(marker))
    for ordinal, heading in enumerate(headings, 1):
        if ordinal in wanted:
            print(heading)
Example #4
0
def pdf_to_text(pdf_string):
    """
    Extract the text of a PDF held in memory.

    :param pdf_string: The PDF file contents.
    :return: A unicode string with the content of the PDF file, or an empty
        unicode string when pdfminer raises ``PDFSyntaxError``.
    """
    rsrcmgr = PDFResourceManager(caching=True)
    output = StringIO.StringIO()
    device = TextConverter(rsrcmgr, output, codec='utf-8',
                           laparams=LAParams())
    try:
        process_pdf(rsrcmgr,
                    device,
                    StringIO.StringIO(pdf_string),
                    set(),
                    check_extractable=False)
    except PDFSyntaxError:
        return u''
    finally:
        # Previously the device was left open on the syntax-error path.
        device.close()

    return output.getvalue().decode('utf-8')
Example #5
0
def pdf_to_html(scraped_pdf_data):
    """Render raw PDF data as HTML and return the markup as a string.

    :param scraped_pdf_data: contents of a PDF file.
    :return: the HTML produced by pdfminer's ``HTMLConverter``.
    """
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.pdfdevice import PDFDevice
    from pdfminer.converter import HTMLConverter
    from pdfminer.layout import LAParams

    import StringIO

    # In-memory input positioned at the start, plus an in-memory output.
    source = StringIO.StringIO(scraped_pdf_data)
    sink = StringIO.StringIO()

    manager = PDFResourceManager()
    layout = LAParams(char_margin=0.5,
                      line_margin=0.5,
                      word_margin=0.3,
                      boxes_flow=0)
    converter = HTMLConverter(manager, sink, layoutmode='normal', scale=2,
                              laparams=layout)
    process_pdf(manager, converter, source)
    converter.close()

    markup = sink.getvalue()
    sink.close()
    source.close()
    return markup
Example #6
0
def read_file():
    """Synchronise the ``whitelist_360`` redis set with the IPs in a PDF.

    The PDF at ``cmd_path + file_name`` is converted to text. Every
    non-empty line that contains a dot and no lowercase ASCII letter is
    added to the set. Members that were in the set before this run but are
    not in the PDF are removed from the set, and each removal is appended to
    the log file ``cmd_path_spider``.
    """
    previous_ips = r.smembers("whitelist_360")

    with open(cmd_path + file_name, "rb") as my_pdf:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        process_pdf(rsrcmgr, device, my_pdf)
        device.close()
        content = retstr.getvalue()
        retstr.close()

    # A set gives O(1) membership tests for the removal pass below.
    current_ips = set()
    for line in str(content).split("\n"):
        # An empty line contains no dot either, so one check covers both.
        if '.' not in line:
            continue
        line = line.strip()
        if re.search('[a-z]', line):
            continue

        # Update redis
        r.sadd("whitelist_360", line)
        current_ips.add(line)

    # ``with`` closes the log even if a redis call raises part-way through.
    with open(cmd_path_spider, "at") as log:
        for ip in previous_ips:
            ip = ip.decode(encoding='utf-8')
            if ip not in current_ips:
                r.srem("whitelist_360", ip)
                log.write(now_time + ": " + "360spider: " + ip + "\n")
Example #7
0
def pdfMine(fp):
    """
        Input: file handle to a PDF file
        Output: a list with one entry per regex match; each entry is the
                matched text split into a list of lines
                (presumably primer name and primer sequence -- confirm
                against the PDF layout)
    """
    str_io = StringIO.StringIO()
    rsrcmgr = PDFResourceManager(caching=True)
    device = TextConverter(rsrcmgr, str_io, codec='utf-8',
                           laparams=LAParams())
    try:
        process_pdf(rsrcmgr,
                    device,
                    fp,
                    pagenos=set(),
                    maxpages=0,
                    password='',
                    caching=True,
                    check_extractable=True)
    finally:
        # The device was previously never closed.
        device.close()

    # Raw string: same pattern, without relying on Python leaving unknown
    # escapes such as ``\s`` untouched. Compiled once, outside the loop.
    pattern = re.compile(r'EA[\s\n\d\.\W]+(\D+[\w\d\s\S]+)\nUMO')
    primers = []
    for block in str_io.getvalue().split('VC00'):
        primers.extend(found.split('\n') for found in pattern.findall(block))
    return primers
Example #8
0
def read_pdf(pdf):
    """Return the first text line of a PDF that contains an ``@``.

    :param pdf: open file object for the PDF, passed to ``process_pdf``.
    :return: the first line containing ``@``; ``None`` when no line does or
        when the conversion fails (in which case 'file error' is printed).
    """
    try:
        # resource manager
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        # device
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        process_pdf(rsrcmgr, device, pdf)
        device.close()
        content = retstr.getvalue()
        retstr.close()

        # Collect all lines. The token-splitting code that used to follow
        # the return was unreachable and has been removed.
        for line in str(content).split("\n"):
            if '@' in line:
                return line
    except Exception:
        # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        print('file error')
    return None
Example #9
0
    def fetch_past_legislator(self, year, chamber, url, name):
        """Download a legislator's PDF, parse it and save a Legislator.

        The PDF text is searched for a "State Senator/Representative" block
        giving the name, party letter and district number. The first match
        is used to build and save the ``Legislator``.

        Raises ``Exception`` (after printing the extracted text) when the
        pattern does not match.
        """
        # Normalised but effectively unused: the name is re-read from the PDF.
        name = re.compile("\s+").sub(' ', name.replace('\n', ''))

        url = "http://www.legislature.mi.gov/%s" % url.replace('../', '')
        with self.urlopen_context(url) as the_pdf:
            sink = StringIO()
            manager = PDFResourceManager(CMapDB())
            converter = TextConverter(manager, sink, codec='ascii',
                                      laparams=LAParams())
            process_pdf(manager, converter, StringIO(the_pdf), set())
            text = sink.getvalue()

            matches = re.findall(r'State\s+(?:Senator|Representative)\n(.*?)\n([R|D]).*?[\n]*(\d+)(?:st|nd|rd|th)', text)
            if not matches:
                print(text)
                raise Exception("Some fragile code broke.")

            # Capture groups, in order: name, party letter, district number.
            name, party, district = matches[0]
            first, middle, last, suffix = self.parse_name(name)
            self.save_legislator(
                Legislator(year, chamber, district, name, first, last,
                           middle, party, suffix=suffix))
0
    def __call__(self, stream):
        """Return the text pdfminer extracts from ``stream``.

        Uses the instance's ``caching``, ``encoding``, ``laparams`` and
        ``password`` settings. When ``normalize_spaces`` is set, every run
        of two or more spaces is collapsed into one.
        """
        buf = StringIO()
        manager = PDFResourceManager(caching=self.caching)
        converter = TextConverter(manager, buf, codec=self.encoding,
                                  laparams=self.laparams)

        # An empty page set with maxpages=0 is what the original passed.
        process_pdf(manager, converter, stream, set(),
                    maxpages=0,
                    password=self.password,
                    caching=self.caching,
                    check_extractable=True)

        extracted = buf.getvalue()
        buf.close()
        if not self.normalize_spaces:
            return extracted
        return re.sub(r'  +', ' ', extracted)
Example #11
0
    def _convert_pdf_to_text(self, password=None):
        """Convert ``self.cvFile`` to a text file in ``self.scratchDir``.

        The output is named after the input (last extension removed)
        followed by the current Unix time and ``.txt``. Its path is stored
        in ``self.cvTextFile``.

        :param password: when given, stored in ``self.cvFilePasswd`` and
            used to open the PDF.
        :return: 0 on completion; pdfminer or I/O errors propagate.
        """
        input_pdf = self.cvFile
        if password is not None:
            self.cvFilePasswd = password

        # Page numbers 0..29 are requested, with maxpages set to match.
        pagenos = range(0, 30)

        laparams = LAParams()
        laparams.all_texts = True
        # NOTE(review): kept from the original; unclear whether LAParams
        # reads a ``showpageno`` attribute -- confirm.
        laparams.showpageno = True

        # Text before the last dot; empty when the name has no dot, which
        # matches the original split(".")/pop()/join behaviour.
        stem = os.path.basename(input_pdf).rpartition(".")[0]
        output_filename = (self.scratchDir + os.path.sep + stem
                           + str(int(time.time())) + r".txt")
        self.cvTextFile = output_filename

        rsrcmgr = PDFResourceManager()
        # Input first, so a missing PDF does not leave an empty output file.
        with open(input_pdf, 'rb') as fp, open(output_filename, 'w') as outfp:
            device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                                   laparams=laparams)
            try:
                process_pdf(rsrcmgr, device, fp, pagenos,
                            maxpages=len(pagenos),
                            password=self.cvFilePasswd,
                            check_extractable=True)
            finally:
                device.close()
        return (0)
Example #12
0
def pdf_to_text(file_pointer):
    """Return the text of the PDF read from ``file_pointer``.

    Debug output is switched off on the pdfminer classes first. The text is
    produced by a ``TextConverter`` using the UTF-8 codec.
    """
    # debug option
    debug = 0
    for component in (CMapDB, PDFResourceManager, PDFDocument, PDFParser,
                      PDFPageInterpreter, PDFDevice):
        component.debug = debug

    layout = LAParams()
    manager = PDFResourceManager()
    sink = StringIO.StringIO()
    converter = TextConverter(manager, sink, codec='utf-8', laparams=layout)

    process_pdf(manager, converter, file_pointer, set(),
                maxpages=0, password='')

    text_string = sink.getvalue()
    sink.close()
    converter.close()
    return text_string
Example #13
0
def convert_pdf(path):
    """Extract, pickle and tokenize the text of the PDF at ``path``.

    The raw text is passed to ``serialize_object`` with 'corpus.pkl'. The
    result of ``tonkenier`` on that text is passed to ``serialize_object``
    with 'tokenized_corpus.pkl' and returned.

    :param path: path of the PDF file.
    :return: whatever ``tonkenier`` returns for the extracted text.
    """
    # Only the names actually used are imported; the unused PDFParser
    # instance and the unused imports were dropped.
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from cStringIO import StringIO

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                           laparams=LAParams())
    try:
        # ``open`` replaces the ``file`` builtin; ``with`` closes the
        # handle even when pdfminer raises.
        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
    finally:
        device.close()

    text_str = retstr.getvalue()
    retstr.close()
    serialize_object(text_str, 'corpus.pkl')
    tokenized_text = tonkenier(text_str)

    serialize_object(tokenized_text, 'tokenized_corpus.pkl')
    return tokenized_text
Example #14
0
 def _convertpdf(self,filename):
     """Convert the PDF at ``filename`` to text.

     Progress messages are printed when ``self.DEBUG`` is true.

     :return: ``(text, success)``; ``success`` is always True because
         errors are not caught here and propagate to the caller.
     """
     success = True

     if self.DEBUG:
         # Parenthesised single-argument print behaves the same whether or
         # not print is a statement.
         print("Converting PDF to text ...")

     rsrcmgr = PDFResourceManager()
     retstr = StringIO()
     device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                            laparams=LAParams())
     try:
         # ``with`` releases the file even when pdfminer raises.
         with open(filename, 'rb') as fp:
             process_pdf(rsrcmgr, device, fp)
     finally:
         device.close()
     pdfstr = retstr.getvalue()
     retstr.close()

     if self.DEBUG:
         print("PDF to text conversion complete.")

     return pdfstr,success
Example #15
0
def read_pdf(pdf):
    """Return every lowercase word of 3 to 15 letters found in a PDF.

    :param pdf: open file object for the PDF, passed to ``process_pdf``.
    :return: list of matched words, in document order, duplicates kept.
    """
    manager = PDFResourceManager()
    buf = StringIO()
    converter = TextConverter(manager, buf, laparams=LAParams())
    process_pdf(manager, converter, pdf)
    converter.close()
    text = buf.getvalue()
    buf.close()

    word_re = re.compile(r'\b[a-z]{3,15}\b')
    found = []
    # Collect all lines and scan each one separately, as before.
    for row in str(text).split("\n"):
        found.extend(word_re.findall(row))
    return found
Example #16
0
def _pdf_to_text(path):
    """Extract ASCII text from the PDF at ``path``.

    :param path: path of the PDF file.
    :return: ``(text, True)`` on success, ``("", False)`` when anything
        goes wrong while reading or converting the file.
    """
    try:
        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        laparams = LAParams()
        laparams.all_texts = True
        device = TextConverter(rsrcmgr, retstr, codec='ascii',
                               laparams=laparams)

        with open(path, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
        device.close()

        # fix the non-utf8 string ...
        result = retstr.getvalue()
        retstr.close()
        # The result tuple used to be built but never returned, so the
        # function always returned None.
        return (result.encode('ascii', 'ignore'), True)

    except Exception:
        # Best effort: a PDF that cannot be converted yields the failure
        # tuple instead of raising.
        return ("", False)
Example #17
0
def to_txt(pdf_path):
    """Return the text of the PDF at ``pdf_path``.

    :param pdf_path: path of the PDF file.
    :return: contents of the output buffer after conversion.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    # ``open`` replaces the ``file`` builtin. The input file and the
    # converter were previously never closed.
    with open(pdf_path, 'rb') as input_:
        try:
            process_pdf(manager, converter, input_)
        finally:
            converter.close()
    return output.getvalue()
Example #18
0
def convertPDF(pdf_path, codec='ascii'):
    """
    Take the path to a PDF and return the text inside it as a string.

    pdf_path: string indicating path to a .pdf file. Can also be a URL
              starting with 'http', in which case the file is first
              downloaded to 'temp.pdf' in the current directory.
    codec: can be 'ascii', 'utf-8', ...
    returns string of the pdf, as it comes out raw from PDFMiner
    """

    if pdf_path.startswith('http'):
        # Parenthesised single-argument print behaves the same whether or
        # not print is a statement.
        print('first downloading %s ...' % (pdf_path, ))
        urllib.urlretrieve(pdf_path, 'temp.pdf')
        pdf_path = 'temp.pdf'

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec=codec,
                               laparams=LAParams())
        try:
            # ``open`` replaces the ``file`` builtin; ``with`` guarantees the
            # handle is released even when pdfminer raises.
            with open(pdf_path, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        # Local renamed from ``str`` so the builtin is no longer shadowed.
        return retstr.getvalue()
    finally:
        retstr.close()
Example #19
0
 def readPDF(self, pdfFile):
     """Return the text of ``pdfFile``, or None when conversion fails.

     The argument is printed and the call then sleeps for five seconds
     before converting. Any exception is printed rather than raised.

     :param pdfFile: object handed to ``process_pdf`` as the PDF input.
     """
     try:
         print(pdfFile)
         time.sleep(5)
         manager = PDFResourceManager()  # shared resource manager
         buf = StringIO()  # receives the converted text
         converter = TextConverter(manager, buf, laparams=LAParams())
         process_pdf(manager, converter, pdfFile)
         converter.close()
         return buf.getvalue()
     except Exception as Ex:
         print(
             "While reading the file , there was an error in the function Readodf as :",
             Ex)
def getPdfContent(pdfFile):
    """Return the text of the PDF at path ``pdfFile``.

    :param pdfFile: path of the PDF file.
    :return: contents of the output buffer after conversion.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    # ``open`` replaces the ``file`` builtin. The input file and the
    # converter were previously never closed.
    with open(pdfFile, 'rb') as input_:
        try:
            process_pdf(manager, converter, input_)
        finally:
            converter.close()
    return output.getvalue()
Example #21
0
def read_file(path):
    """Return the text of a .pdf, .doc or .docx file.

    :param path: path of the document; the extension (compared
        case-insensitively) selects the reader.
    :return: the document text, or ``None`` for any other extension.
    """
    filename, file_extension = os.path.splitext(path)
    # Lower-cased so that e.g. '.PDF' is handled like '.pdf'.
    file_extension = file_extension.lower()
    if file_extension == '.pdf':
        with open(path, "rb") as pdf:
            # resource manager
            rsrcmgr = PDFResourceManager()
            retstr = StringIO()
            # device
            device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
            process_pdf(rsrcmgr, device, pdf)
            device.close()
            content = retstr.getvalue()
            retstr.close()

            return str(content)
    elif file_extension == '.doc':
        word = win32com.client.Dispatch("Word.Application")
        try:
            word.visible = False
            doc = word.Documents.Open(path)
            try:
                return doc.Range().Text
            finally:
                # Close without saving; previously the document stayed open.
                doc.Close(False)
        finally:
            # Previously every call left a Word application running.
            word.Quit()
    elif file_extension == '.docx':
        temp = textract.process(path)
        return temp.decode()
    return None
Example #22
0
def convert(fp):
    """Return the HTML rendering of the PDF read from ``fp``.

    :param fp: open file object for the PDF, passed to ``process_pdf``.
    :return: contents of the ``StringIO2`` buffer after conversion.
    """
    manager = PDFResourceManager(caching=False)
    sink = StringIO2()
    sink.encoding = 'utf-8'
    converter = HTMLConverter(manager, sink,
                              scale=1,
                              layoutmode='normal',
                              laparams=LAParams(),
                              outdir=None,
                              debug=False)

    process_pdf(manager, converter, fp, set(),
                maxpages=0,
                password='',
                caching=False,
                check_extractable=True)
    converter.close()

    return sink.getvalue()
Example #23
0
def pdf_to_text(pdf_file):
    """Return the text of the PDF read from ``pdf_file``.

    :param pdf_file: open file object for the PDF.
    :raises ValueError: "Invalid PDF" when pdfminer raises ``PSEOF``;
        "Bad encryption" when it raises ``PDFEncryptionError``.
    """
    use_cache = True
    options = dict(maxpages=0,
                   password='',
                   caching=use_cache,
                   check_extractable=True)
    layout = LAParams()
    manager = PDFResourceManager(caching=use_cache)

    with io.StringIO() as buf:
        converter = TextConverter(manager, buf, laparams=layout)
        try:
            process_pdf(manager, converter, pdf_file, set(), **options)
        except PSEOF:
            raise ValueError("Invalid PDF")
        except PDFEncryptionError:
            raise ValueError("Bad encryption")
        else:
            return buf.getvalue()
def getPdfContent(pdfFile):
    """Return the text of the PDF at path ``pdfFile``.

    :param pdfFile: path of the PDF file.
    :return: contents of the output buffer after conversion.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    # ``open`` replaces the ``file`` builtin. The input file and the
    # converter were previously never closed.
    with open(pdfFile, 'rb') as input_:
        try:
            process_pdf(manager, converter, input_)
        finally:
            converter.close()
    return output.getvalue()
Example #25
0
    def outToHtml(self, html):
        """Render this object's PDF as HTML into ``html``.

        The PDF is read from the ``mybooks\\`` directory two levels above
        this source file, joined with ``self.__pdf.url``.

        :param html: writable object that receives the HTML output.
        :return: the same ``html`` object.
        """
        BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        MEDIA_ROOT = os.path.join(BASE_DIR,'mybooks\\')

        rsrcmgr = PDFResourceManager(caching=self.caching)
        # ``open`` replaces the ``file`` builtin; ``with`` closes the PDF
        # even when pdfminer raises.
        with open(MEDIA_ROOT + str(self.__pdf.url), 'rb') as pdfFile:
            device = HTMLConverter(rsrcmgr, html, codec=self.codec,
                                   scale=self.scale,
                                   layoutmode=self.layoutmode,
                                   laparams=self.laparams)
            try:
                process_pdf(rsrcmgr, device, pdfFile, self.pagenos,
                            maxpages=self.maxpages, password=self.password,
                            caching=self.caching, check_extractable=True)
            finally:
                # The converter was previously never closed, so it could
                # not finish its output (presumably the closing HTML
                # markup -- confirm against the installed pdfminer).
                device.close()

        return html
Example #26
0
def to_txt(infile: str, outfile: str) -> str:
    """
    Convert a pdf file to txt.

    The output is written as UTF-8; characters that cannot be encoded are
    dropped. Layout analysis uses a word margin of 0 and a line margin of 1.

    :param infile: pdf file path;
    :param outfile: txt file path;
    :return: txt file path;
    """
    caching = True
    rsrcmgr = PDFResourceManager(caching=caching)
    laparams = LAParams()
    laparams.word_margin = 0.0
    laparams.line_margin = 1.0
    # Input first, so a missing PDF does not leave an empty output file;
    # ``with`` closes both files even when pdfminer raises.
    with io.open(infile, 'rb') as fp, \
            io.open(outfile, 'wt', encoding='utf-8', errors='ignore') as outfp:
        device = TextConverter(rsrcmgr, outfp, laparams=laparams)
        try:
            process_pdf(rsrcmgr,
                        device,
                        fp,
                        set(),
                        maxpages=0,
                        password='',
                        caching=caching,
                        check_extractable=True)
        finally:
            device.close()
    return outfile
Example #27
0
def parse_pdf_to_txt(pdf_handle, write_file):
    """Convert a PDF to XML and write it to ``write_file``.

    Despite the name, the output is produced by ``XMLConverter``. Vertical
    text detection is enabled in the layout parameters.

    :param pdf_handle: open file object for the PDF.
    :param write_file: path of the UTF-8 output file to create.
    """
    use_cache = True
    layout = LAParams()
    layout.detect_vertical = True

    # The resource manager holds resources shared during the conversion.
    manager = PDFResourceManager(caching=use_cache)

    print("ready to open out file ........")
    with open(write_file, "wt", encoding='utf-8', errors='ignore') as sink:
        converter = XMLConverter(manager, sink, laparams=layout)
        print("ready to converte pdf to xml ........")
        process_pdf(manager, converter, pdf_handle, set(),
                    maxpages=0,
                    password='',
                    caching=use_cache,
                    check_extractable=True)
        converter.close()
Example #28
0
 def getTexts(self):
     """Extract the text of ``self.fname`` into ``self.text``.

     The text is written to a temporary file using the gb2312 codec and
     read back as raw bytes.

     :return: the string "ok" on success; on failure the caught exception
         object is returned instead of being raised.
     """
     import tempfile

     try:
         caching = True
         rsrcmgr = PDFResourceManager(caching=caching)
         # A unique temp file replaces the fixed 'temppdf.txt' in the
         # working directory, which collided between concurrent calls.
         fd, tmp_path = tempfile.mkstemp(suffix='.txt')
         try:
             with os.fdopen(fd, 'w') as outfp:
                 device = TextConverter(rsrcmgr,
                                        outfp,
                                        codec='gb2312',
                                        laparams=LAParams())
                 try:
                     with open(self.fname, 'rb') as fp:
                         process_pdf(rsrcmgr,
                                     device,
                                     fp,
                                     set(),
                                     maxpages=0,
                                     password='',
                                     caching=caching,
                                     check_extractable=True)
                 finally:
                     device.close()
             with open(tmp_path, 'rb') as infp:
                 self.text = infp.read()
         finally:
             # The temp file is now removed on failure as well.
             os.remove(tmp_path)
         return "ok"
     except Exception as e:
         return e
Example #29
0
    def dump(self, pdffilename):
        """Return the UTF-8 text of the PDF at ``pdffilename``.

        The output buffer, the converter and the input file are all
        released whether or not the conversion succeeds; errors propagate
        to the caller.
        """
        manager = PDFResourceManager()
        sink = cStringIO.StringIO()
        try:
            converter = TextConverter(manager, sink, codec='utf-8',
                                      laparams=LAParams())
            try:
                with open(pdffilename, 'rb') as handle:
                    process_pdf(manager, converter, handle, set(),
                                maxpages=0, password='')
                    # Read the buffer before the cleanup handlers close it.
                    return sink.getvalue()
            finally:
                converter.close()
        finally:
            sink.close()
Example #30
0
	def getTexts(self):
		"""Extract the text of ``self.fname`` into ``self.text``.

		The text is written to a temporary file using the utf-8 codec and
		read back as raw bytes.

		:return: the string "ok" on success; on failure the caught exception
			object is returned instead of being raised.
		"""
		import tempfile

		try:
			caching = True
			rsrcmgr = PDFResourceManager(caching=caching)
			# A unique temp file replaces the fixed 'temppdf.txt' in the
			# working directory, which collided between concurrent calls.
			fd, tmp_path = tempfile.mkstemp(suffix='.txt')
			try:
				with os.fdopen(fd, 'w') as outfp:
					device = TextConverter(rsrcmgr, outfp, codec='utf-8', laparams=LAParams())
					try:
						with open(self.fname, 'rb') as fp:
							process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='', caching=caching, check_extractable=True)
					finally:
						device.close()
				with open(tmp_path, 'rb') as infp:
					self.text = infp.read()
			finally:
				# The temp file is now removed on failure as well.
				os.remove(tmp_path)
			return "ok"
		except Exception as e:
			return e
Example #31
0
    def __call__(self, stream):
        """Return the text pdfminer extracts from ``stream``.

        Uses the instance's ``caching``, ``encoding``, ``laparams`` and
        ``password`` settings. When ``normalize_spaces`` is set, every run
        of two or more spaces is collapsed into one.
        """
        buf = StringIO()
        manager = PDFResourceManager(caching=self.caching)
        converter = TextConverter(manager, buf, codec=self.encoding,
                                  laparams=self.laparams)

        # An empty page set with maxpages=0 is what the original passed.
        process_pdf(manager, converter, stream, set(),
                    maxpages=0,
                    password=self.password,
                    caching=self.caching,
                    check_extractable=True)

        extracted = buf.getvalue()
        buf.close()
        if not self.normalize_spaces:
            return extracted
        return re.sub(r'  +', ' ', extracted)
Example #32
0
def convert(src, des):
    """Convert every PDF under ``src`` to text and print invoice details.

    Files whose names end in ``.pdf`` and do not start with ``._`` are
    converted. The text is written to ``des`` (overwritten for each PDF),
    read back, and the invoice number, date and due amount found in it are
    printed. When ``des`` is falsy the text goes to standard output and no
    invoice details are extracted, because there is nothing to read back.

    :param src: directory tree to walk.
    :param des: path of the text file to (re)write, or a falsy value.
    """
    codec = 'utf-8'
    for root, dirs, files in os.walk(src):
        # Loop variable renamed from ``file`` to stop shadowing the builtin.
        for pdf_name in files:
            if not pdf_name.endswith(".pdf") or pdf_name.startswith("._"):
                continue
            try:
                rsrcmgr = PDFResourceManager(caching=True)
                if des:
                    outfp = open(des, 'wt', encoding=codec, errors='ignore')
                else:
                    outfp = sys.stdout
                try:
                    device = TextConverter(rsrcmgr, outfp)
                    try:
                        with open(os.path.join(root, pdf_name), 'rb') as fp:
                            process_pdf(rsrcmgr, device, fp,
                                        check_extractable=True)
                    finally:
                        device.close()
                finally:
                    # Never close sys.stdout.
                    if des:
                        outfp.close()

                if not des:
                    # Previously open(None) raised here on every file.
                    continue

                # Read back with the codec used for writing, not the
                # platform default.
                with open(des, encoding=codec) as result:
                    test = result.read()
                invoice = find_invoice_number(test)
                date = find_date(test)
                due_amount = find_amount(test)
                print("{ File Name: ", pdf_name, "Invoice Number: ", invoice, "Invoice Date: ", date, "Due Amount: Rs ", due_amount,"}")
            except Exception:
                # Narrowed from a bare ``except:`` so Ctrl-C still stops
                # the walk; one bad PDF does not abort the rest.
                print('An error occured.')
Example #33
0
    def get(self):
        """Respond with the text lines of a book's PDF as a JSONP call.

        Reads the ``ISBN`` and ``callback`` request parameters, looks the
        book up by ISBN, extracts the text of the PDF at ``book.FilePath``
        and writes ``callback({ "content" : [ ...lines... ]})``.
        """
        import json

        self.response.headers['Content-Type'] = 'application/javascript'
        ISBN = self.request.get('ISBN')
        # NOTE(review): ``callback`` is echoed into the response unchecked;
        # consider restricting it to a JavaScript identifier.
        callback = self.request.get('callback')
        book = Books.get_by_key_name(ISBN)

        rsrcmgr = PDFResourceManager()
        retstr = StringIO()
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            # ``open`` replaces the ``file`` builtin; ``with`` closes the
            # PDF even when pdfminer raises.
            with open(book.FilePath, 'rb') as fp:
                process_pdf(rsrcmgr, device, fp)
        finally:
            device.close()
        content = retstr.getvalue()
        retstr.close()

        # json.dumps quotes and escapes each line. The hand-built quoting
        # produced invalid output for lines containing quotes, backslashes
        # or control characters.
        items = ",\n".join(' ' + json.dumps(line)
                           for line in content.split('\n'))
        self.response.write(callback + '({ "content" : [')
        self.response.write(items)
        self.response.write(']})')
Example #34
0
def pdf_to_text(file_pointer):
    """Return the text of the PDF read from ``file_pointer``.

    Debug output is switched off on the pdfminer classes first. The text is
    produced by a ``TextConverter`` using the UTF-8 codec.
    """
    # debug option
    debug = 0
    for component in (CMapDB, PDFResourceManager, PDFDocument, PDFParser,
                      PDFPageInterpreter, PDFDevice):
        component.debug = debug

    layout = LAParams()
    manager = PDFResourceManager()
    sink = StringIO.StringIO()
    converter = TextConverter(manager, sink, codec='utf-8', laparams=layout)

    process_pdf(manager, converter, file_pointer, set(),
                maxpages=0, password='')

    text_string = sink.getvalue()
    sink.close()
    converter.close()
    return text_string
Example #35
0
def main(argv):
    """Print the text extracted from 'test.pdf' in the working directory.

    :param argv: accepted for the usual ``main(argv)`` signature; not used.
    """
    import io

    debug = 0
    caching = True
    CMapDB.debug = debug
    PDFResourceManager.debug = debug
    PDFPageInterpreter.debug = debug
    PDFDevice.debug = debug
    rsrcmgr = PDFResourceManager(caching=caching)

    # A real writable buffer: the original passed an empty ``str`` as the
    # output stream, which has no ``write`` method, and then printed that
    # still-empty string.
    # NOTE(review): assumes the converter writes encoded bytes because a
    # codec is given -- confirm against the installed pdfminer.
    outfp = io.BytesIO()
    device = TextConverter(rsrcmgr, outfp, codec='utf-8',
                           laparams=LAParams())

    fname = "test.pdf"
    try:
        with open(fname, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp, set(), maxpages=0, password='',
                        caching=caching, check_extractable=True)
    finally:
        device.close()

    test = outfp.getvalue()
    outfp.close()
    print(test)
    return
Example #36
0
def pdf2xml(infile):
    '''
    Convert an open PDF file to a single-line XML string.

    pdfminer's XMLConverter does the conversion; all newline characters
    are then stripped from its output. The input handle is closed before
    returning.
    '''
    sink = StringIO()

    # Empirically determined...
    layout = LAParams()
    layout.char_margin = 0.4

    # See pdf2txt.py
    manager = PDFResourceManager(caching=False)
    converter = XMLConverter(manager, sink, codec='utf-8', laparams=layout)
    interpreter = PDFPageInterpreter(manager, converter)

    # ``page_api`` selects between the page-based API and process_pdf.
    if not page_api:
        process_pdf(manager, converter, infile, set())
    else:
        for page in PDFPage.get_pages(infile, set()):
            interpreter.process_page(page)

    # NOTE(review): the converter is not closed here -- confirm whether
    # the closing XML markup is expected in the returned string.
    infile.close()
    xml = sink.getvalue()
    return xml.replace("\n", "")
Example #37
0
def to_txt(pdf_path):
    """Return the text of the PDF at ``pdf_path``.

    :param pdf_path: path of the PDF file.
    :return: contents of the output buffer after conversion.
    """
    output = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, output, laparams=LAParams())
    # The input file and the converter were previously never closed.
    with open(pdf_path, 'rb') as input_:
        try:
            process_pdf(manager, converter, input_)
        finally:
            converter.close()
    return output.getvalue()
Example #38
0
def extract_from_pdf(file_name):
    """Extract a title and keywords from the first page of a PDF.

    The title is made of the lines that precede the first empty line among
    the first five lines of text. If there is no empty line among them, the
    lookup fails and the empty result is returned.

    :param file_name: path of the PDF file.
    :return: ``{"title": str, "keywords": ...}``; on any conversion or
        parsing error the error is passed to ``lError`` and
        ``{"title": "", "keywords": []}`` is returned.
    :raises: errors from opening ``file_name`` propagate unchanged.
    """
    # disable logging, because pdfminer produces a lot of warnings
    logger = logging.getLogger()
    was_disabled = logger.disabled
    logger.disabled = True
    try:
        # ``with`` closes the file on the error path too.
        with open(file_name, "rb") as f:
            try:
                rsrcmgr = PDFResourceManager(caching=True)
                out = io.StringIO()
                device = TextConverter(rsrcmgr, out, laparams=LAParams())
                process_pdf(rsrcmgr, device, f, set(), maxpages=1,
                            check_extractable=True)
                s = unligaturify(str(out.getvalue()))
                out.close()

                # Joining the result of split(" ") with " " was a no-op.
                tt = s.replace("\n", " ").replace("  ", " ")

                # extract title
                tmp = s.split("\n")[0:5]
                idx = tmp.index("")
                title = " ".join(tmp[0:idx])

                return {"title": title.strip(),
                        "keywords": extract_key_words(tt)}
            except Exception as e:
                lError(e)
                return {"title": "", "keywords": []}
    finally:
        # The root logger used to stay disabled after the first call.
        logger.disabled = was_disabled
Example #39
0
def __extract_extra__(request, item_id=None):
    """Extract the text of stored PDFs into ``Item.other_search_text``.

    request: the incoming request; its user must be authenticated.
    item_id: when truthy, only the Item with this id is considered;
             otherwise every Item is.
    returns: an HttpResponse describing the outcome. Processing stops at the
             first item whose extraction raises AssertionError.
    """
    if not request.user.is_authenticated():
        return HttpResponse('Please sign in first')

    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFResourceManager, process_pdf
    from pdfminer.converter import TextConverter
    from cStringIO import StringIO

    laparams = LAParams()
    laparams.char_margin = 1.0
    laparams.line_margin = 0.3
    laparams.word_margin = 0.2
    codec = 'utf-8'
    caching = True

    if item_id:
        all_items = Item.objects.filter(id=item_id)
    else:
        all_items = Item.objects.all()

    # Pre-bind the loop variable so the final response cannot hit an unbound
    # name when the queryset is empty.
    item = None
    for item in all_items:

        # Don't extract if no PDF exists; or if we already have search text
        if not item.pdf_file or item.other_search_text:
            continue

        rsrcmgr = PDFResourceManager(caching=caching)
        outfp = StringIO()
        device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
        fp = item.pdf_file.file
        try:
            process_pdf(rsrcmgr,
                        device,
                        fp,
                        pagenos=set(),
                        maxpages=0,
                        password='',
                        caching=caching,
                        check_extractable=True)
        except AssertionError:
            logger.warning('FAILED in completely PDF index "%s"' % item.title)
            return HttpResponse('FAILED in completely PDF index "%s"' \
                                % item.title)
        else:
            logger.debug('Full PDF index of item "%s"' % item.title)
        finally:
            # Runs on success and on failure alike: whatever text was written
            # before an error is still stored on the item.
            fp.close()
            device.close()
            outfp.seek(0)
            page_text = outfp.read()
            outfp.close()

            item.other_search_text = page_text
            item.save()

    if item is None:
        return HttpResponse('No items found to index')

    # Reports the last item iterated, whether or not it was actually indexed.
    return HttpResponse('Full PDF indexed for item "%s"' % item.title)
Example #40
0
def readPDF(bitfile):
    """Return the text extracted from PDF data held in memory.

    bitfile: the raw PDF content; it is wrapped in a BytesIO for parsing.
    returns: everything TextConverter wrote to the in-memory buffer.
    """
    text_sink = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, text_sink, laparams=LAParams())
    pdf_stream = BytesIO(bitfile)
    process_pdf(manager, converter, pdf_stream)
    return text_sink.getvalue()
Example #41
0
def pdf2text(filename):
    """Return the text collected by TextExtractor for the PDF *filename*.

    filename: path of the PDF file to read.
    returns: the ``text`` attribute of the TextExtractor device after the
             whole document has been processed.
    """
    rsrcmgr = PDFResourceManager()
    device = TextExtractor(rsrcmgr)
    try:
        # `with` releases the file handle even if process_pdf raises.
        with io.open(filename, 'rb') as fp:
            process_pdf(rsrcmgr, device, fp)
    finally:
        device.close()
    return device.text
Example #42
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """Run pdfminer over ``rev`` and return the converter's result.

     rev: the PDF input handed to process_pdf (presumably a file-like
          object -- verify against callers).
     contenttype, arguments: accepted but not used by this method.
     returns: whatever ``UnicodeConverter.read_result()`` yields.
     """
     resources = PDFResourceManager()
     converter = UnicodeConverter(resources, laparams=LAPARAMS)
     try:
         process_pdf(resources, converter, rev)
         result = converter.read_result()
     finally:
         # The converter is closed on success and on failure.
         converter.close()
     return result
Example #43
0
 def __call__(self, rev, contenttype=None, arguments=None):
     """Convert the PDF given as ``rev`` and return the converter's result.

     rev: the PDF input passed straight to process_pdf (presumably a
          file-like object -- verify against callers).
     contenttype, arguments: accepted but ignored here.
     returns: the value of ``UnicodeConverter.read_result()``.
     """
     manager = PDFResourceManager()
     unicode_device = UnicodeConverter(manager, laparams=LAPARAMS)
     try:
         process_pdf(manager, unicode_device, rev)
         output = unicode_device.read_result()
     finally:
         # Always release the device, even when conversion raises.
         unicode_device.close()
     return output
Example #44
0
 def parse(self):
     """Convert ``self.filename`` to HTML and return it as a BeautifulSoup tree.

     The open file, resource manager and converter are kept on the instance
     as ``self.fp``, ``self.rsrcmgr`` and ``self.device``; none of them is
     closed here. The HTML is written to ``self.outfp`` and read back from
     its start.

     NOTE(review): caching, codec, scale, layoutmode, laparams, outdir,
     pagenos, maxpages and password are not defined in this method --
     presumably module-level settings; confirm where they come from.
     """
     self.fp = open(self.filename, 'rb')
     self.rsrcmgr = PDFResourceManager(caching=caching)
     converter_options = dict(codec=codec, scale=scale, layoutmode=layoutmode,
                              laparams=laparams, outdir=outdir)
     self.device = HTMLConverter(self.rsrcmgr, self.outfp, **converter_options)
     process_pdf(self.rsrcmgr, self.device, self.fp, pagenos,
                 maxpages=maxpages, password=password,
                 caching=caching, check_extractable=True)
     # Rewind so the freshly written HTML can be read back.
     self.outfp.seek(0)
     html = "".join(self.outfp.readlines())
     return BeautifulSoup.BeautifulSoup(html)
Example #45
0
 def extract_content(fp, encoding):
     """Return the text of the PDF read from ``fp``, encoded as UTF-8.

     fp: the PDF input handed to process_pdf.
     encoding: accepted but not used; the result is always encoded with
               'utf-8'.
     returns: the extracted text after ``.encode('utf-8')``.
     """
     text_buffer = StringIO()  # not BytesIO
     manager = PDFResourceManager(caching=True)
     converter = TextConverter(manager, text_buffer)
     process_pdf(manager, converter, fp, set())
     converter.close()
     # getvalue() returns the whole buffer regardless of stream position.
     extracted = text_buffer.getvalue()
     return extracted.encode('utf-8')
Example #46
0
def get_pdf_io(pdfname,logger=None):
  """Convert a PDF file to text (like pdf2txt) and return it in a StringIO.

  pdfname: path of the PDF file to read.
  logger:  optional logger; when None, one is created with
           createLog(logname="util"). It is not otherwise used here.
  returns: the StringIO that TextConverter (codec 'utf-8') wrote to,
           rewound to position 0 and left open for the caller.
  """

  if logger is None:
    logger = createLog(logname="util")

  from pdfminer.pdfparser import PDFDocument, PDFParser
  from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, process_pdf
  from pdfminer.pdfdevice import PDFDevice
  from pdfminer.converter import TextConverter
  from pdfminer.cmapdb import CMapDB
  from pdfminer.layout import LAParams

  # debug option
  debug = 0
  # input option
  password = ''
  pagenos = set()
  maxpages = 0
  # output option
  codec = 'utf-8'
  caching = True
  laparams = LAParams()

  # The debug level is set on the classes themselves, so it applies to every
  # instance in the process, not just the ones created below.
  PDFDocument.debug = debug
  PDFParser.debug = debug
  CMapDB.debug = debug
  PDFResourceManager.debug = debug
  PDFPageInterpreter.debug = debug
  PDFDevice.debug = debug

  rsrcmgr = PDFResourceManager(caching=caching)
  outfp = StringIO()
  device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
  # open() rather than the Python-2-only file() builtin.
  fp = open(pdfname, 'rb')
  try:
    process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, password=password,
          caching=caching, check_extractable=True)
  finally:
    # Release the input file and the device even if pdfminer raises.
    fp.close()
    device.close()
  # outfp is deliberately left open: it is the return value.
  outfp.seek(0)
  return outfp
Example #47
0
 def _process(self, fp, device):
     """Run process_pdf on ``fp`` with this object's stored options.

     fp: the PDF input to process.
     device: the pdfminer device that receives the output.

     Page selection, page limit, password and caching come from
     ``self.options``; check_extractable is always True.
     """
     opts = self.options
     process_pdf(self.resmgr, device, fp, opts.pagenos,
                 maxpages=opts.maxpages, password=opts.password,
                 caching=opts.caching, check_extractable=True)
Example #48
0
def pdf2txt(fp, pagenos=None, caching=True, codec = 'utf-8',
            password=''):
    """Return the text of the PDF read from ``fp``.

    fp: the PDF input handed to process_pdf.
    pagenos: set of page numbers passed to process_pdf; None (the default)
             is replaced by a fresh empty set on every call.
    caching: forwarded to PDFResourceManager and process_pdf.
    codec: output codec given to TextConverter.
    password: password forwarded to process_pdf.
    returns: the contents of the cStringIO buffer TextConverter wrote to.
    """
    # A fresh set per call avoids sharing one mutable default between calls.
    if pagenos is None:
        pagenos = set()
    outfp = cStringIO.StringIO()
    laparams = LAParams()
    rsrcmgr = PDFResourceManager(caching=caching)
    device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
    try:
        process_pdf(rsrcmgr, device, fp, pagenos, password=password,
                    caching=caching, check_extractable=True)
    finally:
        device.close()
    return outfp.getvalue()
Example #49
0
	def to_xml(self, filename):
		"""Convert ``<filename>.pdf`` into ``<filename>.xml`` with pdfminer.

		filename: path without extension; ".pdf" is appended for the input
		          and ".xml" for the output.
		returns: None. The XML is written to disk.
		"""
		with open(filename+".pdf", 'rb') as src:
			with open(filename+".xml", 'w') as out:
				rsrc = PDFResourceManager()
				converter = XMLConverter(rsrc, out, codec='utf-8', laparams=LAParams())
				try:
					# NOTE(review): 0 is passed positionally as pagenos;
					# presumably any falsy value means "all pages" -- confirm.
					process_pdf(rsrc, converter, src, 0, maxpages=0, password='')
				finally:
					# close() is actually called now (it was a bare attribute
					# reference). It runs while `out` is still open so that
					# anything the converter emits on close -- presumably the
					# XML footer -- reaches the file.
					converter.close()
Example #50
0
    def run(self):
        """Convert every PDF named in ``self._args`` with one shared device.

        When ``self._outtype`` is unset it defaults to "text"; if the module
        runs as a script and ``self._outfile`` is set, the type is instead
        taken from the file extension (.htm/.html, .xml, .tag).

        Output goes to ``self._outfile`` or sys.stdout when the module runs
        as a script, and to an in-memory buffer otherwise.

        returns: None when run as a script; the buffer contents otherwise.
                 For an unknown output type, the result of ``usage()``.
        """
        as_script = __name__ == "__main__"
        rsrcmgr = PDFResourceManager(caching=self._caching)
        if not self._outtype:
            self._outtype = "text"
            if as_script and self._outfile:
                if self._outfile.endswith((".htm", ".html")):
                    self._outtype = "html"
                elif self._outfile.endswith(".xml"):
                    self._outtype = "xml"
                elif self._outfile.endswith(".tag"):
                    self._outtype = "tag"

        # Only a file opened here is closed here; sys.stdout is left usable.
        owns_outfp = False
        if as_script:
            if self._outfile:
                outfp = open(self._outfile, "w")
                owns_outfp = True
            else:
                outfp = sys.stdout
        else:
            from cStringIO import StringIO

            outfp = StringIO()

        if self._outtype == "text":
            device = TextConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams)
        elif self._outtype == "xml":
            device = XMLConverter(rsrcmgr, outfp, codec=self._codec, laparams=self._laparams)
        elif self._outtype == "html":
            device = HTMLConverter(
                rsrcmgr,
                outfp,
                codec=self._codec,
                scale=self._scale,
                layoutmode=self._layoutmode,
                laparams=self._laparams,
            )
        elif self._outtype == "tag":
            device = TagExtractor(rsrcmgr, outfp, codec=self._codec)
        else:
            # Do not leak the output file that was just opened.
            if owns_outfp:
                outfp.close()
            return usage()

        for fname in self._args:
            fp = open(fname, "rb")
            try:
                process_pdf(
                    rsrcmgr,
                    device,
                    fp,
                    self._pagenos,
                    maxpages=self._maxpages,
                    password=self._password,
                    caching=self._caching,
                    check_extractable=True,
                )
            finally:
                # Release each input even if its conversion raises.
                fp.close()
        device.close()

        if not as_script:
            return outfp.getvalue()
        if owns_outfp:
            outfp.close()
        else:
            # Flush rather than close the process-wide stdout stream.
            outfp.flush()
Example #51
0
    def parse_pdf(self, test_parse=False):
        """
            Parse a PDF and store its text contents as an array of lines

            The PDF at self.source_file is converted to text in
            self.text_file; the lines of that file (minus the last one) are
            loaded into self.file_array. Nothing is returned.

            test_parse: when true, self.text_file is kept on disk;
                        otherwise it is removed after being read.
            raises: DTPOFileError if either file cannot be opened or if the
                    conversion fails for any reason.
        """

        dtpo_log("debug", "parsePDF sourceFile -> '%s'", self.source_file)

        # input options
        pagenos = set()
        maxpages = 0
        # output option
        codec = "utf-8"
        caching = True
        laparams = LAParams()
        laparams.char_margin = 8.0
        laparams.word_margin = 2.0

        rsrcmgr = PDFResourceManager(caching=caching)

        try:
            outfp = open(self.text_file, "w")
        except IOError as io_error:
            raise DTPOFileError(self.text_file, 0, str(io_error))

        try:
            fp = open(self.source_file, "rb")
        except IOError as io_error:
            # Do not leak the output handle opened just above.
            outfp.close()
            raise DTPOFileError(self.source_file, 0, str(io_error))

        device = None
        try:
            device = TextConverter(rsrcmgr, outfp, codec=codec, laparams=laparams)
            process_pdf(rsrcmgr, device, fp, pagenos, maxpages=maxpages, caching=caching, check_extractable=True)

        except PDFException as pdf_error:
            message = "Failed to parse file {0} -> {1}".format(self.source_file, str(pdf_error))
            raise DTPOFileError(self.source_file, 0, message)
        except Exception as exception:
            message = "Failed to parse PDF file Unknown exception {0} - > {1}".format(type(exception), str(exception))
            raise DTPOFileError(self.source_file, 0, message)
        finally:
            # Release every handle on the success path and on error paths.
            fp.close()
            if device is not None:
                device.close()
            outfp.close()

        #   Got the PDF converted = now get it into an array
        with open(self.text_file) as text_file:
            self.file_array = list(text_file)

        #   Remove the last entry - it's always '\x0c'
        if self.file_array:
            del self.file_array[-1]

        #   Remove the outfile
        if not test_parse:
            os.remove(self.text_file)
def readPDF(pdfFile):
    """Return the text pdfminer extracts from the PDF input ``pdfFile``.

    pdfFile: the PDF input handed to process_pdf; it is not closed here.
    returns: the contents of the in-memory buffer TextConverter wrote to.
    """
    text_buffer = StringIO()
    manager = PDFResourceManager()
    converter = TextConverter(manager, text_buffer, laparams=LAParams())

    process_pdf(manager, converter, pdfFile)
    converter.close()

    # Grab the text before the buffer is released.
    extracted = text_buffer.getvalue()
    text_buffer.close()
    return extracted
Example #53
0
 def _process(self, fp, device):
     """Feed ``fp`` through process_pdf using the options stored on self.

     fp: the PDF input to process.
     device: the pdfminer device that receives the output.

     pagenos, maxpages, password and caching are read from
     ``self.options``; check_extractable is always True.
     """
     settings = self.options
     process_pdf(
         self.resmgr,
         device,
         fp,
         settings.pagenos,
         maxpages=settings.maxpages,
         password=settings.password,
         caching=settings.caching,
         check_extractable=True,
     )
Example #54
0
def pdf_text(filename):
    """Return the text of at most the first page of the PDF *filename*.

    filename: path of the PDF file to read.
    returns: the extracted text, or "" when pdfminer raises PDFSyntaxError,
             PDFTextExtractionNotAllowed or PSEOF. Other errors (for
             example a missing file) propagate to the caller.
    """
    try:
        text = io.StringIO()
        rsrc = PDFResourceManager()
        device = TextConverter(rsrc, text, laparams=LAParams())
        # `with` closes the PDF handle on success and on failure.
        with open(filename, 'rb') as pdf_file:
            try:
                process_pdf(rsrc, device, pdf_file, None, maxpages=1, password='')
            finally:
                device.close()
        return text.getvalue()
    except (PDFSyntaxError, PDFTextExtractionNotAllowed, PSEOF):
        return ""
Example #55
0
def decodepdf(fp, debug = False):
    """Return the text pdfminer extracts from the PDF input ``fp``.

    fp: the PDF input handed to process_pdf.
    debug: when true, print a timestamped line before and after processing.
    returns: the contents of the in-memory buffer TextConverter wrote to.

    Log records of level WARNING and below are suppressed process-wide while
    the PDF is processed; the suppression is lifted again afterwards, also
    when processing raises.
    """
    with StringIO() as outfp:
        rsrcmgr = PDFResourceManager()
        device = TextConverter(rsrcmgr, outfp)
        logging.disable(logging.WARNING)
        try:
            if debug: print("processing pdf begin ({0})".format(timestr()))
            process_pdf(rsrcmgr, device, fp)
            if debug: print("processing pdf ended ({0})".format(timestr()))
        finally:
            # Never leave logging globally disabled after an error.
            logging.disable(logging.NOTSET)
        return outfp.getvalue()
def get_pdf_content(path):
    """Return the text of the PDF at *path*, converted with codec cp1252.

    path: filesystem path of the PDF to read.
    returns: whatever TextConverter wrote to the buffer. If pdfminer raises
             PDFSyntaxError or PDFTextExtractionNotAllowed, a message is
             printed and the text gathered so far is returned.
    """
    laparams = LAParams()
    rsrc = PDFResourceManager()
    outfp = StringIO()
    try:
        #TODO: detect the encoding of the PDF
        device = TextConverter(rsrc, outfp, codec="cp1252", laparams=laparams)
        # Explicit binary mode, and the handle is closed when the block ends.
        with open(path, 'rb') as pdf_file:
            process_pdf(rsrc, device, pdf_file)
    except (PDFSyntaxError, PDFTextExtractionNotAllowed):
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("Error processing PDF file: " + path)
    return outfp.getvalue()
Example #57
0
 def convert(data):
     """Return the text pdfminer extracts from in-memory PDF ``data``.

     data: the PDF content; it is wrapped in a StringIO for parsing.
     returns: the full contents of the buffer TextConverter wrote to.
     """
     from pdfminer.pdfinterp import PDFResourceManager, process_pdf
     from pdfminer.converter import TextConverter
     from StringIO import StringIO
     source = StringIO(data)
     extracted = StringIO()
     manager = PDFResourceManager()
     process_pdf(manager, TextConverter(manager, extracted), source)
     # Rewind, then read everything that was written.
     extracted.seek(0)
     return extracted.read()