Esempio n. 1
0
def process_image(content, language, noOCR=False, despeckle=False):
  """Run OCR on a single image and return the recognized text.

  The image data is piped into ``convert`` to produce ``out.png`` inside a
  freshly created temporary folder, ``tesseract`` is run on that PNG, and the
  resulting ``out.txt`` is read back and decoded as UTF-8. The temporary
  folder is always removed before returning, even when a step fails.

  :param content: image data fed to ``convert`` on stdin.
  :param language: language code handed to tesseract via ``-l``.
  :param noOCR: when true, log an error and return ``None`` without running
      any external command.
  :param despeckle: accepted but not used by this function.
      NOTE(review): ``process_pdf`` switches its convert options on this
      flag; confirm whether images are meant to honor it as well.
  :returns: the recognized text with surrounding whitespace stripped, or
      ``None`` when OCR is disabled or no text was found.
  """
  if noOCR:
    logging.error("OCR disabled, no text available")
    return None
  import shutil
  from sh import tesseract, convert

  tmpFolder = tempfile.mkdtemp(prefix='imap-dms-ocr-tmp')
  try:
    pngPath = tmpFolder + "/out.png"

    logging.debug("Converting image in tmpfolder %s", tmpFolder)
    convert(convert_options, "-", pngPath, _in=content, _in_bufsize=10000)

    logging.debug("Running tesseract with language %s on file %s",
        language, pngPath)
    # tesseract is given the base name "out" and the text is read back
    # from "out.txt" below.
    tesseract(pngPath, tmpFolder + "/out", "-l", language)

    with open(tmpFolder + "/out.txt", "r") as f:
      text = unicode(f.read(), "utf-8")
  finally:
    # Best-effort cleanup: a failure to delete the scratch folder must not
    # mask an OCR error or discard an otherwise successful result.
    shutil.rmtree(tmpFolder, ignore_errors=True)

  logging.debug("Found %d chars for this page", len(text))

  text = text.strip()
  if not text:
    return None
  return text
    def crack(self,
              dilateiter=4,
              erodeiter=4,
              threshold=200,
              size=(155, 55),
              whitelist_chars=string.ascii_lowercase):
        """Preprocess ``self.image`` and return tesseract's reading of it.

        The image is resized to six times its width and height, passed
        through ``dilateImage``, ``erodeImage`` and ``thresholdImage``,
        resized again to ``size``, saved as a temporary JPEG and handed to
        tesseract with a character whitelist.

        :param dilateiter: value forwarded to ``dilateImage``.
        :param erodeiter: value forwarded to ``erodeImage``.
        :param threshold: value forwarded to ``thresholdImage`` together
            with ``cv.CV_THRESH_BINARY``.
        :param size: target size for the final ``resizeImage`` call.
        :param whitelist_chars: the characters to recognize.
        :returns: the contents of tesseract's text output as ``str``.
        """
        import os
        from tempfile import NamedTemporaryFile

        resized = resizeImage(self.image,
                              (self.image.width * 6, self.image.height * 6))

        dilateImage(resized, dilateiter)
        erodeImage(resized, erodeiter)
        thresholdImage(resized, threshold, cv.CV_THRESH_BINARY)

        resized = resizeImage(resized, size)

        # Context managers close (and thereby delete) both temporary files
        # deterministically instead of relying on garbage collection.
        with NamedTemporaryFile(suffix='.jpg') as temp_img_file, \
                NamedTemporaryFile() as temp_solution_file:
            # The result is read from "<output base>.txt", a separate file
            # that the temp-file wrapper does not know about.
            solution_path = temp_solution_file.name + '.txt'
            cv.SaveImage(temp_img_file.name, resized)
            try:
                # Call the tesseract engine
                tesseract(temp_img_file.name, temp_solution_file.name, '-c',
                          'tessedit_char_whitelist=' + whitelist_chars)
                return str(cat(solution_path))
            finally:
                # Remove the output even when reading it fails, but do not
                # raise a second error if tesseract never created it.
                if os.path.exists(solution_path):
                    os.unlink(solution_path)
Esempio n. 3
0
def process_pdf(content, language, noOCR=False, noPDFText=False, despeckle=False):
  """Extract text from a PDF via ``pdftotext`` and/or per-page OCR.

  Unless ``noPDFText`` is set, the PDF is piped through ``pdftotext``.
  Unless ``noOCR`` is set, ``identify`` is asked for the page numbers and
  each page is rendered to a PNG with ``convert`` and read with
  ``tesseract``; the OCR text is appended to the pdftotext output,
  separated by three newlines.

  :param content: PDF data fed to the external commands on stdin.
  :param language: language code handed to tesseract via ``-l``.
  :param noOCR: when true, skip the OCR pass.
  :param noPDFText: when true, skip the ``pdftotext`` pass.
  :param despeckle: when true, ``convert`` is called with
      ``convert_options_despeckle`` instead of ``convert_options``.
  :returns: the combined text, or ``None`` (after logging an error) when
      the result contains only whitespace.
  """
  if noPDFText:
    logging.debug("pdftotext disabled")
    pdfText = ""
  else:
    from sh import pdftotext
    logging.debug("Extracting pdf contents using pdftotext")
    pdfText = unicode(pdftotext('-', '-', _in=content, _in_bufsize=10000))
    logging.debug("Extracted %d chars from the text", len(pdfText))

  if noOCR:
    logging.debug("OCR disabled, returning only pdf text")
  else:
    from sh import identify, tesseract, convert

    logging.debug("Starting OCR Operation")
    logging.debug("Extracing page numbers")

    identifyOutput = str(identify("-format", "%p ", "pdf:-",
        _in=content, _in_bufsize=10000))
    # split() with no argument tolerates empty output and repeated
    # whitespace, where split(' ') would yield '' and make int() raise.
    pageNos = [int(token) for token in identifyOutput.split()]
    logging.debug("Found pages: %s", pageNos)

    # The option set does not depend on the page, so pick it once.
    co = convert_options if not despeckle else convert_options_despeckle

    pageTexts = []
    for pageNo in pageNos:
      logging.debug("Processing page %d", pageNo)

      tmpFolder = tempfile.mkdtemp(prefix='imap-dms-ocr-tmp')
      try:
        pngPath = tmpFolder + "/out.png"

        logging.debug("Converting page to image in tmpfolder %s with options %s", tmpFolder, co)
        convert(co, "pdf:-[%d]" % (pageNo), pngPath,
             _in=content, _in_bufsize=10000)

        logging.debug("Running tesseract with language %s on file %s",
            language, pngPath)
        tesseract(pngPath, tmpFolder + "/out", "-l", language)

        with open(tmpFolder + "/out.txt", "r") as f:
          pageContent = unicode(f.read(), "utf-8")
      finally:
        # Remove the scratch folder even if one of the steps above raised,
        # so failed pages do not leave temp directories behind.
        shutil.rmtree(tmpFolder)

      logging.debug("Found %d chars for this page", len(pageContent))
      pageTexts.append(pageContent)

    allPages = u"\n".join(pageTexts)
    pdfText = pdfText.strip() + "\n\n\n" + allPages.strip()

  if not pdfText.strip():
    logging.error("No text could be recognized")
    return None
  return pdfText
Esempio n. 4
0
def create_tiff_and_txt(pnms, language):
    """Turn each PNM file into a TIFF plus a tesseract text output.

    Files are handled in sorted order. Each one is passed to ``unpaper``
    (output name is the input name prefixed with ``_``), then ``convert``
    produces a ``.tiff``, and ``tesseract`` is run on that TIFF. Inputs and
    intermediates are deleted once the next stage's output exists on disk.

    :param pnms: iterable of PNM file names.
    :param language: language passed to tesseract via ``-l``; a falsy value
        selects ``DEFAULT_LANGUAGE``.
    :returns: tuple ``(tiffs, texts)`` listing the TIFF paths and the
        matching ``.txt`` paths, for inputs whose TIFF was created.
    """
    if not language:
        language = DEFAULT_LANGUAGE

    tiffs, texts = [], []
    for source in sorted(pnms):
        cleaned = '_' + source
        unpaper(source, cleaned)
        # Drop the raw scan only once the cleaned copy is really there.
        if os.path.exists(cleaned):
            os.remove(source)

        tiff_path = cleaned.replace('.pnm', '.tiff')
        convert(cleaned, tiff_path)
        if not os.path.exists(tiff_path):
            # No TIFF was produced: keep the intermediate, skip OCR.
            continue

        os.remove(cleaned)
        output_base = tiff_path.replace('.tiff', '')
        tiffs.append(tiff_path)
        texts.append(output_base + '.txt')
        tesseract(tiff_path, output_base, '-l', language)

    return tiffs, texts
 def crack(self, dilateiter=4, erodeiter=4, threshold=200, size=(155, 55),
           whitelist_chars=string.ascii_lowercase):
     """Preprocess ``self.image`` and return tesseract's reading of it.

     The image is resized to six times its width and height, passed through
     ``dilateImage``, ``erodeImage`` and ``thresholdImage``, resized again
     to ``size``, saved as a temporary JPEG and handed to tesseract with a
     character whitelist.

     :param dilateiter: value forwarded to ``dilateImage``.
     :param erodeiter: value forwarded to ``erodeImage``.
     :param threshold: value forwarded to ``thresholdImage`` together with
         ``cv.CV_THRESH_BINARY``.
     :param size: target size for the final ``resizeImage`` call.
     :param whitelist_chars: the characters to recognize.
     :returns: the contents of tesseract's text output as ``str``.
     """
     import os
     from tempfile import NamedTemporaryFile

     resized = resizeImage(self.image, (self.image.width*6, self.image.height*6))

     dilateImage(resized, dilateiter)
     erodeImage(resized, erodeiter)
     thresholdImage(resized, threshold, cv.CV_THRESH_BINARY)

     resized = resizeImage(resized, size)

     # Context managers close (and thereby delete) both temporary files
     # deterministically instead of relying on garbage collection.
     with NamedTemporaryFile(suffix='.jpg') as temp_img_file, \
             NamedTemporaryFile() as temp_solution_file:
         # The result is read from "<output base>.txt", a separate file that
         # the temp-file wrapper does not know about.
         solution_path = temp_solution_file.name + '.txt'
         cv.SaveImage(temp_img_file.name, resized)
         try:
             # Call the tesseract engine
             tesseract(temp_img_file.name, temp_solution_file.name, '-c',
                       'tessedit_char_whitelist=' + whitelist_chars)
             return str(cat(solution_path))
         finally:
             # Remove the output even when reading it fails, but do not
             # raise a second error if tesseract never created it.
             if os.path.exists(solution_path):
                 os.unlink(solution_path)