def ocr_image(cache, url, codelang):
    """OCR the image at *url* with tesseract, caching the result.

    The cache is checked again here (in addition to bot_listening) so
    that repeated requests for the same page run the OCR only once.

    Returns ret_val(0, text) on success, ret_val(1, msg) when the
    download fails and ret_val(2, msg) when tesseract fails.
    """
    text = get_from_cache(cache, url, codelang)
    if text:
        return ret_val(0, text)

    url = url.encode('utf-8')
    cache_key = image_key(url)
    # Fall back to English when no tesseract model exists for this wiki.
    lang = ocr.tesseract_languages.get(codelang, 'eng')

    basename = os.path.expanduser('~/tmp') + '/tesseract/image_%s' % cache_key
    image_filename = basename + ".jpg"

    utils.copy_file_from_url(url, image_filename)
    if not os.path.exists(image_filename):
        return ret_val(1, "could not download url: %s" % url)

    try:
        text = ocr.ocr(image_filename, basename, lang)
    finally:
        # Always remove the downloaded image and tesseract's .txt output,
        # including on OCR failure (the previous code leaked both files
        # by returning before the cleanup).
        if os.path.exists(image_filename):
            os.remove(image_filename)
        if os.path.exists(basename + ".txt"):
            os.remove(basename + ".txt")

    if text is None:
        return ret_val(2, "ocr failed")

    cache.set(cache_key, text)
    return ret_val(0, text)
def extract_djvu_text(url, filename, sha1): print "extracting text layer" if type(filename) == type(u''): filename = filename.encode('utf-8') utils.copy_file_from_url(url, filename, sha1) data = [] # GTK app are very touchy os.environ['LANG'] = 'en_US.UTF8' # FIXME: check return code ls = subprocess.Popen(['djvutxt', filename, '--detail=page'], stdout=subprocess.PIPE, close_fds=True) text = ls.stdout.read() ls.wait() for t in re.finditer( u'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n', text): t = unicode(t.group(1), 'utf-8', 'replace') t = re.sub(u'^page \d+ \d+ \d+ \d+[ \n]+"', u'', t) t = re.sub(u'"[ ]*$', u'', t) t = unquote_text_from_djvu(t) data.append(t) os.remove(filename) return sha1, data
def extract_djvu_text(url, filename, sha1): print "extracting text layer" if type(filename) == type(u''): filename = filename.encode('utf-8') utils.copy_file_from_url(url, filename, sha1) data = [] # GTK app are very touchy os.environ['LANG'] = 'en_US.UTF8' # FIXME: check return code ls = subprocess.Popen([ 'djvutxt', filename, '--detail=page'], stdout=subprocess.PIPE, close_fds = True) text = ls.stdout.read() ls.wait() for t in re.finditer(u'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n', text): t = unicode(t.group(1), 'utf-8', 'replace') t = re.sub(u'^page \d+ \d+ \d+ \d+[ \n]+"', u'', t) t = re.sub(u'"[ ]*$', u'', t) t = unquote_text_from_djvu(t) data.append(t) os.remove(filename) return sha1, data
def check_and_upload(url, filename, sha1):
    """Ensure *filename* exists locally with the expected *sha1*.

    When the file is missing or its checksum does not match, it is
    (re)downloaded from *url*. Returns False only if that download
    fails, True otherwise.
    """
    local_copy_ok = os.path.exists(filename) and utils.sha1(filename) == sha1
    if local_copy_ok:
        return True
    return bool(utils.copy_file_from_url(url, filename, sha1))
if not os.path.exists(out_filename) or ls.returncode: # in case returncode == 0 print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename fd = open(out_filename, 'w') fd.write('An error occurred during ocr processing: ' + filename) fd.close() fd = open(out_filename) txt = fd.read() fd.close() if tesseract_data_prefix: del os.environ['TESSDATA_PREFIX'] if ls.returncode != 0: print >> sys.stderr, "ocr.ocr() fail to exec tesseract:", ls.returncode, filename return None return txt if __name__ == "__main__": image_filename = 'temp.jpg' url= 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu/page280-1024px-Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu.jpg' lang = 'fr' utils.copy_file_from_url(url, image_filename) print ocr(image_filename, image_filename, tesseract_languages[lang], config = 'hocr') os.remove(image_filename) os.remove(image_filename + ".txt")
def copy_ia_file(ia_id, metadata):
    """Download one file of an Internet Archive item, verifying its sha1."""
    name = metadata['name']
    base_url = 'https://archive.org/download/%s/' % ia_id
    utils.copy_file_from_url(base_url + name, name,
                             expect_sha1=metadata['sha1'])