def ocr_image(cache, url, codelang):
    # This is already checked in bot_listening but must be redone here, so
    # that if the ocr for the same page is requested multiple times, we do
    # the ocr only once.
    text = get_from_cache(cache, url, codelang)
    if text:
        return ret_val(0, text)

    url = url.encode('utf-8')
    cache_key = image_key(url)
    lang = ocr.tesseract_languages.get(codelang, 'eng')

    basename = os.path.expanduser('~/tmp') + '/tesseract/image_%s' % cache_key
    image_filename = basename + ".jpg"
    utils.copy_file_from_url(url, image_filename)
    if not os.path.exists(image_filename):
        return ret_val(1, "could not download url: %s" % url)

    text = ocr.ocr(image_filename, basename, lang)
    if text is None:
        return ret_val(2, "ocr failed")

    os.remove(image_filename)
    if os.path.exists(basename + ".txt"):
        os.remove(basename + ".txt")

    cache.set(cache_key, text)
    return ret_val(0, text)
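# For reference, a minimal sketch of the helpers ocr_image() relies on. These
# are assumptions about their behaviour, not the project's actual code:
# ret_val() pairs a status code with a payload, image_key() derives a stable
# cache key from the url, and get_from_cache() returns cached text or None.
import hashlib

def ret_val(error, text):
    # error 0 means success; any other code is an error described by 'text'.
    return {'error': error, 'text': text}

def image_key(url):
    # sha1 of the url gives a fixed-length, filesystem-safe key.
    return hashlib.sha1(url).hexdigest()

def get_from_cache(cache, url, codelang):
    # cache is assumed to expose a memcached-like get()/set() interface,
    # returning None on a miss.
    return cache.get(image_key(url.encode('utf-8')))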
def extract_djvu_text(url, filename, sha1):
    print "extracting text layer"

    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')
    utils.copy_file_from_url(url, filename, sha1)

    data = []
    # GTK apps are very touchy about the locale.
    os.environ['LANG'] = 'en_US.UTF8'
    # FIXME: check return code
    ls = subprocess.Popen(['djvutxt', filename, '--detail=page'],
                          stdout=subprocess.PIPE, close_fds=True)
    text = ls.stdout.read()
    ls.wait()
    for t in re.finditer(u'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n', text):
        t = unicode(t.group(1), 'utf-8', 'replace')
        t = re.sub(u'^page \d+ \d+ \d+ \d+[ \n]+"', u'', t)
        t = re.sub(u'"[ ]*$', u'', t)
        t = unquote_text_from_djvu(t)
        data.append(t)
    os.remove(filename)

    return sha1, data
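# unquote_text_from_djvu() is assumed to undo the C-style escaping djvutxt
# applies inside quoted strings; a minimal sketch covering the common escapes
# (the real helper may also decode octal byte escapes):
def unquote_text_from_djvu(text):
    text = text.replace(u'\\n', u'\n')
    text = text.replace(u'\\"', u'"')
    # Unescape backslashes last so the replacements above are not reapplied.
    text = text.replace(u'\\\\', u'\\')
    return text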
        # in case returncode == 0
        print >> sys.stderr, "ocr.ocr() failed to exec tesseract:", ls.returncode, filename
        fd = open(out_filename, 'w')
        fd.write('An error occurred during ocr processing: ' + filename)
        fd.close()

    fd = open(out_filename)
    txt = fd.read()
    fd.close()

    if tesseract_data_prefix:
        del os.environ['TESSDATA_PREFIX']

    if ls.returncode != 0:
        print >> sys.stderr, "ocr.ocr() failed to exec tesseract:", ls.returncode, filename
        return None

    return txt

if __name__ == "__main__":
    import os
    image_filename = 'temp.jpg'
    url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu/page280-1024px-Petitot_-_Collection_compl%C3%A8te_des_m%C3%A9moires_relatifs_%C3%A0_l%E2%80%99histoire_de_France%2C_2e_s%C3%A9rie%2C_tome_45.djvu.jpg'
    lang = 'fr'
    utils.copy_file_from_url(url, image_filename)
    print ocr(image_filename, image_filename, tesseract_languages[lang], config='hocr')
    os.remove(image_filename)
    os.remove(image_filename + ".txt")
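# tesseract_languages is assumed to map wiki language codes to the
# three-letter codes of tesseract's traineddata files; a minimal sketch:
tesseract_languages = {
    'de': 'deu',
    'en': 'eng',
    'es': 'spa',
    'fr': 'fra',
    'it': 'ita',
}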
def copy_file(lang, family, filename, dest):
    site = pywikibot.getSite(lang, family)
    page = get_filepage(site, unicode(filename, 'utf-8'))
    url = page.fileUrl()
    utils.copy_file_from_url(url, dest, page.getFileSHA1Sum())
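if __name__ == "__main__":
    # Hypothetical example: fetch File:Example.djvu from French Wikisource,
    # letting copy_file() verify the download against the recorded SHA-1.
    copy_file('fr', 'wikisource', 'Example.djvu', '/tmp/Example.djvu')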
def check_and_upload(url, filename, sha1):
    # (Re)download the file only if it is missing or its checksum no longer
    # matches the expected sha1.
    if not os.path.exists(filename) or utils.sha1(filename) != sha1:
        if not utils.copy_file_from_url(url, filename, sha1):
            return False
    return True
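# For reference, utils.sha1() is assumed to hash the file contents in fixed
# size chunks so large scans don't have to fit in memory; a minimal sketch:
import hashlib

def sha1(filename):
    h = hashlib.sha1()
    fd = open(filename, 'rb')
    while True:
        block = fd.read(65536)
        if not block:
            break
        h.update(block)
    fd.close()
    return h.hexdigest()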
    os.environ['GSDJVU'] = gsdjvu

    out_file = in_file[:-3] + 'djvu'
    djvudigital = djvulibre_path + 'djvudigital'
    # The --words option is useless, as many pdf files contain a text layer
    # only for the first page.
    ls = subprocess.Popen([djvudigital, "--dpi=300", in_file, out_file],
                          stdout=subprocess.PIPE, preexec_fn=setrlimits,
                          close_fds=True)
    text = ls.stdout.read()
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "djvudigital failed: ", ls.returncode, in_file
        out_file = None

    if gsdjvu:
        del os.environ['GSDJVU']

    return out_file

if __name__ == "__main__":
    import utils
    in_file = 'https://upload.wikimedia.org/wikipedia/commons/8/81/Accord_compl%C3%A9mentaire_relatif_%C3%A0_la_Malaisie_le_11_Septembre_1963.pdf'
    out_file = os.path.expanduser('~/tmp/') + 'Accord complémentaire relatif à la Malaisie le 11 Septembre 1963.pdf'
    utils.copy_file_from_url(in_file, out_file)
    djvu_name = pdf_to_djvu(out_file)
    os.remove(out_file)
    #os.remove(djvu_name)
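# setrlimits() is passed as preexec_fn above so the limits apply only to the
# child process, not the bot itself; a minimal sketch, assuming CPU and
# address-space caps (the actual limit values here are an assumption):
import resource

def setrlimits():
    # 30 minutes of CPU time and 2 GiB of address space for the conversion.
    resource.setrlimit(resource.RLIMIT_CPU, (30 * 60, 30 * 60))
    resource.setrlimit(resource.RLIMIT_AS, (2 << 30, 2 << 30))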