def handle_query(params, start_response):
    print >> sys.stderr, params
    if params['lang'] and params['book']:
        try:
            ret_code = '200 OK'
            result = hocr.get_hocr(params['lang'], params['book'])
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = {'error': 1, 'text': ret_code}
    else:
        ret_code = '400 Bad Request'
        result = {'error': 1, 'text': ret_code}

    try:
        text = json.dumps(result)
    except UnicodeDecodeError:
        print >> sys.stderr, result
        ret_code = '400 Bad Request'
        text = json.dumps({'error': 1, 'text': ret_code})

    start_response(ret_code,
                   [('Content-Type', 'application/json; charset=UTF-8'),
                    ('Content-Length', str(len(text))),
                    ('Access-Control-Allow-Origin', '*')])
    return [text]
def extract_image(opt, page_nr, filename):
    try:
        width, height = image_size(page_nr, filename)
        # Pick the smallest subsample factor that keeps the page below
        # ~50 mebipixels; ddjvu accepts at most -subsample=12.
        subsample = 1
        while (width * height) / subsample > (1 << 20) * 50:
            subsample += 1
        subsample = min(subsample, 12)
    except Exception:
        utils.print_traceback("Unable to get image size, subsample=1",
                              filename)
        subsample = 1
    if subsample != 1:
        print "subsample", subsample
    tiff_name = opt.out_dir + 'page_%04d.tif' % page_nr
    ddjvu = djvulibre_path + 'ddjvu'
    ls = subprocess.Popen([ddjvu, "-format=tiff", "-page=%d" % page_nr,
                           "-subsample=%d" % subsample, filename, tiff_name],
                          stdout=subprocess.PIPE, preexec_fn=setrlimits,
                          close_fds=True)
    text = utils.safe_read(ls.stdout)
    if text:
        print text
    ls.wait()
    if ls.returncode != 0:
        print >> sys.stderr, "extract_image fail: ", ls.returncode, filename, page_nr
        return None
    return tiff_name
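# --- Standalone sketch, not part of the original module: what the subsample
# loop in extract_image() computes, namely the smallest integer divisor that
# brings the page under the ~50 mebipixel budget, capped at 12 (the largest
# -subsample factor ddjvu accepts).
def subsample_for(width, height, budget=(1 << 20) * 50, cap=12):
    subsample = 1
    while (width * height) / subsample > budget:
        subsample += 1
    return min(subsample, cap)

assert subsample_for(10000, 8000) == 2  # 80 MiP exceeds the ~52.4 MiP budget
assert subsample_for(2000, 3000) == 1   # 6 MiP already fits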
def parse(opt, filename):
    try:
        ret_code = do_parse(opt, filename)
    except Exception:
        utils.print_traceback(filename)
        ret_code = -1
    return ret_code
def do_file(job_queue, opt, filename):
    while True:
        page_nr = job_queue.get()
        if page_nr is None:
            print "Stopping thread"
            return
        try:
            do_one_page(opt, page_nr, filename)
        except Exception:
            utils.print_traceback(filename)
def handle_suggest_query(params, start_response):
    if params['lang'] and params['title']:
        try:
            modernize = modernization.Modernization(params['lang'])
            result = modernize.suggest_dict(params['title'])
            ret_code = '200 OK'
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = {'error': 1, 'text': ret_code}
    else:
        ret_code = '400 Bad Request'
        result = {'error': 1, 'text': ret_code}

    return return_response(start_response, result, True, ret_code,
                           'application/json')
def html_for_queue(queue):
    html = u''
    for i in queue:
        mtitle = i[0]
        codelang = i[1]
        try:
            msite = pywikibot.getSite(codelang, 'wikisource')
            page = pywikibot.Page(msite, mtitle)
            path = msite.nice_get_address(page.title(asUrl=True))
            url = '%s://%s%s' % (msite.protocol(), msite.hostname(), path)
        except BaseException:
            utils.print_traceback()
            url = ""
        html += (date_s(i[3]) + ' ' + i[2] + " " + i[1]
                 + " <a href=\"" + url + "\">" + i[0] + "</a><br/>")
    return html
def handle_blacklist_query(params, start_response):
    if params['lang'] and params['blacklist']:
        try:
            modernize = modernization.Modernization(params['lang'])
            blacklist = json.loads(params['blacklist'])
            modernize.save_blacklist(blacklist)
            ret_code = '200 OK'
            result = {'error': 0, 'text': 'OK'}
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            result = {'error': 1, 'text': ret_code}
    else:
        ret_code = '400 Bad Request'
        result = {'error': 1, 'text': ret_code}

    return return_response(start_response, result, True, ret_code,
                           'application/json')
def get_djvu(cache, mysite, djvuname, check_timestamp=False):
    print "get_djvu", repr(djvuname)
    djvuname = djvuname.replace(" ", "_")
    cache_filename = djvuname + '.dat'
    # Cache entries are (sha1, text) tuples keyed by the File: name.
    obj = cache.get(cache_filename)
    if not obj:
        print "CACHE MISS"
        filepage = copy_File.get_filepage(mysite, djvuname)
        if not filepage:
            # can occur if File: has been deleted
            return None
        try:
            url = filepage.fileUrl()
            obj = extract_djvu_text(url, djvuname, filepage.getFileSHA1Sum())
        except:
            utils.print_traceback("extract_djvu_text() fail")
            obj = None
        if obj:
            cache.set(cache_filename, obj)
        else:
            return None
    elif check_timestamp:
        filepage = copy_File.get_filepage(mysite, djvuname)
        if not filepage:
            # can occur if File: has been deleted
            return None
        sha1 = filepage.getFileSHA1Sum()
        if sha1 != obj[0]:
            print "OUTDATED FILE"
            url = filepage.fileUrl()
            try:
                obj = extract_djvu_text(url, djvuname, sha1)
                cache.set(cache_filename, obj)
            except:
                return None
    return obj[1]
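# --- Standalone sketch, not part of the original module: the invalidation
# scheme get_djvu() relies on. Cache entries are (sha1, text) pairs, and an
# entry is stale as soon as its stored sha1 differs from the live sha1 of
# the File: page. cached_or_refresh is a hypothetical helper, reduced to
# the bare pattern.
def cached_or_refresh(cache, key, live_sha1, refresh):
    obj = cache.get(key)
    if obj and obj[0] == live_sha1:
        return obj[1]  # hit and still current
    text = refresh()
    cache.set(key, (live_sha1, text))
    return text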
def exec_request(self, r):
    sge_job_nr = 0
    # This is a bit convoluted, but it avoids a race condition: we mark the
    # job as running *before* starting it, so if this script runs twice in
    # parallel we don't try to start the same job twice. Once the job has
    # really started (or failed to start) we update its state again. Since
    # we don't know the sge job number yet, we set it to zero. Note this
    # could be done in pending_request() but I prefer to protect it locally.
    really_pending = False
    with db.connection(self):
        q = 'UPDATE job SET job_state=%s, sge_jobnumber=%s WHERE job_id=%s AND job_state="pending"'
        if self.cursor.execute(q, ['running', 0, r['job_id']]):
            really_pending = True
    if not really_pending:
        print >> sys.stderr, "run request for job_id %s cancelled, as it's no longer pending" % r['job_id']
        return
    cmdline_arg = job_cmdline_arg(r, 'job_run_cmd')
    sge_cmdline = sge_cmdline_arg(r)
    ls = subprocess.Popen(sge_cmdline + cmdline_arg, stdin=None,
                          stdout=subprocess.PIPE, close_fds=True)
    text = ls.stdout.read()
    ls.wait()
    try:
        sge_job_nr = int(re.search('Your job (\d+) ', text).group(1))
        new_state = 'running'
    except:
        utils.print_traceback("sge failure to exec job: %d" % r['job_id'],
                              text)
        new_state = 'sge_fail'
    # Now we can really update the job state, see comment above.
    with db.connection(self):
        q = 'UPDATE job SET job_state=%s, sge_jobnumber=%s WHERE job_id=%s'
        self.cursor.execute(q, [new_state, sge_job_nr, r['job_id']])
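# --- Standalone sketch, not part of the original module: the optimistic
# claim pattern exec_request() uses above. The UPDATE matches only rows
# still in the pending state, so the affected-row count tells each runner
# whether it won the race. sqlite3 stands in here for the real database
# layer; the schema mirrors only the two columns the pattern needs.
import sqlite3

def try_claim(conn, job_id):
    cur = conn.execute("UPDATE job SET job_state = 'running' "
                       "WHERE job_id = ? AND job_state = 'pending'",
                       (job_id,))
    return cur.rowcount == 1  # True only for the first claimer

conn = sqlite3.connect(':memory:')
conn.execute("CREATE TABLE job (job_id INTEGER PRIMARY KEY, job_state TEXT)")
conn.execute("INSERT INTO job VALUES (1, 'pending')")
print try_claim(conn, 1)  # True: this runner claimed the job
print try_claim(conn, 1)  # False: it is no longer pending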
def handle_scan_query(params, start_response):
    # Build the page as unicode and encode once when responding; encoding
    # here and again in return_response() would crash on non-ASCII text.
    text = common_html.get_head('pages without scan',
                                css='shared.css') + u'\n <body>\n'
    if params['lang']:
        try:
            offset = int(params.get('offset', 0))
            limit = min(500, int(params.get('limit', 500)))
            lang = params['lang']
            conn = db.create_conn(domain=lang, family='wikisource')
            cursor = db.use_db(conn, domain=lang, family='wikisource')
            ns = ws_category.domain_urls[lang][0]
            result = pages_without_scan(ns, cursor)
            result_len = len(result)
            result = result[offset:offset + limit]
            result = [(unicode(x[0], 'utf-8'), x[1]) for x in result]
            text += 'Total: ' + str(result_len) + '<br />'
            next_link = prev_next_link(False, result_len, lang, limit, offset)
            prev_link = prev_next_link(True, result_len, lang, limit, offset)
            text += prev_link + ' ' + next_link + '<br /><br />'
            for x in result:
                text += (u'<a href="//%s.wikisource.org/wiki/%s">' % (lang, x[0])
                         + x[0].replace('_', ' ') + u'</a>, '
                         + str(x[1]) + u'<br />')
            text += u'<br />' + prev_link + ' ' + next_link
            cursor.close()
            conn.close()
            ret_code = '200 OK'
        except:
            utils.print_traceback()
            ret_code = '500 Internal Server Error'
            text = '<h1>' + ret_code + '</h1>'
    else:
        ret_code = '400 Bad Request'
        text = '<h1>' + ret_code + '</h1>'
    text += ' </body>\n</html>'
    return return_response(start_response, text.encode('utf-8'), False,
                           ret_code, 'text/html')
    if not options.lang or not options.book:
        print >> sys.stderr, 'missing option -lang: and/or -book:', sys.argv
        exit(1)

    ret = is_uptodate(options.lang, options.book)
    if ret > 0:
        if not hocr(options):
            print >> sys.stderr, 'Error, hocr fail'
            ret = 2
        else:
            update_db(options.lang, options.book)
            ret = 0
    elif ret < 0:
        print >> sys.stderr, "Error, file doesn't exist:", ret
        ret = 3 + abs(ret)
    else:
        update_db(options.lang, options.book)
    return ret

if __name__ == '__main__':
    cache_dir = 'hocr'
    if not os.path.exists(os.path.expanduser('~/cache/' + cache_dir)):
        os.mkdir(os.path.expanduser('~/cache/' + cache_dir))
    try:
        ret = main()
    except:
        utils.print_traceback()
        exit(4)
    exit(ret)
def do_match(mysite, maintitle, user, codelang):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")
    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        utils.print_traceback("failed to get page")
        return ret_val(E_ERROR, "failed to get page")

    if text.find("{{R2Mondes") != -1:
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        p = re.compile('==\[\[Page:([^=]+)\]\]==\n')
        cache = lifo_cache.LifoCache('match_and_split_text_layer')
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]
            filename, pagenum = title.split('/')
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            if content.find("R2Mondes") != -1:
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(E_ERROR,
                               "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page "
                               + title0)
            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s : %f" % (filename, pagenum, r)
            if r < 0.1:
                return ret_val(E_ERROR,
                               "Erreur : Le texte ne correspond pas, page %s"
                               % pagenum)
        # the page is ok
        new_text = re.sub(u'<references[ ]*/>', u'', new_text)
        new_text = re.sub(u'[ ]([,])', u'\\1', new_text)
        new_text = re.sub(u'([^.])[ ]([,.])', u'\\1\\2', new_text)
        new_text = re.sub(u'\.\.\.', u'…', new_text)
        new_text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', new_text)
        new_text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u'([^ \s])([»])', u'\\1 \\2', new_text)
        # workaround some buggy text
        new_text = re.sub(u'([;:!?»]) \n', u'\\1\n', new_text)
        new_text = re.sub(u'([;:!?»])\'\'([ \n])', u'\\1\'\'\\2', new_text)
        # < ><space>
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', new_text)
        new_text = new_text.replace(u'Page : ', u'Page:')
        new_text = new_text.replace(u'\n: ', u'\n:')
        new_text = new_text.replace(u'\n:: ', u'\n::')
        new_text = new_text.replace(u'\n::: ', u'\n:::')
        new_text = new_text.replace(u'\n:::: ', u'\n::::')
        new_text = new_text.replace(u'\n::::: ', u'\n:::::')
        new_text = re.sub(u'1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)',
                          u'1{{er}} \\1', new_text)
        new_text = re.sub(u'([0-9])e ', u'\\1{{e}} ', new_text)
        #text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)
        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            pos = re.search(u'==(.*?)==', new_text)
            if pos:
                new_text = (new_text[0:pos.end(0)] + u'\n{{c|'
                            + match_title.group(1) + u'|fs=140%}}\n\n\n'
                            + new_text[pos.end(0):])
        safe_put(page, new_text, user + ": match")
        jobs['number_of_split_job'] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs['split_queue'].put(maintitle, codelang, user, time.time(),
                                None, None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")

    prefix = prefix.decode('utf-8')
    p = re.compile("==__MATCH__:\[\[" + prefix
                   + ":(.*?)/(\d+)(\|step=(\d+))?\]\]==")
    m = re.search(p, text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)):]
        if m.group(4):
            try:
                step = int(m.group(4))
            except:
                return ret_val(E_ERROR, "match tag invalid")
        else:
            step = 1
    else:
        return ret_val(E_ERROR, "match tag not found")
    pywikibot.output(djvuname + " " + number + " " + str(step))
    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")
    cache = lifo_cache.LifoCache('match_and_split_text_layer')
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(E_ERROR,
                       "unable to read djvu, if the File: exists, please retry")
    data = align.do_match(text, cached_text, djvuname, number,
                          verbose=False, prefix=prefix, step=step)
    if not data['error']:
        safe_put(page, head + data['text'], user + ": match")
        data['text'] = ""
    return data
def myapp(environ, start_response):
    params = query_params(environ)
    if params['cmd'] == 'ping':
        return handle_ping(start_response)
    elif params['cmd'] == 'hocr':
        return handle_query(params, start_response)
    else:
        return handle_status(params, start_response)

if __name__ == "__main__":
    sys.stderr = open(os.path.expanduser('~/log/hocr_cgi.err'), 'a')
    from flup.server.cgi import WSGIServer
    try:
        WSGIServer(myapp).run()
    except BaseException:
        utils.print_traceback()
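# --- Standalone sketch, not part of the original module: how myapp can be
# exercised without the flup CGI wrapper, e.g. for local debugging. It
# assumes query_params() reads the standard QUERY_STRING variable from the
# WSGI environ; the request values below are made up.
def fake_start_response(status, headers):
    print status
    for name, value in headers:
        print '%s: %s' % (name, value)

def debug_request(query_string):
    environ = {'REQUEST_METHOD': 'GET', 'QUERY_STRING': query_string}
    for chunk in myapp(environ, fake_start_response):
        print chunk

# debug_request('cmd=ping')
# debug_request('cmd=hocr&lang=fr&book=Some_book.djvu')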