import os
import re
import thread
import time

import pywikibot

import align
import job_queue
import lifo_cache

# ret_val(), date_s(), safe_put(), do_extract(), repl(), bot_listening(),
# page_prefixes, jobs and E_ERROR are assumed defined elsewhere in the module.


def job_thread(queue):
    # Per-thread LIFO caches (verify-match diffs and text layers).
    cache_dir1 = 'verify_match_diff'
    cached_diff = lifo_cache.LifoCache(cache_dir1)
    cache_dir2 = 'verify_match_text_layer'
    cached_text = lifo_cache.LifoCache(cache_dir2)
    while True:
        title, codelang, user, t, tools, conn = queue.get()

        time1 = time.time()
        out = ''
        try:
            mysite = pywikibot.getSite(codelang, 'wikisource')
        except:
            out = ret_val(E_ERROR, "site error: " + repr(codelang))
            mysite = False

        if mysite:
            out = do_match(mysite, title, user, codelang)

        if tools and conn:
            tools.send_reply(conn, out)
            conn.close()

        time2 = time.time()
        print (date_s(time2) + title + ' ' + user + " " + codelang
               + " (%.2f)" % (time2 - time1)).encode('utf-8')

        queue.remove()
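A request is a flat tuple whose layout has to mirror the unpacking in job_thread() above. A minimal sketch of enqueueing one job by hand, assuming job_queue.JobQueue.put() accepts the fields positionally (as the split_queue.put() call further down does); the title and user are made up:

    queue = job_queue.JobQueue()
    # (title, codelang, user, timestamp, tools, conn); tools and conn stay
    # None when no client socket is waiting for the reply.
    queue.put(u'Revue des Deux Mondes - 1900.djvu', 'fr', 'ExampleUser',
              time.time(), None, None)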
def do_match(mysite, maintitle, user, codelang):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")

    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        return ret_val(E_ERROR, "failed to get page")

    if text.find("{{R2Mondes") != -1:
        global pl_dict
        pl_dict = {}
        p0 = re.compile(r"\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")

        p = re.compile(r'==\[\[Page:([^=]+)\]\]==\n')
        cache = lifo_cache.LifoCache('match_and_split_text_layer')
        # p.split() yields [head, title, content, title, content, ...]
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]

            filename, pagenum = title.split('/')
            # Fetch the djvu text layer, refreshing it only once per job.
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")

            if content.find("R2Mondes") != -1:
                p0 = re.compile(r"\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(
                    E_ERROR,
                    "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page "
                    + title0)

            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s : %f" % (filename, pagenum, r)
            if r < 0.1:
                return ret_val(
                    E_ERROR,
                    "Erreur : Le texte ne correspond pas, page %s" % pagenum)

        # the page is ok, normalize the French typography
        new_text = re.sub(u'<references[ ]*/>', u'', new_text)
        new_text = re.sub(u'[ ]([,])', u'\\1', new_text)
        new_text = re.sub(u'([^.])[ ]([,.])', u'\\1\\2', new_text)
        new_text = re.sub(u'\.\.\.', u'…', new_text)
        new_text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', new_text)
        new_text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u'([^ \s])([»])', u'\\1 \\2', new_text)
        # workaround some buggy text
        new_text = re.sub(u'([;:!?»]) \n', u'\\1\n', new_text)
        new_text = re.sub(u'([;:!?»])\'\'([ \n])', u'\\1\'\'\\2', new_text)  # < ><space>
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', new_text)
        new_text = new_text.replace(u'Page : ', u'Page:')
        new_text = new_text.replace(u'\n: ', u'\n:')
        new_text = new_text.replace(u'\n:: ', u'\n::')
        new_text = new_text.replace(u'\n::: ', u'\n:::')
        new_text = new_text.replace(u'\n:::: ', u'\n::::')
        new_text = new_text.replace(u'\n::::: ', u'\n:::::')
        new_text = re.sub(
            u'1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)',
            u'1{{er}} \\1', new_text)
        new_text = re.sub(u'([0-9])e ', u'\\1{{e}} ', new_text)
        #text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)

        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            pos = re.search(u'==(.*?)==', new_text)
            if pos:
                new_text = (new_text[:pos.end(0)]
                            + u'\n{{c|' + match_title.group(1) + u'|fs=140%}}\n\n\n'
                            + new_text[pos.end(0):])

        safe_put(page, new_text, user + ": match")
        jobs['number_of_split_job'] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs['split_queue'].put(maintitle, codelang, user, time.time(),
                                None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")

    prefix = prefix.decode('utf-8')
    p = re.compile(r"==__MATCH__:\[\[" + prefix + r":(.*?)/(\d+)\]\]==")
    m = p.search(text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        # Everything before the tag is kept untouched as the page head.
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)):]
    else:
        return ret_val(E_ERROR, "match tag not found")

    pywikibot.output(djvuname + " " + number)

    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")

    cache = lifo_cache.LifoCache('match_and_split_text_layer')
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(
            E_ERROR, "unable to read djvu, if the File: exists, please retry")

    data = align.do_match(text, cached_text, djvuname, number,
                          verbose=False, prefix=prefix)
    if not data['error']:
        safe_put(page, head + data['text'], user + ": match")
        data['text'] = ""
    return data
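For context, the tag do_match() scans for sits on its own line in the wikitext, e.g. ==__MATCH__:[[Page:Some File.djvu/27]]== with the French prefix (the file name is made up); everything before the tag is preserved as the page head. A minimal sketch of driving one match by hand, with made-up site, title and user values; do_match() returns a dict with at least an 'error' key, and on success the page has already been saved by safe_put():

    mysite = pywikibot.getSite('fr', 'wikisource')
    data = do_match(mysite, u'Some article title', 'ExampleUser', 'fr')
    # A falsy 'error' means the alignment succeeded; 'text' is emptied
    # before returning, so there is nothing else for the caller to save.
    print data['error']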
# extract_text_layer variant of the worker; do_extract() is defined elsewhere.
def job_thread(queue, cache):
    while True:
        title, codelang, user, t, tools, conn = queue.get()

        time1 = time.time()
        out = ''
        try:
            mysite = pywikibot.getSite(codelang, 'wikisource')
        except:
            out = ret_val(E_ERROR, "site error: " + repr(codelang))
            mysite = False

        if mysite:
            out = do_extract(mysite, title, user, codelang, cache)

        if tools and conn:
            tools.send_reply(conn, out)
            conn.close()

        time2 = time.time()
        print (date_s(time2) + title + ' ' + user + " " + codelang
               + " (%.2f)" % (time2 - time1)).encode('utf-8')

        queue.remove()


if __name__ == "__main__":
    try:
        cache_dir = 'extract_text_layer'
        if not os.path.exists(os.path.expanduser('~/cache/' + cache_dir)):
            os.makedirs(os.path.expanduser('~/cache/' + cache_dir))
        cache = lifo_cache.LifoCache(cache_dir)
        queue = job_queue.JobQueue()
        thread.start_new_thread(job_thread, (queue, cache))
        bot_listening(queue)
    except KeyboardInterrupt:
        pywikibot.stopme()
        os._exit(1)
    finally:
        pywikibot.stopme()