def write_templates(res):
    import pywikibot
    from common.pywikibot_utils import safe_put

    for dom in ['fr', 'en', 'bn', 'pl']:
        if dom == 'fr':
            sep = ' '
        elif dom == 'en':
            sep = ','
        else:
            sep = ''

        num, num_q0, num_q2, num_q3, num_q4, num_tr, num_texts, num_disambig = decode_res(res[dom])
        percent = num_tr * 100. / (num_texts - num_disambig)
        num_q1 = num - (num_q0 + num_q2 + num_q3 + num_q4)

        site = pywikibot.getSite(dom, fam='wikisource')

        page = pywikibot.Page(site, "Template:PAGES_NOT_PROOFREAD")
        safe_put(page, spaced_int(num_q1, sep), "")

        page = pywikibot.Page(site, "Template:ALL_PAGES")
        safe_put(page, spaced_int(num, sep), "")

        page = pywikibot.Page(site, "Template:PR_TEXTS")
        safe_put(page, spaced_int(num_tr, sep), "")

        page = pywikibot.Page(site, "Template:ALL_TEXTS")
        safe_put(page, spaced_int(num_texts - num_disambig, sep), "")

        page = pywikibot.Page(site, "Template:PR_PERCENT")
        safe_put(page, "%.2f" % percent, "")
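# NOTE: write_templates() relies on two helpers defined elsewhere in the
# repository: decode_res(), which unpacks the per-domain statistics tuple,
# and spaced_int(), which renders an integer with the per-language thousands
# separator chosen above.  The function below is only an illustrative sketch
# of what spaced_int() is expected to do (group digits by three and join the
# groups with the given separator); the real helper may differ.
def spaced_int_sketch(num, sep):
    digits = str(num)
    groups = []
    while digits:
        groups.insert(0, digits[-3:])
        digits = digits[:-3]
    return sep.join(groups)

# e.g. spaced_int_sketch(1234567, ' ') == '1 234 567'
#      spaced_int_sketch(1234567, ',') == '1,234,567'
#      spaced_int_sketch(1234567, '')  == '1234567'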
def do_extract(mysite, maintitle, user, codelang, cache):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")
    prefix = unicode(prefix, 'utf-8')

    djvuname = maintitle.replace(u' ', u'_')
    print djvuname.encode('utf-8')

    text_layer = align.get_djvu(cache, mysite, djvuname, True)
    if not text_layer:
        return ret_val(E_ERROR, "unable to retrieve text layer")

    # build one ==[[Page:...]]== section per page of the text layer
    text = u''
    for pos, page_text in enumerate(text_layer):
        text += u'==[[' + prefix + u':' + maintitle + u'/' + unicode(pos + 1) + u']]==\n'
        text += page_text + u'\n'

    # save the result to the requesting user's /Text subpage
    page = pywikibot.Page(mysite, u'User:' + user + u'/Text')
    safe_put(page, text, comment=u'extract text')
    return ret_val(E_OK, "")
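# Illustrative only: for a three-page index "Foo.djvu" and the u'Page'
# prefix, do_extract() above assembles wikitext of the form
#
#   ==[[Page:Foo.djvu/1]]==
#   <text layer of page 1>
#   ==[[Page:Foo.djvu/2]]==
#   <text layer of page 2>
#   ==[[Page:Foo.djvu/3]]==
#   <text layer of page 3>
#
# which do_match() and do_split() below then consume page by page.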
def do_match(mysite, maintitle, user, codelang):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no prefix")

    page = pywikibot.Page(mysite, maintitle)
    try:
        text = page.get()
    except:
        utils.print_traceback("failed to get page")
        return ret_val(E_ERROR, "failed to get page")

    if text.find("{{R2Mondes") != -1:
        global pl_dict
        pl_dict = {}
        p0 = re.compile("\{\{R2Mondes\|(\d+)\|(\d+)\|(\d+)\}\}\s*\n")
        try:
            new_text = p0.sub(repl, text)
        except pywikibot.NoPage:
            return ret_val(E_ERROR, "Erreur : impossible de trouver l'index")
        p = re.compile('==\[\[Page:([^=]+)\]\]==\n')
        cache = lifo_cache.LifoCache('match_and_split_text_layer')
        bl = p.split(new_text)
        for i in range(len(bl) / 2):
            title = bl[i * 2 + 1]
            content = bl[i * 2 + 2]
            filename, pagenum = title.split('/')
            if i == 0:
                cached_text = align.get_djvu(cache, mysite, filename, True)
            else:
                cached_text = align.get_djvu(cache, mysite, filename, False)
            if not cached_text:
                return ret_val(E_ERROR, "Erreur : fichier absent")
            if content.find("R2Mondes") != -1:
                p0 = re.compile("\{\{R2Mondes\|\d+\|\d+\|(\d+)\}\}\s*\n")
                bl0 = p0.split(text)
                title0 = bl0[i * 2 + 1].encode("utf8")
                return ret_val(E_ERROR, "Erreur : Syntaxe 'R2Mondes' incorrecte, dans la page " + title0)
            r = align.match_page(content, cached_text[int(pagenum) - 1])
            print "%s %s : %f" % (filename, pagenum, r)
            if r < 0.1:
                return ret_val(E_ERROR, "Erreur : Le texte ne correspond pas, page %s" % pagenum)

        #the page is ok
        new_text = re.sub(u'<references[ ]*/>', u'', new_text)
        new_text = re.sub(u'[ ]([,])', u'\\1', new_text)
        new_text = re.sub(u'([^.])[ ]([,.])', u'\\1\\2', new_text)
        new_text = re.sub(u'\.\.\.', u'…', new_text)
        new_text = re.sub(u'([^ \s])([;:!?])', u'\\1 \\2', new_text)
        new_text = re.sub(u'([«;:!?])([^ \s…])', u'\\1 \\2', new_text)
        # separated from the previous regexp else "word!»" overlap
        new_text = re.sub(u'([^ \s])([»])', u'\\1 \\2', new_text)
        # workaround some buggy text
        new_text = re.sub(u'([;:!?»]) \n', u'\\1\n', new_text)
        new_text = re.sub(u'([;:!?»])\'\'([ \n])', u'\\1\'\'\\2', new_text)
        # < ><space>
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        #new_text = re.sub(u' ([;:!?»])', u' \\1', new_text)
        new_text = re.sub(u'([;:!?»]) <br />', u'\\1<br />', new_text)
        new_text = new_text.replace(u'Page : ', u'Page:')
        new_text = new_text.replace(u'\n: ', u'\n:')
        new_text = new_text.replace(u'\n:: ', u'\n::')
        new_text = new_text.replace(u'\n::: ', u'\n:::')
        new_text = new_text.replace(u'\n:::: ', u'\n::::')
        new_text = new_text.replace(u'\n::::: ', u'\n:::::')
        new_text = re.sub(u'1er (janvier|février|avril|mars|mai|juin|juillet|août|septembre|octobre|novembre|décembre)',
                          u'1{{er}} \\1', new_text)
        new_text = re.sub(u'([0-9])e ', u'\\1{{e}} ', new_text)
        #text = re.sub(u'([;:!?»]) <div>\n', u'\\1\n', new_text)

        # try to move the title inside the M&S
        match_title = re.search(u"{{[Jj]ournal[ ]*\|*(.*?)\|", new_text)
        if match_title:
            pos = re.search(u'==(.*?)==', new_text)
            if pos:
                new_text = new_text[0:pos.end(0)] + u'\n{{c|' + match_title.group(1) + u'|fs=140%}}\n\n\n' + new_text[pos.end(0):]

        safe_put(page, new_text, user + ": match")
        jobs['number_of_split_job'] += 1
        # FIXME: can we pass the request here and use a callback in the js?
        # FIXME: server is None?
        jobs['split_queue'].put(maintitle, codelang, user, time.time(), None, None, None)
        # FIXME: that's an abuse of E_ERROR
        return ret_val(E_ERROR, "ok : transfert en cours.")

    prefix = prefix.decode('utf-8')
    p = re.compile("==__MATCH__:\[\[" + prefix + ":(.*?)/(\d+)(\|step=(\d+))?\]\]==")
    m = re.search(p, text)
    if m:
        djvuname = m.group(1)
        number = m.group(2)
        pos = text.find(m.group(0))
        head = text[:pos]
        text = text[pos + len(m.group(0)):]
        if m.group(4):
            try:
                step = int(m.group(4))
            except:
                return ret_val(E_ERROR, "match tag invalid")
        else:
            step = 1
    else:
        return ret_val(E_ERROR, "match tag not found")

    pywikibot.output(djvuname + " " + number + " " + str(step))

    try:
        number = int(number)
    except:
        return ret_val(E_ERROR, "illformed __MATCH__: no page number ?")

    cache = lifo_cache.LifoCache('match_and_split_text_layer')
    cached_text = align.get_djvu(cache, mysite, djvuname, True)
    if not cached_text:
        return ret_val(E_ERROR, "unable to read djvu, if the File: exists, please retry")

    data = align.do_match(text, cached_text, djvuname, number, verbose=False, prefix=prefix, step=step)
    if not data['error']:
        safe_put(page, head + data['text'], user + ": match")
        data['text'] = ""
    return data
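# The non-R2Mondes branch of do_match() above looks for a header of the form
# ==__MATCH__:[[<Page prefix>:<file>/<page>]]== with an optional |step=
# parameter.  A small self-contained illustration of the tag the regexp
# accepts (the prefix is hard-coded to u'Page' here purely for the example):
def _match_tag_example():
    import re
    sample = u"==__MATCH__:[[Page:Foo.djvu/12|step=2]]=="
    p = re.compile(u"==__MATCH__:\[\[Page:(.*?)/(\d+)(\|step=(\d+))?\]\]==")
    m = re.search(p, sample)
    # group(1) is the djvu file name, group(2) the first page to match,
    # group(4) the optional step between pages (None when |step= is absent).
    return m.group(1), m.group(2), m.group(4)   # (u'Foo.djvu', u'12', u'2')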
def do_split(mysite, rootname, user, codelang):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no Page: prefix")
    prefix = prefix.decode('utf-8')

    try:
        page = pywikibot.Page(mysite, rootname)
        text = page.get()
    except:
        return ret_val(E_ERROR, "unable to read page")

    p = re.compile('==\[\[(' + prefix + ':[^=]+)\]\]==\n')
    bl = p.split(text)

    titles = '\n'
    group = ""
    fromsection = ""
    tosection = ""
    fromsection_page = tosection_page = None
    for i in range(len(bl) / 2):
        title = bl[i * 2 + 1]
        content = bl[i * 2 + 2]

        #for illegalChar in ['#', '<', '>', '[', ']', '|', '{', '}', '\n', u'\ufffd']:
        #    if illegalChar in title:
        #        title = title.replace(illegalChar, '_')

        #always NOPREFIX
        pagetitle = title
        content = content.rstrip("\n ")

        pl = pywikibot.Page(mysite, pagetitle)
        m = re.match(prefix + ':(.*?)/(\d+)', pagetitle)
        if m:
            filename = m.group(1)
            pagenum = int(m.group(2))
            if not group:
                group = filename
                pfrom = pagenum
                pto = pfrom
            else:
                if filename != group:
                    titles = titles + "<pages index=\"%s\" from=%d to=%d />\n" % (group, pfrom, pto)
                    group = filename
                    pfrom = pagenum
                    pto = pfrom
                elif pagenum != pto + 1:
                    titles = titles + "<pages index=\"%s\" from=%d to=%d />\n" % (group, pfrom, pto)
                    group = filename
                    pfrom = pagenum
                    pto = pfrom
                else:
                    pto = pagenum
        else:
            if group:
                titles = titles + "<pages index=\"%s\" from=%d to=%d />\n" % (group, pfrom, pto)
                group = False
            titles = titles + "{{" + pagetitle + "}}\n"

        #prepend br
        if content and content[0] == '\n':
            content = '<nowiki />\n' + content

        if pl.exists():
            old_text = pl.get()
            refs = pl.getReferences(onlyTemplateInclusion=True)
            numrefs = 0
            for ref in refs:
                numrefs += 1
            #first and last pages : check if they are transcluded
            if numrefs > 0:
                m = re.match("<noinclude>(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
                             old_text, re.MULTILINE | re.DOTALL)
                if m and (i == 0 or i == (len(bl) / 2 - 1)):
                    print "creating sections"
                    old_text = m.group(2)
                    if i == 0:
                        first_part = old_text
                        second_part = content
                        fromsection = "fromsection=s2 "
                        fromsection_page = ref
                    else:
                        first_part = content
                        second_part = old_text
                        tosection = "tosection=s1 "
                        tosection_page = ref
                    content = "<noinclude>" + m.group(1) + "</noinclude><section begin=s1/>" + first_part + "<section end=s1/>\n----\n" \
                        + "<section begin=s2/>" + second_part + "<section end=s2/><noinclude>" + m.group(3) + "</noinclude>"
            else:
                m = re.match("<noinclude><pagequality level=\"1\" user=\"(.*?)\" />(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
                             old_text, re.MULTILINE | re.DOTALL)
                if m:
                    print "ok, quality 1, first try"
                    content = "<noinclude><pagequality level=\"1\" user=\"" + m.group(1) + "\" />" + m.group(2) \
                        + "</noinclude>" + content + "<noinclude>" + m.group(4) + "</noinclude>"
                m2 = re.match("<noinclude>\{\{PageQuality\|1\|(.*?)\}\}(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
                              old_text, re.MULTILINE | re.DOTALL)
                if m2:
                    # FIXME: shouldn't use an hardcoded name here
                    print "ok, quality 1, second try"
                    content = "<noinclude><pagequality level=\"1\" user=\"Phe-bot\" />" + m2.group(2) \
                        + "</noinclude>" + content + "<noinclude>" + m2.group(4) + "</noinclude>"
        else:
            header = u'<noinclude><pagequality level="1" user="******" /><div class="pagetext">\n\n\n</noinclude>'
            footer = u'<noinclude>\n<references/></div></noinclude>'
            content = header + content + footer

        safe_put(pl, content, user + ": split")

    if group:
        titles = titles + "<pages index=\"%s\" from=%d to=%d %s%s/>\n" % (group, pfrom, pto, fromsection, tosection)

    if fromsection and fromsection_page:
        rtext = fromsection_page.get()
        m = re.search("<pages index=\"(.*?)\" from=(.*?) to=(.*?) (fromsection=s2 |)/>", rtext)
        if m and m.group(1) == group:
            rtext = rtext.replace(m.group(0), m.group(0)[:-2] + "tosection=s1 />")
            print "new rtext"
            safe_put(fromsection_page, rtext, user + ": split")

    if tosection and tosection_page:
        rtext = tosection_page.get()
        m = re.search("<pages index=\"(.*?)\" from=(.*?) to=(.*?) (tosection=s1 |)/>", rtext)
        if m and m.group(1) == group:
            rtext = rtext.replace(m.group(0), m.group(0)[:-2] + "fromsection=s2 />")
            print "new rtext"
            safe_put(tosection_page, rtext, user + ": split")

    header = bl[0]
    safe_put(page, header + titles, user + ": split")

    return ret_val(E_OK, "")
def do_split(mysite, rootname, user, codelang):
    prefix = page_prefixes['wikisource'].get(codelang)
    if not prefix:
        return ret_val(E_ERROR, "no Page: prefix")
    prefix = prefix.decode('utf-8')

    try:
        page = pywikibot.Page(mysite, rootname)
        text = page.get()
    except:
        return ret_val(E_ERROR, "unable to read page")

    p = re.compile('==\[\[(' + prefix + ':[^=]+)\]\]==\n')
    bl = p.split(text)

    titles = '\n'
    group = ""
    fromsection = ""
    tosection = ""
    fromsection_page = tosection_page = None
    for i in range(len(bl) / 2):
        title = bl[i * 2 + 1]
        content = bl[i * 2 + 2]

        #for illegalChar in ['#', '<', '>', '[', ']', '|', '{', '}', '\n', u'\ufffd']:
        #    if illegalChar in title:
        #        title = title.replace(illegalChar, '_')

        #always NOPREFIX
        pagetitle = title
        content = content.rstrip("\n ")

        pl = pywikibot.Page(mysite, pagetitle)
        m = re.match(prefix + ':(.*?)/(\d+)', pagetitle)
        if m:
            filename = m.group(1)
            pagenum = int(m.group(2))
            if not group:
                group = filename
                pfrom = pagenum
                pto = pfrom
            else:
                if filename != group:
                    titles = titles + "<pages index=\"%s\" from=%d to=%d />\n" % (group, pfrom, pto)
                    group = filename
                    pfrom = pagenum
                    pto = pfrom
                elif pagenum != pto + 1:
                    titles = titles + "<pages index=\"%s\" from=%d to=%d />\n" % (group, pfrom, pto)
                    group = filename
                    pfrom = pagenum
                    pto = pfrom
                else:
                    pto = pagenum
        else:
            if group:
                titles = titles + "<pages index=\"%s\" from=%d to=%d />\n" % (group, pfrom, pto)
                group = False
            titles = titles + "{{" + pagetitle + "}}\n"

        #prepend br
        if content and content[0] == '\n':
            content = '<nowiki />\n' + content

        if pl.exists():
            old_text = pl.get()
            refs = pl.getReferences(onlyTemplateInclusion=True)
            numrefs = 0
            for ref in refs:
                numrefs += 1
            #first and last pages : check if they are transcluded
            if numrefs > 0:
                m = re.match("<noinclude>(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
                             old_text, re.MULTILINE | re.DOTALL)
                if m and (i == 0 or i == (len(bl) / 2 - 1)):
                    print "creating sections"
                    old_text = m.group(2)
                    if i == 0:
                        first_part = old_text
                        second_part = content
                        fromsection = "fromsection=s2 "
                        fromsection_page = ref
                    else:
                        first_part = content
                        second_part = old_text
                        tosection = "tosection=s1 "
                        tosection_page = ref
                    content = "<noinclude>" + m.group(1) + "</noinclude><section begin=s1/>" + first_part + "<section end=s1/>\n----\n" \
                        + "<section begin=s2/>" + second_part + "<section end=s2/><noinclude>" + m.group(3) + "</noinclude>"
            else:
                m = re.match("<noinclude><pagequality level=\"1\" user=\"(.*?)\" />(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
                             old_text, re.MULTILINE | re.DOTALL)
                if m:
                    print "ok, quality 1, first try"
                    content = "<noinclude><pagequality level=\"1\" user=\"" + m.group(1) + "\" />" + m.group(2) \
                        + "</noinclude>" + content + "<noinclude>" + m.group(4) + "</noinclude>"
                m2 = re.match("<noinclude>\{\{PageQuality\|1\|(.*?)\}\}(.*?)</noinclude>(.*)<noinclude>(.*?)</noinclude>",
                              old_text, re.MULTILINE | re.DOTALL)
                if m2:
                    # FIXME: shouldn't use an hardcoded name here
                    print "ok, quality 1, second try"
                    content = "<noinclude><pagequality level=\"1\" user=\"Phe-bot\" />" + m2.group(2) \
                        + "</noinclude>" + content + "<noinclude>" + m2.group(4) + "</noinclude>"
        else:
            header = u'<noinclude><pagequality level="1" user="******" />\n\n\n</noinclude>'
            footer = u'<noinclude>\n<references/></div></noinclude>'
            content = header + content + footer

        do_put = True
        if pl.exists():
            if hasattr(pl, '_quality') and pl._quality != 1:
                print "quality != 1, not saved"
                do_put = False
            else:
                print "can't get quality level"
        if do_put:
            safe_put(pl, content, user + ": split")

    if group:
        titles = titles + "<pages index=\"%s\" from=%d to=%d %s%s/>\n" % (group, pfrom, pto, fromsection, tosection)

    if fromsection and fromsection_page:
        rtext = fromsection_page.get()
        m = re.search("<pages index=\"(.*?)\" from=(.*?) to=(.*?) (fromsection=s2 |)/>", rtext)
        if m and m.group(1) == group:
            rtext = rtext.replace(m.group(0), m.group(0)[:-2] + "tosection=s1 />")
            print "new rtext"
            safe_put(fromsection_page, rtext, user + ": split")

    if tosection and tosection_page:
        rtext = tosection_page.get()
        m = re.search("<pages index=\"(.*?)\" from=(.*?) to=(.*?) (tosection=s1 |)/>", rtext)
        if m and m.group(1) == group:
            rtext = rtext.replace(m.group(0), m.group(0)[:-2] + "fromsection=s2 />")
            print "new rtext"
            safe_put(tosection_page, rtext, user + ": split")

    header = bl[0]
    safe_put(page, header + titles, user + ": split")

    return ret_val(E_OK, "")