def process_page(page, index, parsed):
    """Add ``lang=en`` to request templates that carry no language code.

    The page text is split on level-2 ``==...==`` headers and every chunk is
    parsed on its own so that the enclosing section name is known.  A
    template whose name is in ``request_templates`` and whose ``lang``
    parameter is missing or blank gets ``lang=en`` inserted as its first
    parameter, unless it sits in a named section other than "English", in
    which case a warning is logged and the template is left alone.

    Args:
        page: page object; ``title()`` and ``text`` are read from it.
        index: page number, used only in log messages.
        parsed: not read; the text is re-parsed section by section.

    Returns:
        ``(None, None)`` when the page is to be ignored, otherwise
        ``(newtext, notes)`` where ``notes`` lists the changes made.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("Skipping ignored page")
        return None, None

    def hack_templates(section_code, langname):
        # Mutates the templates of one parsed section in place.
        for t in section_code.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            if tn not in request_templates:
                continue
            if getparam(t, "lang"):
                continue
            if langname and langname != "English":
                pagemsg("WARNING: Would default to English but in %s section, skipping: %s" % (langname, origt))
                continue
            notes.append("add lang=en for {{%s}} with missing lang code" % tn)
            # A blank lang= may still be present; drop it so it is not
            # carried over below.
            rmparam(t, "lang")
            # Remember the existing parameters, then rebuild the template
            # with lang= first and the rest in their original order.
            saved = [(unicode(p.name), p.value, p.showkey) for p in t.params]
            del t.params[:]
            # Keep one-parameter-per-line layout if the name ends a line.
            newline = "\n" if "\n" in unicode(t.name) else ""
            t.add("lang", "en" + newline, preserve_spacing=False)
            for name, value, showkey in saved:
                t.add(name, value, showkey=showkey, preserve_spacing=False)
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    pagemsg("Processing")

    text = unicode(page.text)
    notes = []

    # Even indices hold section bodies, odd indices the captured headers.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
    for j in xrange(0, len(sections), 2):
        langname = None
        if j > 0:
            m = re.search("^==(.*)==\n$", sections[j - 1])
            assert m
            langname = m.group(1)
        section_code = blib.parse_text(sections[j])
        hack_templates(section_code, langname)
        sections[j] = unicode(section_code)

    return "".join(sections), notes
def process_page(page, index, parsed):
    """Rewrite specialised inflection templates as ``{{inflection of}}``.

    Every template whose name is a key of the module-level ``templates``
    mapping is renamed to ``inflection of``; its language, term, alternative
    display form and gloss are carried over and the tag list stored in
    ``templates[name]`` is appended as positional parameters starting at 4.
    The language is read from ``lang=`` when present (term/alt/gloss are
    then 1/2/3), otherwise from ``1=`` (term/alt/gloss are 2/3/4).

    Args:
        page: page object; ``title()`` and ``text`` are read from it.
        index: page number, used only in log messages.
        parsed: not read; the page text is re-parsed here.

    Returns:
        ``(None, None)`` when the page is ignored or when a matching
        template has a parameter that is not understood (the whole page is
        then skipped), otherwise ``(newtext, notes)``.
    """
    pagetitle = unicode(page.title())
    text = unicode(page.text)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn in templates:
            infl_params = templates[tn]
            lang = getparam(t, "lang")
            if lang:
                has_lang = True
                term = getparam(t, "1")
                alt = getparam(t, "2")
                gloss = getparam(t, "3")
            else:
                has_lang = False
                lang = getparam(t, "1")
                term = getparam(t, "2")
                alt = getparam(t, "3")
                gloss = getparam(t, "4")

            # Refuse to touch the page if any parameter is not one of the
            # ones carried over above.
            for param in t.params:
                pname = unicode(param.name).strip()
                if pname in ["lang", "1", "2", "3"] or (pname == "4" and not has_lang):
                    continue
                pagemsg("WARNING: Unrecognized param %s, skipping" % pname)
                return None, None

            # Erase all params and put back the new ones.
            del t.params[:]
            blib.set_template_name(t, "inflection of")
            t.add("1", lang)
            t.add("2", term)
            t.add("3", alt)
            # The loop variable must not be called `index`: pagemsg() closes
            # over the `index` argument, and rebinding it here would make
            # every later log line report a tag position instead of the page
            # number.
            for tag_param, tag in enumerate(infl_params, 4):
                t.add(str(tag_param), tag)
            if gloss:
                t.add("t", gloss)

        if origt != unicode(t):
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("replace {{%s}} with {{inflection of}}" % tn)

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Restructure ``{{doublet}}`` templates for the numbered-term syntax.

    For each ``{{doublet}}`` template, empty parameters are dropped,
    positional 3 and 4 become ``alt1`` and ``t1``, and the per-term
    properties (``t``, ``gloss``, ``tr``, ...) get a ``1`` suffix.  ``1``,
    ``2``, ``notext``, ``nocap`` and ``nocat`` are kept under their own
    names.  A template with any other parameter is logged and left
    untouched.

    Args:
        page: page object; ``title()`` and ``text`` are read from it.
        index: page number, used only in log messages.
        parsed: not read; the page text is re-parsed here.

    Returns:
        ``(None, None)`` when the page is ignored, otherwise
        ``(newtext, notes)``.
    """
    pagetitle = unicode(page.title())
    text = unicode(page.text)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    # Positional parameters that turn into explicitly named ones.
    positional_renames = {"3": "alt1", "4": "t1"}
    # Per-term properties that receive a "1" suffix.
    suffixed = ["t", "gloss", "tr", "ts", "pos", "lit", "alt", "sc", "id", "g"]
    # Parameters copied through unchanged.
    unchanged = ["1", "2", "notext", "nocap", "nocat"]

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "doublet":
            rebuilt = []
            for param in t.params:
                name = unicode(param.name).strip()
                value = unicode(param.value).strip()
                keyed = param.showkey
                if not value:
                    continue
                if name in positional_renames:
                    name = positional_renames[name]
                    keyed = True
                elif name in suffixed:
                    name = name + "1"
                elif name not in unchanged:
                    pagemsg("WARNING: Unrecognized param %s=%s in %s, skipping" % (name, value, origt))
                    break
                rebuilt.append((name, value, keyed))
            else:
                # Reached only when no parameter was rejected: replace the
                # parameter list wholesale.
                del t.params[:]
                for name, value, keyed in rebuilt:
                    t.add(name, value, showkey=keyed, preserve_spacing=False)

        if origt != unicode(t):
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("restructure {{doublet}} for new syntax")

    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Log the titles of all level-2 ``==...==`` headers found in ``text``.

    Args:
        index: page number, used only in the log message.
        pagetitle: title of the page; ignored pages produce no output.
        text: wikitext of the page.

    Returns:
        None; the result is a single ``Languages = a,b,c`` log line.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        return

    # The capturing group makes re.split() return the headers themselves at
    # the odd indices.
    sections = re.split("(^==[^=\n]+==\n)", text, 0, re.M)
    langs = [
        re.search("^==(.*)==$", header).group(1)
        for header in sections[1::2]
    ]
    pagemsg("Languages = %s" % ",".join(langs))
def process_text_on_page(pagetitle, index, text):
    """Tally the form descriptions used by ``{{form of}}`` templates.

    For every ``{{form of}}`` template the form description is taken from
    ``1=`` when ``lang=`` is present and from ``2=`` otherwise, and the
    matching counter in the module-level ``form_of_forms`` is incremented.

    Args:
        pagetitle: title of the page.
        index: page number, used only in log messages.
        text: wikitext of the page.

    Returns:
        ``(None, None)`` when the page is ignored; otherwise falls off the
        end and returns None.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        if tname(t) != "form of":
            continue
        # With lang= the positional parameters shift down by one.
        form_param = "1" if getparam(t, "lang") else "2"
        form_of_forms[getparam(t, form_param)] += 1
def process_text_on_page(index, pagetitle, text):
    """Report subsections where the same inflection template sits on two
    consecutive definition lines and could therefore be combined.

    The text is split on headers of any level; for each body following a
    header, and each name in the module-level ``inflection_of_templates``,
    a message is logged if two adjacent lines starting with ``#``/``*``
    markers both open with ``{{<name>``.

    Args:
        index: page number, used only in log messages.
        pagetitle: title of the page; ignored pages produce no output.
        text: wikitext of the page.

    Returns:
        None.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        return

    # Cheap pre-filter: skip pages mentioning none of the template names.
    if all(x not in text for x in inflection_of_templates):
        return

    # Template names are literal strings (the pre-filter above treats them
    # as such), so escape them before embedding them in a regex; build the
    # patterns once rather than once per subsection.
    patterns = []
    for template in inflection_of_templates:
        escaped = re.escape(template)
        patterns.append((template, re.compile(
            r"^[#*]+ \{\{%s.*\n[#*]+ \{\{%s.*" % (escaped, escaped), re.M)))

    # Even indices hold bodies, odd indices the captured headers; index 0
    # (text before the first header) is not examined.
    subsections = re.split("(^==+[^=\n]+==+\n)", text, 0, re.M)
    for j in xrange(2, len(subsections), 2):
        for template, pattern in patterns:
            if pattern.search(subsections[j]):
                pagemsg("Found subsection with combinable %s:\n%s" % (template, subsections[j].strip()))
def process_text_on_page(index, pagetitle, text):
    """Log ``{{inflection of}}`` templates whose tags contain joiners.

    A template is reported (once) when any of its inflection-tag parameters
    is one of ``and``, ``or``, ``;`` or the comment-continued ``;`` form, or
    contains a ``/`` or ``,``.  Tags start at positional parameter 3 when
    ``lang=`` is present and at 4 otherwise.

    Args:
        index: page number, used only in log messages.
        pagetitle: title of the page; ignored pages produce no output.
        text: wikitext of the page.

    Returns:
        None.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        return

    if "inflection of" not in text:
        return

    joiners = ["and", "or", ";", ";<!--\n-->"]

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) not in ["inflection of"]:
            continue
        # The term occupies 1 (with lang=) or 2 (without); the alternative
        # form follows it, and everything after that is a tag.
        first_tag_param = 3 if getparam(t, "lang") else 4
        for param in t.params:
            pname = unicode(param.name).strip()
            pval = unicode(param.value).strip()
            if not re.search("^[0-9]+$", pname):
                continue
            if int(pname) < first_tag_param:
                continue
            if pval in joiners or "/" in pval or "," in pval:
                pagemsg("Found template: %s" % origt)
                break

    return
def process_page(page, index, parsed):
    """Merge runs of adjacent ``{{doublet}}``/``{{m}}``/``{{l}}`` templates.

    A ``{{doublet|...}}`` followed on the same line by one or more
    ``{{m|...}}``, ``{{l|...}}`` or ``{{doublet|...}}`` templates separated
    by commas and/or "and" is collapsed into a single ``{{doublet}}`` whose
    later terms are numbered 2, 3, ...  A run is left untouched (with a
    warning) when the first template already lists several terms, when a
    continuation template uses a different language, or when a continuation
    template has a parameter that is not understood.

    Args:
        page: page object; ``title()`` and ``text`` are read from it.
        index: page number, used only in log messages.
        parsed: not read; matching is done on the raw text.

    Returns:
        ``(None, None)`` when the page is ignored, otherwise
        ``(newtext, notes)``.
    """
    pagetitle = unicode(page.title())
    text = unicode(page.text)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    # Per-term properties, in unsuffixed and "1"-suffixed spelling.
    term_props = ["t", "gloss", "tr", "ts", "pos", "lit", "alt", "sc", "id", "g"]
    term_props_1 = ["t1", "gloss1", "tr1", "ts1", "pos1", "lit1", "alt1", "sc1", "id1", "g1"]
    # Template-wide flags, kept only from the first template.
    flags = ["notext", "nocap", "nocat"]

    def combine_doublets(m):
        whole = m.group(0)
        first = blib.parse_text(m.group(1))
        rest = blib.parse_text(m.group(2))
        t1 = list(first.filter_templates())[0]
        if any(getparam(t1, p) for p in ["3", "4", "alt2", "alt3"]):
            pagemsg("WARNING: Can't combine %s, first template already has multiple terms" % whole)
            return whole

        lang = getparam(t1, "1")
        # Number of the term contributed by the continuation template
        # currently being folded in; its term goes in positional
        # next_index + 1 because positional 1 holds the language.
        next_index = 2
        for t in rest.filter_templates(recursive=False):
            tlang = getparam(t, "1")
            if lang != tlang:
                pagemsg("WARNING: Lang %s in continuation template %s not same as lang %s in first template %s" % (
                    tlang, unicode(t), lang, unicode(t1)))
                return whole
            for param in t.params:
                pname = unicode(param.name).strip()
                pval = unicode(param.value).strip()
                if not pval:
                    continue
                if pname == "2":
                    target = str(next_index + 1)
                elif pname == "3":
                    target = "alt%s" % next_index
                elif pname == "4":
                    target = "t%s" % next_index
                elif pname in term_props:
                    target = "%s%s" % (pname, next_index)
                elif pname in term_props_1:
                    target = "%s%s" % (pname[:-1], next_index)
                elif pname == "1" or pname in flags:
                    # Language already checked; flags of continuation
                    # templates are not carried over.
                    continue
                else:
                    pagemsg("WARNING: Unrecognized param %s=%s in %s, skipping" % (pname, pval, unicode(t)))
                    return whole
                t1.add(target, pval)
            next_index += 1

        # Re-add the first template's flags so they end up after the terms.
        for flag in flags:
            val = getparam(t1, flag)
            rmparam(t1, flag)
            if val:
                t1.add(flag, val)

        combined = unicode(t1)
        pagemsg("Replaced %s with %s" % (whole, combined))
        return combined

    # Group 1: the leading {{doublet}} (one level of nested templates
    # allowed).  Group 2: one or more separator + continuation template.
    doublet_run = (
        r"(\{\{doublet\|(?:[^{}\n]|\{\{[^{}\n]*\}\})*\}\})"
        r"((?:(?:, *|,? *and *)\{\{(?:m|l|doublet)\|(?:[^{}\n]|\{\{[^{}\n]*\}\})*\}\})+)"
    )
    newtext = re.sub(doublet_run, combine_doublets, text)
    if newtext != text:
        notes.append("combine adjacent doublets")
        text = newtext

    return text, notes
def process_page(page, index, parsed):
    """Give quote templates a ``lang=`` parameter.

    For every template whose name is in the module-level ``quote_templates``
    and that has no (non-blank) ``lang=``:

    * an existing ``language=X`` is converted to ``lang=X``;
    * otherwise the code of the enclosing language section is used, except
      in Etymology/Pronunciation subsections, in sections whose language is
      not recognised, and for putative English quotes that carry a
      translation -- those are logged and skipped;
    * in a Translingual (``mul``) section the inferred language is ``en``
      and ``termlang=mul`` is added.

    On ordinary pages the text is split into language sections and then
    subsections.  On pages whose title starts with "Citations" only the
    level-2 split is done and the language code is additionally taken from
    ``{{citation}}``/``{{citations}}`` templates and carried from one
    section to the next.

    Args:
        page: page object; ``title()`` and ``text`` are read from it.
        index: page number, used only in log messages.
        parsed: not read; the text is re-parsed piece by piece.

    Returns:
        ``(None, None)`` when the page is ignored, otherwise
        ``(newtext, notes)``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("Skipping ignored page")
        return None, None

    def hack_templates(parsed, langname, subsectitle, langnamecode=None, is_citation=False):
        # Returns the language code in effect at the end of the section so
        # that citation pages can carry it into the next section.
        if langname in blib.languages_byCanonicalName:
            langnamecode = blib.languages_byCanonicalName[langname]["code"]
        elif not is_citation:
            langnamecode = None

        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            if tn in ["citation", "citations"] and is_citation:
                langnamecode = getparam(t, "lang")
            elif tn in quote_templates:
                if getparam(t, "lang"):
                    continue
                lang = getparam(t, "language")
                if lang:
                    notes.append("Convert language=%s to lang=%s in %s" % (lang, lang, tn))
                    # Use the value the template itself specified, not the
                    # section's code (which may differ or be unknown).
                    lang_to_add = lang
                else:
                    if subsectitle.startswith("Etymology") or subsectitle.startswith("Pronunciation"):
                        pagemsg("WARNING: Found template in %s section for language %s, might be different language, skipping: %s" % (
                            subsectitle, langname, origt))
                        continue
                    if not langnamecode:
                        pagemsg("WARNING: Unrecognized language %s, unable to add language to %s" % (langname, tn))
                        continue
                    if langnamecode == "en" and (getparam(t, "translation") or getparam(t, "t")):
                        pagemsg("WARNING: Translation section in putative English quote, skipping: %s" % origt)
                        continue
                    if langnamecode == "mul":
                        notes.append("infer lang=en for %s in Translingual section and add termlang=mul" % tn)
                        lang_to_add = "en"
                    else:
                        notes.append("infer lang=%s for %s based on section it's in" % (langnamecode, tn))
                        lang_to_add = langnamecode
                # Decide termlang from the section code without overwriting
                # it: langnamecode must stay "mul" for the remaining
                # templates of the section and for the value returned to
                # the caller.
                if langnamecode == "mul" and lang_to_add != "mul":
                    termlang = "mul"
                else:
                    termlang = None

                rmparam(t, "language")
                # A blank lang= would otherwise be saved below and, when
                # re-added, overwrite the value being inserted.
                rmparam(t, "lang")
                # Fetch all params.
                params = []
                for param in t.params:
                    pname = unicode(param.name)
                    params.append((pname, param.value, param.showkey))
                # Erase all params.
                del t.params[:]
                # Put lang (and termlang) first, keeping a
                # one-parameter-per-line layout if the template uses one.
                newline = "\n" if "\n" in unicode(t.name) else ""
                t.add("lang", lang_to_add + newline, preserve_spacing=False)
                if termlang:
                    t.add("termlang", termlang + newline, preserve_spacing=False)
                # Put remaining parameters in order.
                for name, value, showkey in params:
                    t.add(name, value, showkey=showkey, preserve_spacing=False)
                pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
        return langnamecode

    pagemsg("Processing")

    text = unicode(page.text)
    notes = []

    # Even indices hold section bodies, odd indices the captured headers.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
    if not pagetitle.startswith("Citations"):
        for j in xrange(2, len(sections), 2):
            m = re.search("^==(.*)==\n$", sections[j - 1])
            assert m
            langname = m.group(1)
            subsections = re.split("(^==.*==\n)", sections[j], 0, re.M)
            for k in xrange(2, len(subsections), 2):
                m = re.search("^===*(.*?)=*==\n$", subsections[k - 1])
                assert m
                subsectitle = m.group(1)
                parsed = blib.parse_text(subsections[k])
                hack_templates(parsed, langname, subsectitle)
                subsections[k] = unicode(parsed)
            sections[j] = "".join(subsections)
    else:
        # Citations page: no subsection split; the language code found in
        # one section is handed on to the next.
        langnamecode = None
        for j in xrange(0, len(sections), 2):
            if j == 0:
                langname = "Unknown"
            else:
                m = re.search("^==(.*)==\n$", sections[j - 1])
                assert m
                langname = m.group(1)
            parsed = blib.parse_text(sections[j])
            langnamecode = hack_templates(parsed, langname, "Unknown",
                                          langnamecode=langnamecode, is_citation=True)
            sections[j] = unicode(parsed)

    newtext = "".join(sections)
    return newtext, notes
def process_page(page, index, parsed):
    """Replace ``nocat=`` with ``termlang=mul`` in Translingual quotes.

    Only the "Translingual" language section is examined.  Within its
    subsections, every template from the module-level ``quote_templates``
    that has a non-empty ``nocat=`` and ``lang=en`` is rebuilt with
    ``lang`` and ``termlang=mul`` as its first parameters, followed by the
    original parameters minus ``nocat``.

    Args:
        page: page object; ``title()`` and ``text`` are read from it.
        index: page number, used only in log messages.
        parsed: not read; the text is re-parsed subsection by subsection.

    Returns:
        ``(None, "")`` when the page is ignored, otherwise
        ``(newtext, notes)``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("Skipping ignored page")
        return None, ""

    def hack_templates(parsed, subsectitle):
        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            if tn not in quote_templates:
                continue
            if not getparam(t, "nocat") or getparam(t, "lang").strip() != "en":
                continue
            notes.append("convert nocat=1 in lang=en Translingual section to termlang=mul")
            # Keep everything except nocat=; the original lang= is kept too
            # and re-added below.
            kept = []
            for param in t.params:
                pname = unicode(param.name)
                if pname.strip() == "nocat":
                    continue
                kept.append((pname, param.value, param.showkey))
            del t.params[:]
            # lang and termlang go first; preserve a one-parameter-per-line
            # layout if the template name ends a line.
            newline = "\n" if "\n" in unicode(t.name) else ""
            t.add("lang", "en" + newline, preserve_spacing=False)
            t.add("termlang", "mul" + newline, preserve_spacing=False)
            for name, value, showkey in kept:
                t.add(name, value, showkey=showkey, preserve_spacing=False)
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    pagemsg("Processing")

    text = unicode(page.text)
    notes = []

    # Even indices hold section bodies, odd indices the captured headers.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
    for j in xrange(2, len(sections), 2):
        m = re.search("^==(.*)==\n$", sections[j - 1])
        assert m
        if m.group(1) != "Translingual":
            continue
        subsections = re.split("(^==.*==\n)", sections[j], 0, re.M)
        for k in xrange(2, len(subsections), 2):
            m = re.search("^===*(.*?)=*==\n$", subsections[k - 1])
            assert m
            subsectitle = m.group(1)
            parsed = blib.parse_text(subsections[k])
            hack_templates(parsed, subsectitle)
            subsections[k] = unicode(parsed)
        sections[j] = "".join(subsections)

    return "".join(sections), notes
def process_page(page, index, parsed, lang_in_1):
    """Infer a missing language parameter from the enclosing section.

    Templates named in the module-level ``templates_to_process`` that lack
    a language get the code of their language section, written to ``1=``
    when ``lang_in_1`` is true and to ``lang=`` (as the first parameter)
    otherwise.  Templates named in ``templates_to_rename`` are renamed to
    the mapped name.  On pages whose title starts with "Citations" the code
    may also come from ``{{citation}}``/``{{citations}}`` templates and is
    carried from one section to the next.

    Args:
        page: page object; ``title()`` and ``text`` are read from it.
        index: page number, used only in log messages.
        parsed: not read; the text is re-parsed section by section.
        lang_in_1: if true the language lives in ``1=``, else in ``lang=``.

    Returns:
        ``(None, None)`` when the page is ignored, otherwise
        ``(newtext, notes)``.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("Skipping ignored page")
        return None, None

    langparam = "1" if lang_in_1 else "lang"

    def hack_templates(parsed, langname, langnamecode=None, is_citation=False):
        # Returns the language code in effect at the end of the section so
        # that citation pages can carry it into the next section.
        if langname in blib.languages_byCanonicalName:
            langnamecode = blib.languages_byCanonicalName[langname]["code"]
        elif not is_citation:
            langnamecode = None

        for t in parsed.filter_templates():
            origt = unicode(t)
            tn = tname(t)
            if tn in ["citation", "citations"] and is_citation:
                langnamecode = getparam(t, "lang") or getparam(t, "1")
            if tn in templates_to_process:
                if getparam(t, langparam):
                    pass
                elif not langnamecode:
                    pagemsg("WARNING: Unrecognized language %s, unable to add language to %s" % (langname, origt))
                else:
                    notes.append("infer %s=%s for {{%s}} based on section it's in" % (langparam, langnamecode, tn))
                    # Keep a one-parameter-per-line layout if the template
                    # name ends a line.
                    newline = "\n" if "\n" in unicode(t.name) else ""
                    if langparam == "1":
                        if t.has("lang"):
                            pagemsg("WARNING: Template has lang=, removing: %s" % origt)
                            notes.append("remove lang= from {{%s}}" % tn)
                            rmparam(t, "lang")
                        t.add(langparam, langnamecode + newline, preserve_spacing=False)
                    else:
                        # A blank lang= would otherwise be saved below and,
                        # when re-added, overwrite the inferred value.
                        rmparam(t, langparam)
                        # Fetch all params.
                        params = []
                        for param in t.params:
                            pname = unicode(param.name)
                            params.append((pname, param.value, param.showkey))
                        # Erase all params.
                        del t.params[:]
                        t.add(langparam, langnamecode + newline, preserve_spacing=False)
                        # Put remaining parameters in order.
                        for name, value, showkey in params:
                            t.add(name, value, showkey=showkey, preserve_spacing=False)
            if tn in templates_to_rename:
                blib.set_template_name(t, templates_to_rename[tn])
                notes.append("rename {{%s}} to {{%s}}" % (tn, templates_to_rename[tn]))
            newt = unicode(t)
            if newt != origt:
                pagemsg("Replaced <%s> with <%s>" % (origt, newt))
        return langnamecode

    pagemsg("Processing")

    text = unicode(page.text)
    notes = []

    # Even indices hold section bodies, odd indices the captured headers.
    sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
    if not pagetitle.startswith("Citations"):
        for j in xrange(2, len(sections), 2):
            m = re.search("^==(.*)==\n$", sections[j - 1])
            assert m
            langname = m.group(1)
            parsed = blib.parse_text(sections[j])
            hack_templates(parsed, langname)
            sections[j] = unicode(parsed)
    else:
        # Citations page: the language code found in one section is handed
        # on to the next.
        langnamecode = None
        for j in xrange(0, len(sections), 2):
            if j == 0:
                langname = "Unknown"
            else:
                m = re.search("^==(.*)==\n$", sections[j - 1])
                assert m
                langname = m.group(1)
            parsed = blib.parse_text(sections[j])
            langnamecode = hack_templates(parsed, langname,
                                          langnamecode=langnamecode, is_citation=True)
            sections[j] = unicode(parsed)

    newtext = "".join(sections)
    return newtext, notes
def process_text_on_page(pagetitle, index, text):
    """Normalise ``{{inflection of}}`` templates to the lang-in-1 layout.

    Each ``{{inflection of}}`` template is rebuilt as ``1=`` language,
    ``2=`` term, optional ``tr=``, ``3=`` alternative form, then the
    non-empty inflection tags renumbered from 4, then any remaining named
    parameters.  The language comes from ``lang=`` when present, otherwise
    from ``1=``.  For Polish (``pl``) the tag ``other`` is replaced by
    ``nv``.

    Args:
        pagetitle: title of the page.
        index: page number, used only in log messages.
        text: wikitext of the page.

    Returns:
        ``(None, None)`` when the page is ignored, otherwise
        ``(newtext, notes)``.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    if blib.page_should_be_ignored(pagetitle):
        pagemsg("WARNING: Page should be ignored")
        return None, None

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn == "inflection of":
            extras = []
            if getparam(t, "lang"):
                lang = getparam(t, "lang")
                term_param = 1
                notes.append("moved lang= in {{%s}} to 1=" % tn)
            else:
                lang = getparam(t, "1")
                term_param = 2
            tr = getparam(t, "tr")
            term = getparam(t, str(term_param))
            alt = getparam(t, "alt") or getparam(t, str(term_param + 1))

            # Positional parameters after term and alt are inflection tags;
            # lower-numbered ones were fetched above.  Other named
            # parameters are carried over as they are.
            first_tag_param = term_param + 2
            tags = []
            for param in t.params:
                pname = unicode(param.name).strip()
                pval = unicode(param.value).strip()
                if re.search("^[0-9]+$", pname):
                    if int(pname) < first_tag_param:
                        continue
                    if pval:
                        tags.append(pval)
                    else:
                        notes.append("removed empty tags from {{%s}}" % tn)
                elif pname not in ["lang", "tr", "alt"]:
                    extras.append((pname, pval, param.showkey))

            if lang == "pl":
                newtags = ["nv" if tag == "other" else tag for tag in tags]
                if tags != newtags:
                    notes.append("replaced 'other' with 'nv' in Polish {{%s}}" % tn)
                tags = newtags

            # Erase all params, then put back the new ones.
            del t.params[:]
            # Strip comment continuations and line breaks. Such cases
            # generally have linebreaks after semicolons as well, but we
            # remove those. (FIXME, consider preserving them.)
            t.add("1", remove_comment_continuations(lang))
            t.add("2", remove_comment_continuations(term))
            tr = remove_comment_continuations(tr)
            if tr:
                t.add("tr", tr)
            t.add("3", remove_comment_continuations(alt))
            for tag_param, tag in enumerate(tags, 4):
                t.add(str(tag_param), tag)
            for pname, pval, showkey in extras:
                t.add(pname, pval, showkey=showkey, preserve_spacing=False)

        if origt != unicode(t):
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes