def do_process_text_on_page(index, pagename, text, adj):
    """Convert old Bulgarian declension templates on a page to the new ones.

    When `adj` is true, {{bg-decl-adj}} is rewritten to {{bg-adecl}} using the
    headword taken from the preceding {{bg-adj}}; otherwise any template whose
    name starts with "bg-noun-" is rewritten to {{bg-ndecl}} using the headword
    of the preceding {{bg-noun}} or {{bg-proper noun}}.

    Returns (new page text, list of change notes), or None when the page has
    numbered Etymology/Pronunciation sections, which are not handled.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []

    if "==Etymology 1==" in text or "==Pronunciation 1==" in text:
        pagemsg("WARNING: Saw Etymology/Pronunciation 1, can't handle yet")
        return

    headword_templates = ["bg-adj"] if adj else ["bg-noun", "bg-proper noun"]
    parsed = blib.parse_text(text)
    headword = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in headword_templates:
            # Remember the most recent headword; it supplies the lemma for
            # any declension template that follows.
            headword = getparam(t, "1")
        if (tn == "bg-decl-adj" if adj else tn.startswith("bg-noun-")):
            origt = unicode(t)
            if not headword:
                pagemsg("WARNING: Saw %s without {{%s}} headword"
                        % (origt, "bg-adj" if adj else "bg-noun"))
                continue
            del t.params[:]
            t.add("1", "%s<>" % headword)
            blib.set_template_name(t, "bg-adecl" if adj else "bg-ndecl")
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("convert {{%s}} to {{%s}}" % (tn, tname(t)))

    # Serialize the modified parse tree; returning the untouched input text
    # would silently discard every replacement made above.
    return unicode(parsed), notes
def process_text_on_page(index, pagename, text):
    """Replace Latin {{head|la|POS}} templates with POS-specific templates.

    The replacement name comes from `pos_to_template`. 1= is set to the page
    name, 2= is removed and FIXME=1 is added to each converted template.
    Templates with an unrecognized part of speech, or with 3= or head=, are
    left alone with a warning.

    Returns (new page text, list of change notes).
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []
    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        if tname(t) != "head" or getparam(t, "1") != "la":
            continue
        pos = getparam(t, "2")
        if pos not in pos_to_template:
            pagemsg("WARNING: Saw unrecognized part of speech %s: %s"
                    % (pos, unicode(t)))
            continue
        if getparam(t, "3") or getparam(t, "head"):
            pagemsg("WARNING: Saw 3= or head=: %s" % unicode(t))
            continue
        before = unicode(t)
        t.add("1", pagename)
        blib.set_template_name(t, pos_to_template[pos])
        rmparam(t, "2")
        t.add("FIXME", "1")
        pagemsg("Replaced %s with %s" % (before, unicode(t)))
        notes.append("replace {{head|la|%s}} with {{%s}}" % (pos, tname(t)))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Strip "-refl" from the verb-type argument of Russian conjugation templates.

    Handles 1= of the ru-conj family of templates and 2= of
    {{temp|ru-conj|...}}.

    Returns (the modified parse tree, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errpagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))
        errmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []
    conj_templates = [
        "ru-conj", "ru-conj-old",
        "User:Benwing2/ru-conj", "User:Benwing2/ru-conj-old",
    ]

    for t in parsed.filter_templates():
        before = unicode(t)
        name = tname(t)
        if name in conj_templates:
            t.add("1", getparam(t, "1").replace("-refl", ""))
        elif name == "temp" and getparam(t, "1") == "ru-conj":
            t.add("2", getparam(t, "2").replace("-refl", ""))
        after = unicode(t)
        if before != after:
            notes.append("remove -refl from verb type")
            pagemsg("Replaced %s with %s" % (before, after))

    return parsed, notes
def process_page(page, index, parsed):
    """Convert old Latin noun declension templates on a page to the new format.

    {{la-decl-multi}} goes through convert_la_decl_multi_to_new(); templates
    listed in `old_la_noun_decl_templates` go through convert_template_to_new().

    Returns (new page text, list of change notes), or (None, None) as soon as
    any conversion reports failure.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-decl-multi":
            converted = convert_la_decl_multi_to_new(
                t, pagetitle, pagemsg, errandpagemsg)
            if not converted:
                return None, None
            notes.append("converted {{la-decl-multi}} to {{%s}}"
                         % tname(converted))
        elif tn in old_la_noun_decl_templates:
            if not convert_template_to_new(t, pagetitle, pagemsg,
                                           errandpagemsg):
                return None, None
            notes.append("converted {{%s}} to {{la-ndecl}}" % tn)

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Replace U+02C1 with U+02E4 inside {{IPAchar}} and {{IPA}} arguments.

    For {{IPAchar}} only 1= is examined. For {{IPA}} the numbered params up
    to 19 are examined, starting at 1 when lang= is present and at 2
    otherwise.

    Returns (the modified parse tree, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    def frob(t, param):
        # Rewrite the parameter only when it exists and actually changes.
        val = getparam(t, param)
        if not val:
            return
        newval = val.replace(u"\u02C1", u"\u02E4")
        if newval != val:
            t.add(param, newval)

    for t in parsed.filter_templates():
        before = unicode(t)
        name = tname(t)
        if name == "IPAchar":
            frob(t, "1")
        elif name == "IPA":
            firstparam = 1 if getparam(t, "lang") else 2
            for i in range(firstparam, 20):
                frob(t, str(i))
        after = unicode(t)
        if before != after:
            notes.append(
                "Correct use of U+02C1 pharyngealization mark to U+02E4")
            pagemsg("Replaced %s with %s" % (before, after))

    return parsed, notes
def process_lemma_page(page, index, form):
    """Add sup=`form` to the Italian adjective/participle headword on a page.

    The page must contain at most one {{it-adj}} and at most one {{it-pp}}.
    When both are present the adjective template is chosen. A template that
    already has sup= is left untouched.

    Returns (new page text, list of change notes), or (None, None) when the
    page is skipped. Every skip path returns the same two-element shape, so
    callers can always unpack the result.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    text = unicode(page.text)
    notes = []
    parsed = blib.parse_text(text)

    it_adj_template = None
    it_part_template = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "it-adj":
            if it_adj_template:
                pagemsg(
                    "WARNING: Saw multiple adjective headword templates in subsection, %s and %s, skipping"
                    % (unicode(it_adj_template), unicode(t)))
                return None, None
            it_adj_template = t
        if tn == "it-pp":
            if it_part_template:
                pagemsg(
                    "WARNING: Saw multiple adjective headword templates in subsection, %s and %s, skipping"
                    % (unicode(it_part_template), unicode(t)))
                return None, None
            it_part_template = t

    if not it_adj_template and not it_part_template:
        pagemsg("WARNING: Didn't see adjective or participle lemma template")
        return None, None

    if it_part_template:
        if it_adj_template:
            pagemsg(
                "WARNING: Saw both %s and %s, choosing adjective template"
                % (unicode(it_adj_template), unicode(it_part_template)))
            template = it_adj_template
        else:
            template = it_part_template
    else:
        template = it_adj_template

    if getparam(template, "sup"):
        pagemsg("Already saw sup=: %s" % unicode(template))
    else:
        origt = unicode(template)
        template.add("sup", form)
        pagemsg("Replaced %s with %s" % (origt, unicode(template)))
        notes.append("add sup=%s to {{%s}}" % (form, tname(template)))

    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Fix 2= of {{alternative spelling of}} in Old English wynn entries.

    The headword is taken from head= of the single Old English headword
    template on the page, falling back to the page title. 2= of
    {{alternative spelling of|ang|...}} is then set to that headword with
    links removed and wynn replaced by "w". Runs of three or more newlines
    are condensed to two.

    Returns (new page text, list of change notes), or None when the page is
    skipped.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    origtext = text
    parsed = blib.parse_text(text)

    head = None
    for t in parsed.filter_templates():
        tn = tname(t)
        newhead = None
        if tn == "head" and getparam(t, "1") == "ang" or tn in [
                "ang-noun", "ang-noun-form", "ang-verb", "ang-verb-form",
                "ang-adj", "ang-adj-form", "ang-adv", "ang-con", "ang-prep",
                "ang-prefix", "ang-proper noun", "ang-suffix"]:
            newhead = getparam(t, "head") or pagetitle
        if newhead:
            if head:
                pagemsg("WARNING: Saw head=%s and newhead=%s, skipping"
                        % (head, newhead))
                return
            head = newhead

    # head stays None when the page has no Old English headword template;
    # guard so the membership test below cannot raise TypeError.
    if head is None:
        pagemsg("WARNING: Didn't see any Old English headword template")
    elif u"ƿ" not in head:
        pagemsg("WARNING: Something wrong, didn't see wynn in head: %s" % head)

    saw_altspell = None
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "alternative spelling of":
            if saw_altspell:
                pagemsg(
                    "WARNING: Saw multiple {{alternative spelling of}}, skipping: %s and %s"
                    % (unicode(saw_altspell), unicode(t)))
                return
            saw_altspell = unicode(t)
            if getparam(t, "1") != "ang":
                pagemsg(
                    "WARNING: {{alternative spelling of}} without language 'ang', skipping: %s"
                    % unicode(t))
                return
            if head is None:
                # Without a headword there is nothing to derive 2= from.
                pagemsg(
                    "WARNING: {{alternative spelling of}} without headword template, skipping: %s"
                    % unicode(t))
                return
            param2 = getparam(t, "2")
            should_param2 = blib.remove_links(head).replace(u"ƿ", "w")
            if param2 != should_param2:
                origt = unicode(t)
                t.add("2", should_param2)
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append(
                    "fix 2= in {{alternative spelling of}} in wynn Old English entries")

    text = re.sub("\n\n+", "\n\n", unicode(parsed))
    if origtext != text and not notes:
        notes.append("condense 3+ newlines to 2")
    return text, notes
def insert_into_existing_pron_section(k):
    """Insert a new pronunciation template into subsection `k`.

    Does nothing if the subsection already contains a template listed in
    `pronun_templates`. Otherwise removes existing Polish rhymes,
    hyphenation and pl-IPA lines, folds a single {{audio|pl}} line into the
    new template as a=/ac=, and prepends the new template to the subsection.

    Relies on `subsections`, `notes`, `pagemsg`, `pronun_templates` and
    construct_new_pron_template() from the enclosing scope.

    Returns True on success (including when a pronunciation template was
    already present), or None when the audio line cannot be incorporated.
    """
    line_regex_template = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)"

    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in pronun_templates:
            pagemsg("Already saw pronunciation template: %s" % unicode(t))
            break
    else:  # no break
        new_pron_template, pron_prefix = construct_new_pron_template()

        # Remove existing rhymes/hyphenation/pl-IPA lines
        for template in ["rhyme|pl", "rhymes|pl", "pl-IPA", "hyph|pl",
                         "hyphenation|pl"]:
            re_template = template.replace("|", r"\|")
            regex = line_regex_template % re_template
            m = re.search(regex, subsections[k], re.M)
            if m:
                pagemsg("Removed existing %s" % m.group(1).strip())
                notes.append("remove existing {{%s}}" % template)
                subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)

        for template in ["audio|pl"]:
            re_template = template.replace("|", r"\|")
            regex = line_regex_template % re_template
            all_audios = re.findall(regex, subsections[k], re.M)
            if len(all_audios) > 1:
                # all_audios is a list, not a callable; iterate it directly.
                pagemsg("WARNING: Saw multiple {{audio}} templates, skipping: %s"
                        % ",".join(x.strip() for x in all_audios))
                return
            if len(all_audios) == 1:
                # Keep the matched line around for the warning messages below.
                audio_line = all_audios[0].strip()
                audiot = list(
                    blib.parse_text(audio_line).filter_templates())[0]
                assert(tname(audiot) == "audio")
                if getparam(audiot, "1") != "pl":
                    pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s"
                            % audio_line)
                    return
                audiofile = getparam(audiot, "2")
                audiogloss = getparam(audiot, "3")
                for param in audiot.params:
                    pn = pname(param)
                    pv = unicode(param.value)
                    if pn not in ["1", "2", "3"]:
                        pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s"
                                % (pn, pv, audio_line))
                        return
                if audiogloss in ["Audio", "audio"]:
                    # A caption of just "Audio" carries no information.
                    audiogloss = ""
                params = "|a=%s" % audiofile
                if audiogloss:
                    params += "|ac=%s" % audiogloss
                # Splice the audio params in just before the closing "}}".
                new_pron_template = (
                    new_pron_template[:-2] + params + new_pron_template[-2:])
                pagemsg("Removed existing %s in order to incorporate into {{pl-p}}"
                        % audio_line)
                notes.append("incorporate existing {{%s}} into {{pl-p}}"
                             % template)
                subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)

        subsections[k] = (
            pron_prefix + new_pron_template + "\n" + subsections[k])
        notes.append("insert %s into existing Pronunciation section"
                     % new_pron_template)
    return True
def hi_lemma_is_indeclinable(t, pagetitle, pagemsg):
    """Return whether the Hindi headword template `t` is an indeclinable lemma.

    Nouns and proper nouns are indeclinable exactly when ind= is set.
    Adjectives are indeclinable when ind= is set or when the lemma (head= or
    the page title, with links removed) ends in none of the declinable
    suffixes. Any other template yields False.
    """
    name = tname(t)
    if name in ["hi-noun", "hi-proper noun"]:
        return bool(getparam(t, "ind"))
    if name != "hi-adj":
        return False
    if getparam(t, "ind"):
        return True
    lemma = blib.remove_links(getparam(t, "head") or pagetitle)
    # If the lemma doesn't end with any of the declinable suffixes, it's
    # definitely indeclinable. Some indeclinable adjectives end with these
    # same suffixes, but we have no way to know that these are indeclinable,
    # so assume declinable.
    declinable_suffixes = (AA, IND_AA, AA + M)
    return not lemma.endswith(declinable_suffixes)
def uk_lemma_is_indeclinable(t, pagetitle, pagemsg):
    """Return whether the Ukrainian headword template `t` is indeclinable.

    A noun or proper noun is indeclinable when 3= is "-". It is also treated
    as indeclinable, with a warning, when 3= equals the headword in 1=,
    unless the headword ends in "я" (optionally accented) and 2= starts with
    "n". An adjective is indeclinable when indecl= is set.
    """
    name = tname(t)
    if name in ["uk-noun", "uk-proper noun"]:
        third = getparam(t, "3")
        if third == "-":
            return True
        headword = getparam(t, "1")
        if headword and headword == third and not (
                re.search(u"я́?$", headword)
                and getparam(t, "2").startswith("n")):
            pagemsg("WARNING: Indeclinable noun not marked as such: %s"
                    % unicode(t))
            return True
    if name == "uk-adj" and getparam(t, "indecl"):
        return True
    return False
def add_category(secbody, sectail, pagemsg, notes, cat):
    """Ensure the Hungarian category `cat` is present in a language section.

    Nothing is changed if the category already appears as a raw
    [[Category:...]] link or inside an existing {{cln|hu}} or
    {{catlangname|hu}}. Otherwise `cat` is added to the first such template
    found in `sectail`, or a new {{cln|hu|...}} is appended to `sectail`.
    A trailing "----" separator in `sectail` is split off first and
    re-attached to the result.

    Returns the (secbody, sectail) pair to use in place of the inputs.
    """
    def is_hu_cln(t):
        return (tname(t) in ["cln", "catlangname"]
                and getparam(t, "1") == "hu")

    separator = ""
    m = re.match(r"^(.*?\n)(\n*--+\n*)$", sectail, re.S)
    if m:
        sectail, separator = m.groups()
    combined = secbody + sectail

    if re.search(r"\[\[Category:%s(\||\])" % re.escape(cat), combined):
        # Category already present
        pagemsg("Category 'Hungarian %s' already present" % cat)
        return secbody, sectail + separator

    for t in blib.parse_text(combined).filter_templates():
        if is_hu_cln(t) and any(
                getparam(t, str(i)) == cat for i in range(2, 30)):
            # Category already present in templatized form
            pagemsg("Category 'Hungarian %s' already present" % cat)
            return secbody, sectail + separator

    # Now add the category to existing {{cln}}, or create one.
    parsed = blib.parse_text(sectail)
    for t in parsed.filter_templates():
        if not is_hu_cln(t):
            continue
        # First unused numbered slot, if any.
        slot = next(
            (i for i in range(2, 30) if not getparam(t, str(i))), None)
        if slot is None:
            pagemsg(
                "WARNING: Something strange, reached 30= in %s and didn't see place to insert"
                % unicode(t))
            return secbody, sectail + separator
        if getparam(t, str(slot + 1)):
            before = str(slot + 1)
        elif getparam(t, "sort"):
            before = "sort"
        else:
            before = None
        origt = unicode(t)
        t.add(str(slot), cat, before=before)
        notes.append("insert '%s' into existing {{%s|hu}}" % (cat, tname(t)))
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        return secbody, unicode(parsed) + separator

    # Need to create {{cln}}.
    newtext = "{{cln|hu|%s}}" % cat
    stripped_tail = sectail.strip()
    if stripped_tail:
        stripped_tail = stripped_tail + "\n" + newtext
    else:
        stripped_tail = newtext
    notes.append("add %s" % newtext)
    pagemsg("Added %s" % newtext)
    new_body = secbody.rstrip("\n") + "\n"
    new_tail = "\n" + stripped_tail + "\n\n" + separator.lstrip("\n")
    return new_body, new_tail
def put_back_new_inflection_of_params(t, notes, tags, params, lang, term, tr,
                                      alt,
                                      convert_to_more_specific_template=False):
    """Rebuild the parameters of an inflection-of style template in place.

    All existing params are erased, then lang (1=), term (2=), tr= and alt
    (3=) are put back with comment continuations removed. If requested and
    the tag set maps to a more specific template in `tags_to_templates`,
    the template is renamed and the tags are dropped; otherwise the tags are
    written back as 4=, 5=, ... Finally the miscellaneous `params`
    (name, value, showkey triples) are re-added.
    """
    # Erase all params.
    del t.params[:]

    # Put back new params.
    # Strip comment continuations and line breaks. Such cases generally
    # have linebreaks after semicolons as well, but we remove those.
    # (FIXME, consider preserving them.)
    t.add("1", remove_comment_continuations(lang))
    t.add("2", remove_comment_continuations(term))
    tr = remove_comment_continuations(tr)
    if tr:
        t.add("tr", tr)

    use_specific_template = (
        convert_to_more_specific_template
        and tname(t) in generic_inflection_of_templates
        and tuple(tags) in tags_to_templates)

    if use_specific_template:
        tempname = tags_to_templates[tuple(tags)]
        old_tn = tname(t)
        # Convert to more specific template, e.g. {{plural of}}.
        blib.set_template_name(t, tempname)
        altparam = remove_comment_continuations(alt)
        if altparam:
            t.add("3", altparam)
        notes.append(
            "replace {{%s|%s|%s|...|%s}} with {{%s|%s|%s}}"
            % (old_tn, lang, term, "|".join(tags), tempname, lang, term))
    else:
        t.add("3", remove_comment_continuations(alt))
        # Put back the tags into the template and note stats on bad tags
        for position, tag in enumerate(tags, 4):
            t.add(str(position), tag)

    # Finally, put back misc. tags.
    for param_name, param_value, showkey in params:
        t.add(param_name, param_value, showkey=showkey,
              preserve_spacing=False)
def etym_section_is_movable(sectext, header):
    """Return whether an Arabic etymology section can be moved.

    A section is movable when every {{inflection of}} in it is Arabic and
    has a tag set found in `split_recognized_tag_sets`, at least one such
    template exists, and the only other templates are {{also}}, {{ar-root}},
    {{nonlemma}}, {{ar-IPA}} and {{ar-verb-form}} whose form ends in waw
    (with or without sukun). `header` is used only in warning messages.

    Relies on `pagemsg` from the enclosing scope.
    """
    parsed = blib.parse_text(sectext)
    inflection_of_templates_with_unrecognized_tags = []
    saw_inflection_of_with_recognized_tag = False

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "inflection of":
            # With lang= the term is 1= and tags start at 3=; otherwise the
            # language is 1= and tags start at 4=.
            if getparam(t, "lang"):
                lang = getparam(t, "lang")
                first_tag_param = 3
            else:
                lang = getparam(t, "1")
                first_tag_param = 4
            if lang != "ar":
                pagemsg("WARNING: Non-Arabic language in Arabic {{inflection of}} in %s, skipping: %s"
                        % (header, unicode(t)))
                return False
            tags = []
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value).strip()
                if re.search("^[0-9]+$", pn) and int(pn) >= first_tag_param:
                    tags.append(pv)
            if tags not in split_recognized_tag_sets:
                inflection_of_templates_with_unrecognized_tags.append(
                    unicode(t))
            else:
                saw_inflection_of_with_recognized_tag = True

    if not saw_inflection_of_with_recognized_tag:
        return False
    if inflection_of_templates_with_unrecognized_tags:
        pagemsg("WARNING: Unrecognized {{inflection of}} tag set mixed with recognized ones in %s, skipping: %s"
                % (header,
                   " / ".join(inflection_of_templates_with_unrecognized_tags)))
        return False

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn in ["also", "ar-root", "nonlemma", "ar-IPA"]:
            continue
        if tn == "ar-verb-form":
            form = getparam(t, "1")
            # Reject only forms that end in neither bare waw nor waw+sukun.
            # Without the second `not`, this flagged exactly the forms that
            # do end in waw+sukun.
            if not form.endswith(u"و") and not form.endswith(u"وْ"):
                pagemsg("WARNING: ar-verb-form form doesn't end with waw in %s with recognized {{inflection of}} tags, skipping: %s"
                        % (header, unicode(t)))
                return False
            continue
        if tn != "inflection of":
            pagemsg("WARNING: Unrecognized template in %s with recognized {{inflection of}} tags, skipping: %s"
                    % (header, unicode(t)))
            return False
    return True
def fix_up_section(sectext, warn_on_multiple_heads):
    """Replace manual Latin pronunciations in a section with {{la-IPA}}.

    Manual Classical {{IPA}}/{{IPAchar}} pronunciations are replaced by
    {{la-IPA|HEAD}}, where HEAD is the section's single headword. If a manual
    Ecclesiastical pronunciation is also present and there is exactly one
    {{la-IPA}}, eccl=yes is added to it and the manual line is removed.

    Relies on `pagetitle`, `pagemsg` and `notes` from the enclosing scope.

    Returns the (possibly modified) section text. The text is returned
    unchanged when the section has zero or several possible headwords.
    """
    parsed = blib.parse_text(sectext)
    heads = set()
    for t in parsed.filter_templates():
        if lalib.la_template_is_head(t):
            heads |= set(
                blib.remove_links(x)
                for x in lalib.la_get_headword_from_template(
                    t, pagetitle, pagemsg))

    if len(heads) > 1:
        if warn_on_multiple_heads:
            pagemsg("WARNING: Found multiple possible heads, not modifying: %s"
                    % ",".join(heads))
        return sectext
    if len(heads) == 0:
        pagemsg("WARNING: Found no possible heads, not modifying: %s"
                % ",".join(heads))
        return sectext
    head = list(heads)[0]

    newsectext = re.sub(r"\{\{a\|Classical\}\} \{\{IPA(char)?\|.*?\}\}",
                        "{{la-IPA|%s}}" % head, sectext)
    newsectext = re.sub(r"^\* \{\{IPA(char)?\|.*?\|lang=la\}\}",
                        "{{la-IPA|%s}}" % head, newsectext, 0, re.M)
    if newsectext != sectext:
        notes.append("replaced manual Latin pronun with {{la-IPA|%s}}" % head)
        sectext = newsectext

    # Recompute pronun templates as we may have added one.
    parsed = blib.parse_text(sectext)
    pronun_templates = [
        t for t in parsed.filter_templates() if tname(t) == "la-IPA"]

    if "{{a|Ecclesiastical}} {{IPA" in sectext:
        if len(pronun_templates) == 0:
            pagemsg("WARNING: Found manual Ecclesiastical pronunciation but not {{la-IPA}} template")
        elif len(pronun_templates) > 1:
            pagemsg("WARNING: Found manual Ecclesiastical pronunciation and multiple {{la-IPA}} templates: %s"
                    % ",".join(unicode(tt) for tt in pronun_templates))
        else:
            origt = unicode(pronun_templates[0])
            pronun_templates[0].add("eccl", "yes")
            pagemsg("Replaced %s with %s"
                    % (origt, unicode(pronun_templates[0])))
            # Work from the re-serialized parse tree so that the eccl=yes
            # just added is part of the returned text; substituting into the
            # stale `sectext` string would drop it.
            updated = unicode(parsed)
            newsectext = re.sub(
                r"^\* \{\{a\|Ecclesiastical\}\} \{\{IPA(char)?\|.*?\}\}\n",
                "", updated, 0, re.M)
            if newsectext == updated:
                pagemsg("WARNING: Unable to remove manual Ecclesiastical prounciation")
            else:
                notes.append("removed manual Ecclesiastical pronunciation and added |eccl=yes to {{la-IPA}}")
                sectext = newsectext
    return sectext
def process_text_on_page(index, pagetitle, text):
    """Log the verb class computed for each {{de-conj}} on a page.

    Each {{de-conj}} call is rewritten into a call to
    User:Benwing2/de-generate-verb-props, expanded, and the "class" entry of
    the result is logged. No template is modified here.

    Returns (page text, list of change notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    notes = []
    pagemsg("Processing")
    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        origt = unicode(t)
        if tname(t) == "de-conj":
            generate_template = re.sub(
                r"^\{\{de-conj(?=[|}])",
                "{{User:Benwing2/de-generate-verb-props", unicode(t))
            result = expand_text(generate_template)
            if not result:
                continue
            forms = blib.split_generate_args(result)
            pagemsg("For %s, class=%s" % (unicode(t), forms["class"]))
        if unicode(t) != origt:
            pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))

    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Warn about Latin headwords that do not match the page title.

    Every headword of every Latin headword template in the Latin section is
    compared, with links and macrons removed, to the page title. For
    Reconstruction pages the title is "*" plus the part after the last "/".
    The page is never modified.

    Always returns (None, None).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if not args.stdin:
        pagemsg("Processing")

    retval = lalib.find_latin_section(text, pagemsg)
    if retval is None:
        return None, None
    sections, j, secbody, sectail, has_non_latin = retval

    # The expected headword depends only on the page title.
    if pagetitle.startswith("Reconstruction"):
        unprefixed_title = "*" + re.sub(".*/", "", pagetitle)
    else:
        unprefixed_title = pagetitle

    for t in blib.parse_text(secbody).filter_templates():
        if tname(t) not in lalib.la_headword_templates:
            continue
        for head in lalib.la_get_headword_from_template(
                t, pagetitle, pagemsg):
            no_macrons_head = remove_macrons(blib.remove_links(head))
            if no_macrons_head != unprefixed_title:
                pagemsg("WARNING: Bad Latin head: %s" % unicode(t))

    return None, None
def process_page(page, index, parsed):
    """Remove the period after ab/ad/ob/sub before l or r in {{la-IPA}}.

    Only 1= of {{la-IPA}} is examined, and only at the start of the value.
    Pages whose title contains a colon are skipped.

    Returns (new page text, list of change notes), or (None, None) for a
    skipped page.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")

    if ":" in pagetitle:
        pagemsg("Skipping page with colon in pagetitle")
        return None, None

    notes = []
    for t in parsed.filter_templates():
        if tname(t) != "la-IPA":
            continue
        param1 = getparam(t, "1")
        newparam1 = re.sub(r"^(a[bd]|ob|sub)\.([lr])", r"\1\2", param1)
        if newparam1 == param1:
            continue
        before = unicode(t)
        t.add("1", newparam1)
        pagemsg("Replaced %s with %s" % (before, unicode(t)))
        notes.append("remove unnecessary period in %s in {{la-IPA}}" % param1)

    return unicode(parsed), notes
def process_text_on_page_for_full_conj(index, pagename, text, verbs):
    """Add a conjugation spec to the Spanish verb headword of a multiword page.

    The spec is looked up in `verbs` by page name. A spec equal to the
    default ("FIRST<> [[w2]] [[w3]] ...") is blanked; a spec that differs
    from the default only inside <...> has its links removed.
    {{es-verb|attn=1}} gets the spec as 1= and loses attn=;
    {{head|es|verb}} is converted to {{es-verb}}.

    Returns (new page text, list of change notes), or None when the page has
    no entry in `verbs`.
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []

    if pagename not in verbs:
        pagemsg("WARNING: Can't find entry, skipping")
        return

    entry = verbs[pagename]
    origentry = entry
    first, rest = pagename.split(" ", 1)
    linked_rest = " ".join("[[%s]]" % word for word in rest.split(" "))
    def_link = "%s<> %s" % (first, linked_rest)
    if def_link == entry:
        pagemsg("Replacing entry '%s' with a blank entry because it's the default" % entry)
        entry = ""
    elif re.sub("<.*?>", "<>", entry) == def_link:
        newentry = blib.remove_links(entry)
        pagemsg("Replacing entry '%s' with entry without links '%s'"
                % (entry, newentry))
        entry = newentry

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn == "es-verb":
            if not getparam(t, "attn"):
                pagemsg("Didn't see attn=1: %s" % unicode(t))
                continue
            rmparam(t, "attn")
            if entry:
                t.add("1", entry)
                notes.append("add conjugation '%s' to Spanish verb" % entry)
            else:
                notes.append("add conjugation (default) to Spanish verb")
        is_generic_verb_head = (
            tn == "head" and getparam(t, "1") == "es"
            and getparam(t, "2") == "verb")
        if is_generic_verb_head:
            head = getparam(t, "head")
            if head:
                pagemsg("WARNING: Removing head=%s compared with entry '%s', original entry '%s': %s"
                        % (head, entry, origentry, unicode(t)))
            for obsolete in ["head", "2", "1"]:
                rmparam(t, obsolete)
            blib.set_template_name(t, "es-verb")
            if entry:
                t.add("1", entry)
                notes.append("convert {{head|es|verb}} to {{es-verb|%s}}"
                             % entry)
            else:
                notes.append("convert {{head|es|verb}} to {{es-verb}}")
        if origt != unicode(t):
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes
def process_text_on_page_for_single_word(index, pagename, text, spec):
    """Apply a conjugation spec to {{es-verb|attn=1}} on a single-word page.

    attn= is removed. A spec containing "<" is appended to the page name and
    stored in 1=; "*" means the default conjugation (no param added); any
    other spec is stored in pres=.

    Returns (new page text, list of change notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []
    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        before = unicode(t)
        if tname(t) == "es-verb":
            if not getparam(t, "attn"):
                pagemsg("Didn't see attn=1: %s" % unicode(t))
                continue
            rmparam(t, "attn")
            if "<" in spec:
                full_spec = "%s%s" % (pagename, spec)
                t.add("1", full_spec)
                notes.append("add conjugation %s%s to Spanish verb"
                             % (pagename, spec))
            elif spec == "*":
                notes.append("add conjugation (default) to Spanish verb")
            else:
                t.add("pres", spec)
                notes.append("add conjugation pres=%s to Spanish verb" % spec)
        if before != unicode(t):
            pagemsg("Replaced %s with %s" % (before, unicode(t)))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Convert old Latin adjective declension templates to {{la-adecl}}.

    {{la-decl-multi}} and noun-type {{la-decl-irreg}} (noun= set) are logged
    and skipped. Templates in `old_la_adj_decl_templates` are converted with
    convert_template_to_new().

    Returns (new page text, list of change notes), or (None, None) as soon as
    a conversion reports failure.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    for t in parsed.filter_templates():
        tn = tname(t)
        if tn == "la-decl-multi":
            pagemsg("Skipping la-decl-multi for now: %s" % unicode(t))
        elif tn == "la-decl-irreg" and getparam(t, "noun"):
            pagemsg("Skipping noun la-decl-irreg: %s" % unicode(t))
        elif tn in old_la_adj_decl_templates:
            if not convert_template_to_new(t, pagetitle, pagemsg,
                                           errandpagemsg):
                return None, None
            notes.append("converted {{%s}} to {{la-adecl}}" % tn)

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Update language codes and drop redundant sort= in {{rfdef}}.

    1= is remapped through `langs_to_convert`. For languages in
    `langs_to_remove_sort` an existing sort= is removed. Templates that use
    lang= are skipped with a warning.

    Returns (new page text, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []

    for t in parsed.filter_templates():
        before = unicode(t)
        if tname(t) == "rfdef":
            if getparam(t, "lang"):
                pagemsg("WARNING: has lang=, skipping: %s" % unicode(t))
                continue
            lang = getparam(t, "1")
            if lang in langs_to_convert:
                newlang = langs_to_convert[lang]
                t.add("1", newlang)
                notes.append("convert {{rfdef|%s}} to {{rfdef|%s}}"
                             % (lang, newlang))
                lang = newlang
            if lang in langs_to_remove_sort and t.has("sort"):
                rmparam(t, "sort")
                notes.append(
                    "remove sort= from {{rfdef|%s}}, now auto-computed" % lang)
        if unicode(t) != before:
            pagemsg("Replaced <%s> with <%s>" % (before, unicode(t)))

    return unicode(parsed), notes
def la_template_is_head(t):
    """Return True if template `t` is a Latin headword template.

    That is, its name is in `la_headword_templates`, or it is {{head}} with
    1= equal to "la".
    """
    name = tname(t)
    return name in la_headword_templates or (
        name == "head" and getparam(t, "1") == "la")
def process_text_on_page(index, pagetitle, text):
    """Remove a redundant 1= equal to the page title from es/fr/it-IPA.

    Templates that have any of 2= through 10= set are logged and left alone.

    Returns (new page text, list of change notes), or None when the text
    mentions none of the three templates.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []
    ipa_templates = ["es-IPA", "fr-IPA", "it-IPA"]

    # Cheap textual pre-check before parsing.
    if not any(name in text for name in ipa_templates):
        return

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn in ipa_templates:
            # Lowest-numbered extra positional param that is set, if any.
            blocking = next(
                (i for i in xrange(2, 11) if getparam(t, str(i))), None)
            if blocking is not None:
                pagemsg("Template has %s=, not touching: %s"
                        % (blocking, origt))
                continue
            par1 = getparam(t, "1")
            if par1 == pagetitle:
                rmparam(t, "1")
                notes.append("remove redundant 1=%s from {{%s}}" % (par1, tn))
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes
def find_head_comp_sup(pagetitle, pagemsg):
    """Fetch the headword, comparative and superlative of a Latin adverb page.

    Looks at the first {{la-adv}} on the page. comp=/2= and sup=/3= are used
    when given; a missing one is derived from the headword by stripping a
    recognized adverb ending and adding "ius" (comparative) or "issimē"
    (superlative). If no ending is recognized, a warning is logged and the
    values are returned as found.

    Returns (head, comp, sup), or (None, None, None) when the page has no
    {{la-adv}}.
    """
    page = pywikibot.Page(site, pagetitle)
    parsed = blib.parse_text(unicode(page.text))

    for t in parsed.filter_templates():
        if tname(t) != "la-adv":
            continue
        head = getparam(t, "1")
        comp = getparam(t, "comp") or getparam(t, "2")
        sup = getparam(t, "sup") or getparam(t, "3")
        if comp and sup:
            return head, comp, sup

        # Longer endings are tried first so the most specific one wins.
        for suff in ["iter", "nter", "ter", "er", u"iē", u"ē", "im", u"ō"]:
            m = re.search("^(.*?)%s$" % suff, head)
            if m:
                stem = m.group(1)
                if suff == "nter":
                    stem += "nt"
                break
        else:
            pagemsg(
                "WARNING: Didn't recognize ending of adverb headword %s"
                % head)
            return head, comp, sup

        return head, comp or (stem + "ius"), sup or (stem + u"issimē")

    return None, None, None
def replace_trans(m, newlangcode, newlangname):
    """Re-tag Kurdish ("ku") translation templates with a more specific code.

    `m` is a regex match with two groups: a prefix that is kept verbatim and
    the translation text to process. Templates in `trans_templates` with
    1=ku get 1=`newlangcode` and lose sc=. {{t-simple|ku|langname=Kurdish}}
    gets both 1= and langname= replaced; a {{t-simple|ku}} whose langname=
    is not "Kurdish" is left alone with a warning.

    Relies on `pagemsg` and `notes` from the enclosing scope.

    Returns the prefix followed by the updated translation text.
    """
    prefix, transtext = m.groups()
    parsed = blib.parse_text(transtext)

    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn in trans_templates:
            if getparam(t, "1") == "ku":
                t.add("1", newlangcode)
                rmparam(t, "sc")
                pagemsg(
                    "Replaced %s with %s based on language prefix of translation entry"
                    % (origt, unicode(t)))
                notes.append(
                    "{{%s|ku}} -> {{%s|%s}} based on language prefix of translation entry"
                    % (tn, tn, newlangcode))
        elif tn == "t-simple":
            if getparam(t, "1") == "ku":
                # Compare the value of langname=. The comparison used to sit
                # inside the getparam() call, so the boolean True was looked
                # up as a parameter name and this sanity check never fired.
                if getparam(t, "langname") != "Kurdish":
                    pagemsg(
                        "WARNING: Something wrong, t-simple|ku without langname=Kurdish: %s"
                        % unicode(t))
                else:
                    t.add("1", newlangcode)
                    t.add("langname", newlangname)
                    pagemsg("Replaced %s with %s based on prefix"
                            % (origt, unicode(t)))
                    notes.append(
                        "{{t-simple|ku|langname=Kurdish}} -> {{t-simple|%s|langname=%s}} based on language prefix"
                        % (newlangcode, newlangname))

    transtext = unicode(parsed)
    return prefix + transtext
def process_page_for_fix(page, index, parsed):
    """Convert raw links and Kurdish ("ku") templates on a page to "kmr".

    Every [[...]] link becomes {{l|kmr|...}}. {{l|ku}} and
    {{rhymes nav|ku}} then get 1=kmr; any other template with 1=ku is
    reported with a warning. The incoming `parsed` argument is ignored: the
    page text is re-read from `page` and re-parsed after the link conversion.

    Returns (new page text, list of change notes).
    """
    pagename = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []

    text = unicode(page.text)
    linked_text = re.sub(r"\[\[(.*?)\]\]", r"{{l|kmr|\1}}", text)
    if linked_text != text:
        notes.append("convert raw links to {{l|kmr|...}}")
        text = linked_text

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        before = unicode(t)
        tn = tname(t)
        is_kurdish = getparam(t, "1") == "ku"
        if tn in ["l", "rhymes nav"] and is_kurdish:
            t.add("1", "kmr")
            notes.append("convert {{%s|ku}} to {{%s|kmr}}" % (tn, tn))
        elif is_kurdish:
            pagemsg("WARNING: Kurdish-language template of unrecognized name: %s"
                    % unicode(t))
        if before != unicode(t):
            pagemsg("Replaced %s with %s" % (before, unicode(t)))

    text = unicode(parsed)
    return text, notes
def process_text_on_page(index, pagename, text):
    """Normalize the parameters of {{RQ:Buk Baibel}}.

    1= is remapped through `book_map` when it has an entry there, and a
    non-empty 4= is moved to passage=.

    Returns (new page text, list of change notes).
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagename, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagename, txt))

    pagemsg("Processing")
    notes = []
    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        tn = tname(t)
        before = unicode(t)
        if tn == "RQ:Buk Baibel":
            book = getparam(t, "1")
            if book in book_map:
                t.add("1", book_map[book])
                notes.append("convert '%s' to '%s' in 1= in {{%s}}"
                             % (book, book_map[book], tn))
            passage = getparam(t, "4")
            if passage:
                # Insert passage= where 4= was, then drop 4=.
                t.add("passage", passage, before="4")
                rmparam(t, "4")
                notes.append("4= -> passage= in {{%s}}" % tn)
        if unicode(t) != before:
            pagemsg("Replaced %s with %s" % (before, unicode(t)))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Remove and rename parameters of {{R:Lexico}}.

    lang= is removed; non-empty entry_uk=, url_uk= and 4= are renamed to
    entry=, url= and text= respectively, keeping their position.

    Returns (the modified parse tree, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []
    # (old name, new name) pairs, applied in this order.
    renames = [("entry_uk", "entry"), ("url_uk", "url"), ("4", "text")]

    for t in parsed.filter_templates():
        if tname(t) != "R:Lexico":
            continue
        before = unicode(t)
        rmparam(t, "lang")
        for old_name, new_name in renames:
            value = getparam(t, old_name)
            if value:
                t.add(new_name, value, before=old_name)
                rmparam(t, old_name)
        after = unicode(t)
        if before != after:
            notes.append("Remove/rearrange params in {{R:Lexico}}")
            pagemsg("Replaced %s with %s" % (before, after))

    return parsed, notes
def snarf_adj_accents():
    """Record the accented form of every Bulgarian adjective headword.

    Walks the "Bulgarian adjectives" category and stores 1= of each
    {{bg-adj}} in `adjs_to_accents`, keyed by its unaccented form. Headwords
    that are missing or still need accents are skipped with a warning; a
    conflicting accentuation is warned about and then overwrites the earlier
    one.
    """
    for index, page in blib.cat_articles("Bulgarian adjectives"):
        pagetitle = unicode(page.title())

        def pagemsg(txt):
            msg("Page %s %s: %s" % (index, pagetitle, txt))

        for t in blib.parse(page).filter_templates():
            if tname(t) != "bg-adj":
                continue
            adj = getparam(t, "1")
            if not adj:
                pagemsg("WARNING: Missing headword in adj: %s" % unicode(t))
                continue
            if bglib.needs_accents(adj):
                pagemsg("WARNING: Adjective %s missing an accent: %s"
                        % (adj, unicode(t)))
                continue
            key = bglib.remove_accents(adj)
            if key in adjs_to_accents and adjs_to_accents[key] != adj:
                pagemsg(
                    "WARNING: Two different accents possible for %s: %s and %s: %s"
                    % (key, adjs_to_accents[key], adj, unicode(t)))
            adjs_to_accents[key] = adj
def process_page(page, index, parsed):
    """Move head= to 1= in Old English {{ang-adj}} templates.

    {{head|ang|adjective(s)}} and {{ang-adj}} that already has 1= are only
    reported with a warning. The incoming `parsed` argument is ignored: the
    page text is re-read from `page` and re-parsed.

    Returns (the modified parse tree, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    pagemsg("Processing")
    notes = []
    parsed = blib.parse_text(unicode(page.text))

    for t in parsed.filter_templates():
        tn = tname(t)
        before = unicode(t)
        is_generic_adj_head = (
            tn == "head" and getparam(t, "1") == "ang"
            and getparam(t, "2") in ["adjective", "adjectives"])
        if is_generic_adj_head:
            pagemsg("WARNING: {{head}} for adjectives, should not occur: %s"
                    % unicode(t))
        elif tn == "ang-adj":
            if getparam(t, "1"):
                pagemsg("WARNING: 1= in ang-adj, should not occur: %s"
                        % unicode(t))
            else:
                head = getparam(t, "head")
                rmparam(t, "head")
                if head:
                    t.add("1", head)
                    notes.append("move head= to 1= in {{ang-adj}}")
        if unicode(t) != before:
            pagemsg("Replaced %s with %s" % (before, unicode(t)))

    return parsed, notes