def insert_into_existing_pron_section(k):
  """Insert a newly constructed pronunciation template into the existing
  Pronunciation section held in subsections[k] (module-level list).

  Existing rhymes/hyphenation/{{pl-IPA}} lines are removed, and a single
  existing {{audio|pl}} line is folded into the new template as |a=/|ac=.
  Returns True if the section was modified, or None if a pronunciation
  template was already present or a problem (multiple {{audio}} templates,
  wrong language, unknown params) prevented insertion.
  """
  parsed = blib.parse_text(subsections[k])
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in pronun_templates:
      pagemsg("Already saw pronunciation template: %s" % unicode(t))
      break
  else: # no break
    new_pron_template, pron_prefix = construct_new_pron_template()
    # Remove existing rhymes/hyphenation/pl-IPA lines
    for template in ["rhyme|pl", "rhymes|pl", "pl-IPA", "hyph|pl", "hyphenation|pl"]:
      re_template = template.replace("|", r"\|")
      regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
      m = re.search(regex, subsections[k], re.M)
      if m:
        pagemsg("Removed existing %s" % m.group(1).strip())
        notes.append("remove existing {{%s}}" % template)
        subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
    for template in ["audio|pl"]:
      re_template = template.replace("|", r"\|")
      regex = r"^([* ]*\{\{%s(?:\|[^{}]*)*\}\}\n)" % re_template
      all_audios = re.findall(regex, subsections[k], re.M)
      if len(all_audios) > 1:
        # FIXED: was `all_audios()` (calling a list -> TypeError).
        pagemsg("WARNING: Saw multiple {{audio}} templates, skipping: %s"
          % ",".join(x.strip() for x in all_audios))
        return
      if len(all_audios) == 1:
        # FIXED: audio_line was referenced below but never defined.
        audio_line = all_audios[0].strip()
        audiot = list(blib.parse_text(audio_line).filter_templates())[0]
        assert(tname(audiot) == "audio")
        if getparam(audiot, "1") != "pl":
          pagemsg("WARNING: Wrong language in {{audio}}, skipping: %s" % audio_line)
          return
        audiofile = getparam(audiot, "2")
        audiogloss = getparam(audiot, "3")
        for param in audiot.params:
          pn = pname(param)
          pv = unicode(param.value)
          if pn not in ["1", "2", "3"]:
            pagemsg("WARNING: Unrecognized param %s=%s in {{audio}}, skipping: %s" % (
              pn, pv, audio_line))
            return
        if audiogloss in ["Audio", "audio"]:
          # A generic caption carries no information; drop it.
          audiogloss = ""
        params = "|a=%s" % audiofile
        if audiogloss:
          params += "|ac=%s" % audiogloss
        # Splice the audio params just inside the closing braces of the
        # new pronunciation template.
        new_pron_template = new_pron_template[:-2] + params + new_pron_template[-2:]
        pagemsg("Removed existing %s in order to incorporate into {{pl-p}}" % audio_line)
        notes.append("incorporate existing {{%s}} into {{pl-p}}" % template)
        subsections[k] = re.sub(regex, "", subsections[k], 0, re.M)
    subsections[k] = pron_prefix + new_pron_template + "\n" + subsections[k]
    notes.append("insert %s into existing Pronunciation section" % new_pron_template)
    return True
def add_category(secbody, sectail, pagemsg, notes, cat):
  # Add a Hungarian category `cat` to the section given by secbody + sectail,
  # preferring an existing {{cln|hu}}/{{catlangname|hu}} template and falling
  # back to creating a new {{cln|hu|...}}. Returns (secbody, sectail).
  separator = ""
  # Split off a trailing "----" section separator so insertions go before it.
  m = re.match(r"^(.*?\n)(\n*--+\n*)$", sectail, re.S)
  if m:
    sectail, separator = m.groups()
  if re.search(r"\[\[Category:%s(\||\])" % re.escape(cat), secbody + sectail):
    # Category already present
    pagemsg("Category 'Hungarian %s' already present" % cat)
    return secbody, sectail + separator
  parsed = blib.parse_text(secbody + sectail)
  for t in parsed.filter_templates():
    if tname(t) in ["cln", "catlangname"] and getparam(t, "1") == "hu":
      for i in range(2, 30):
        if getparam(t, str(i)) == cat:
          # Category already present in templatized form
          pagemsg("Category 'Hungarian %s' already present" % cat)
          return secbody, sectail + separator
  # Now add the category to existing {{cln}}, or create one.
  parsed = blib.parse_text(sectail)
  for t in parsed.filter_templates():
    if tname(t) in ["cln", "catlangname"] and getparam(t, "1") == "hu":
      # Find the first empty numbered slot in 2=..29=.
      for i in range(2, 30):
        if not getparam(t, str(i)):
          break
      else: # no break
        pagemsg(
          "WARNING: Something strange, reached 30= in %s and didn't see place to insert"
          % unicode(t))
        return secbody, sectail + separator
      # Insert before the following numbered param (or sort=) so the
      # template's param order stays tidy.
      before = str(i + 1) if getparam(
        t, str(i + 1)) else "sort" if getparam(t, "sort") else None
      origt = unicode(t)
      t.add(str(i), cat, before=before)
      notes.append("insert '%s' into existing {{%s|hu}}" % (cat, tname(t)))
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      return secbody, unicode(parsed) + separator
  # Need to create {{cln}}.
  newtext = "{{cln|hu|%s}}" % cat
  sectail = sectail.strip()
  if sectail:
    sectail = sectail + "\n" + newtext
  else:
    sectail = newtext
  notes.append("add %s" % newtext)
  pagemsg("Added %s" % newtext)
  return secbody.rstrip(
    "\n") + "\n", "\n" + sectail + "\n\n" + separator.lstrip("\n")
def fix_up_section(sectext, warn_on_multiple_heads):
  # Replace manual Latin pronunciation lines in `sectext` with {{la-IPA}},
  # based on the single headword found in the section; also fold a manual
  # Ecclesiastical pronunciation line into |eccl=yes on {{la-IPA}}.
  # Returns the (possibly modified) section text.
  parsed = blib.parse_text(sectext)
  heads = set()
  pronun_templates = []
  for t in parsed.filter_templates():
    tn = tname(t)
    if lalib.la_template_is_head(t):
      heads |= set(blib.remove_links(x) for x in
        lalib.la_get_headword_from_template(t, pagetitle, pagemsg))
    elif tn == "la-IPA":
      pronun_templates.append(t)
  # Need exactly one head to know what to feed {{la-IPA}}.
  if len(heads) > 1:
    if warn_on_multiple_heads:
      pagemsg("WARNING: Found multiple possible heads, not modifying: %s" % ",".join(heads))
    return sectext
  if len(heads) == 0:
    pagemsg("WARNING: Found no possible heads, not modifying: %s" % ",".join(heads))
    return sectext
  newsectext = re.sub(r"\{\{a\|Classical\}\} \{\{IPA(char)?\|.*?\}\}",
    "{{la-IPA|%s}}" % list(heads)[0], sectext)
  newsectext = re.sub(r"^\* \{\{IPA(char)?\|.*?\|lang=la\}\}",
    "{{la-IPA|%s}}" % list(heads)[0], newsectext, 0, re.M)
  if newsectext != sectext:
    notes.append("replaced manual Latin pronun with {{la-IPA|%s}}" % list(heads)[0])
    sectext = newsectext
  # Recompute pronun templates as we may have added one.
  parsed = blib.parse_text(sectext)
  pronun_templates = []
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-IPA":
      pronun_templates.append(t)
  if "{{a|Ecclesiastical}} {{IPA" in sectext:
    if len(pronun_templates) == 0:
      pagemsg("WARNING: Found manual Ecclesiastical pronunciation but not {{la-IPA}} template")
    elif len(pronun_templates) > 1:
      pagemsg("WARNING: Found manual Ecclesiastical pronunciation and multiple {{la-IPA}} templates: %s" %
        ",".join(unicode(tt) for tt in pronun_templates))
    else:
      origt = unicode(pronun_templates[0])
      pronun_templates[0].add("eccl", "yes")
      pagemsg("Replaced %s with %s" % (origt, unicode(pronun_templates[0])))
      # Remove the now-redundant manual Ecclesiastical line.
      newsectext = re.sub(r"^\* \{\{a\|Ecclesiastical\}\} \{\{IPA(char)?\|.*?\}\}\n",
        "", sectext, 0, re.M)
      if newsectext == sectext:
        pagemsg("WARNING: Unable to remove manual Ecclesiastical prounciation")
      else:
        notes.append("removed manual Ecclesiastical pronunciation and added |eccl=yes to {{la-IPA}}")
        sectext = newsectext
  return sectext
def combine_doublets(m):
  # Regex-replacement helper: merge a run of consecutive doublet templates
  # (m.group(2)) into the first one (m.group(1)), converting the follow-on
  # templates' params into indexed params (alt2=, t3=, ...) on the first.
  # Returns the combined template text, or m.group(0) unchanged on any
  # condition we can't handle.
  first = blib.parse_text(m.group(1))
  rest = blib.parse_text(m.group(2))
  t1 = list(first.filter_templates())[0]
  # If the first template already lists multiple terms, indexing would clash.
  if getparam(t1, "3") or getparam(t1, "4") or getparam(t1, "alt2") or getparam(t1, "alt3"):
    pagemsg("WARNING: Can't combine %s, first template already has multiple terms"
      % m.group(0))
    return m.group(0)
  next_index = 2
  lang = getparam(t1, "1")
  for t in rest.filter_templates(recursive=False):
    tlang = getparam(t, "1")
    if lang != tlang:
      pagemsg("WARNING: Lang %s in continuation template %s not same as lang %s in first template %s" % (
        tlang, unicode(t), lang, unicode(t1)))
      return m.group(0)
    for param in t.params:
      # NOTE: local `pname` shadows any module-level pname() helper here.
      pname = unicode(param.name).strip()
      pval = unicode(param.value).strip()
      if not pval:
        continue
      if pname == "2":
        # The term itself: next free numbered slot on t1 (terms start at 2=,
        # so term N lives at param N+1).
        t1.add(str(next_index + 1), pval)
      elif pname == "3":
        t1.add("alt%s" % next_index, pval)
      elif pname == "4":
        t1.add("t%s" % next_index, pval)
      elif pname in ["t", "gloss", "tr", "ts", "pos", "lit", "alt", "sc", "id", "g"]:
        t1.add("%s%s" % (pname, next_index), pval)
      elif pname in ["t1", "gloss1", "tr1", "ts1", "pos1", "lit1", "alt1", "sc1", "id1", "g1"]:
        t1.add("%s%s" % (pname[:-1], next_index), pval)
      elif pname in ["1", "notext", "nocap", "nocat"]:
        pass
      else:
        pagemsg("WARNING: Unrecognized param %s=%s in %s, skipping"
          % (pname, pval, unicode(t)))
        return m.group(0)
    next_index += 1
  # Re-add flag params so they end up after the indexed params.
  for param in ["notext", "nocap", "nocat"]:
    val = getparam(t1, param)
    rmparam(t1, param)
    if val:
      t1.add(param, val)
  newtext = unicode(t1)
  pagemsg("Replaced %s with %s" % (m.group(0), newtext))
  return newtext
def process_text_on_page(index, pagetitle, text):
  """Convert {{Wikisource1911Enc Citation}} templates to {{projectlink|1911}}.

  Returns (new page text, list of change notes)."""
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  parsed = blib.parse_text(text)
  for template in parsed.filter_templates():
    if tname(template) != "Wikisource1911Enc Citation":
      continue
    old_template = unicode(template)
    article = getparam(template, "1")
    # Shift the article name to 2= and put the "1911" project code in 1=.
    template.add("1", "1911")
    template.add("2", article)
    blib.set_template_name(template, "projectlink")
    if old_template != unicode(template):
      pagemsg("Replaced %s with %s" % (old_template, unicode(template)))
      notes.append(
        "convert {{Wikisource1911Enc Citation}} to {{projectlink|1911}}"
      )
  return unicode(parsed), notes
def process_page(index, page, verbose):
  # Scan the Russian section of `page` and warn whenever a
  # {{diminutive of}} template appears inside an Etymology subsection.
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  russian = blib.find_lang_section_from_text(text, "Russian", pagemsg)
  if not russian:
    pagemsg("Couldn't find Russian section for %s" % pagetitle)
    return
  subsections = re.split("(^===+[^=\n]+===+\n)", russian, 0, re.M)
  # Go through each subsection in turn, looking for subsection
  # matching the POS with an appropriate headword template whose
  # head matches the inflected form
  for j in xrange(2, len(subsections), 2):
    if "==Etymology" in subsections[j - 1]:
      parsed = blib.parse_text(subsections[j])
      for t in parsed.filter_templates():
        # NOTE: local `tname` shadows the module-level tname() helper.
        tname = unicode(t.name)
        if tname == "diminutive of":
          pagemsg("WARNING: Found diminutive-of in etymology: %s" % unicode(t))
def process_text_on_page(index, pagetitle, text):
  # Convert m=/c= in {{mn-variant}} to numbered params.
  # Returns (new page text, list of change notes).
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    def getp(param):
      return getparam(t, param)
    if tn == "mn-variant":
      origt = unicode(t)
      m = getp("m")
      if m:
        # m= is copied into both 1= and 2= -- presumably the new signature
        # expects the same value in both slots; TODO confirm against the
        # {{mn-variant}} template definition.
        t.add("1", m, before="m")
        t.add("2", m, before="m")
      c = getp("c")
      if c:
        t.add("3", c, before="c")
      rmparam(t, "m")
      rmparam(t, "c")
      if origt != unicode(t):
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append(
          "Convert m=/c= in {{mn-variant}} to numbered params")
  return unicode(parsed), notes
def process_section(index, pagetitle, sectext):
  # Report Old English pronunciation status for a section: emit
  # <from>/<to> lines for raw {{IPA|ang}} templates so they can be
  # converted to {{ang-IPA}}, and warn when no pronunciation exists.
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  parsed = blib.parse_text(sectext)
  head = None
  # Collect the headword(s); warn if different templates disagree.
  for t in parsed.filter_templates():
    newhead = get_head_param(t, pagetitle)
    if newhead is not None:
      newhead = [blib.remove_links(x) for x in newhead]
      if head and head != newhead:
        pagemsg("WARNING: Saw multiple heads %s and %s" % (",".join(head), ",".join(newhead)))
      head = newhead
  if not head:
    pagemsg("WARNING: Couldn't find head")
  saw_pronun = False
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "IPA":
      if getparam(t, "1") != "ang":
        pagemsg("WARNING: Wrong-language IPA template: %s" % unicode(t))
        continue
      pagemsg("<from> %s <to> {{ang-IPA|%s}} <end>" % (unicode(t),
        "|".join(head) or "<<%s>>" % pagetitle))
      saw_pronun = True
    elif tn == "ang-IPA":
      pagemsg("Saw existing pronunciation: %s" % unicode(t))
      saw_pronun = True
  if not saw_pronun:
    pagemsg(
      "WARNING: Didn't see pronunciation for headword %s <new> {{ang-IPA|%s}} <end>"
      % (",".join(head), "|".join(head)))
def process_page(page, index, parsed):
  """Strip sort= from French headword templates on the page.

  Returns (new page text, notes); pages with ":" in the title are skipped."""
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return
  notes = []
  page_text = unicode(page.text)
  parsed = blib.parse_text(page_text)
  for template in parsed.filter_templates():
    before = unicode(template)
    template_name = unicode(template.name)
    if template_name not in fr_head_templates:
      continue
    # sort= is redundant on French headword templates.
    rmparam(template, "sort")
    after = unicode(template)
    if before != after:
      pagemsg("Replacing %s with %s" % (before, after))
      notes.append("remove sort= from {{%s}}" % template_name)
  return unicode(parsed), notes
def process_page(index, page):
  # Warn about Russian ===Adjective=== sections that lack an adjective
  # headword template ({{ru-adj}} or {{head|ru|adjective form}}).
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  foundrussian = False
  # Split into (header, body) pairs: odd indices are L2 headers.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  for j in xrange(2, len(sections), 2):
    if sections[j - 1] == "==Russian==\n":
      if foundrussian:
        pagemsg(
          "WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True
      found_headword_template = False
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        # NOTE: local `tname` shadows the module-level tname() helper.
        tname = unicode(t.name)
        if tname == "ru-adj" or (tname == "head" and getparam(t, "1") == "ru"
            and getparam(t, "2") == "adjective form"):
          found_headword_template = True
      if not found_headword_template and "===Adjective===" in sections[j]:
        pagemsg("WARNING: Missing adj headword template")
def process_page(page, index, parsed):
  # Convert grave accents to acute in the head param of Bulgarian headword
  # templates. Returns (new page text, list of change notes).
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  notes = []
  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    param = None
    # Pick which param carries the headword for this template.
    if tn in ["bg-noun", "bg-proper noun", "bg-verb", "bg-adj", "bg-adv",
        "bg-part", "bg-part form", "bg-verbal noun", "bg-verbal noun form",
        "bg-phrase"]:
      param = "1"
    elif tn == "head" and getparam(t, "1") == "bg":
      param = "head"
    if param:
      val = getparam(t, param)
      # Decompose so the combining grave (GR) is a separate character.
      val = bglib.decompose(val)
      if GR in val:
        val = val.replace(GR, AC)
        t.add(param, val)
        notes.append("convert grave to acute in {{%s}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def do_process_text_on_page(index, pagename, text, adj):
  # Convert old Bulgarian declension templates ({{bg-decl-adj}} or
  # {{bg-noun-*}}) to the new {{bg-adecl}}/{{bg-ndecl}}, seeding them with
  # the headword from the corresponding headword template.
  # `adj` selects adjective vs. noun handling. Returns (text, notes).
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))
  pagemsg("Processing")
  notes = []
  if "==Etymology 1==" in text or "==Pronunciation 1==" in text:
    pagemsg("WARNING: Saw Etymology/Pronunciation 1, can't handle yet")
    return
  parsed = blib.parse_text(text)
  headword = None
  for t in parsed.filter_templates():
    tn = tname(t)
    # Remember the most recent headword so the following declension
    # template can use it.
    if tn in (adj and ["bg-adj"] or ["bg-noun", "bg-proper noun"]):
      headword = getparam(t, "1")
    if (tn == "bg-decl-adj" if adj else tn.startswith("bg-noun-")):
      origt = unicode(t)
      if not headword:
        pagemsg("WARNING: Saw %s without {{%s}} headword" % (origt,
          "bg-adj" if adj else "bg-noun"))
        continue
      # Wipe all old params and rebuild with the new single-arg signature.
      del t.params[:]
      t.add("1", "%s<>" % headword)
      blib.set_template_name(t, "bg-adecl" if adj else "bg-ndecl")
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
      notes.append("convert {{%s}} to {{%s}}" % (tn, tname(t)))
  return text, notes
def process_text_on_page(index, pagename, text):
  # Clean up {{RQ:Buk Baibel}}: canonicalize the book name in 1= via
  # book_map, and rename 4= to passage=. Returns (new text, notes).
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))
  pagemsg("Processing")
  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "RQ:Buk Baibel":
      param1 = getparam(t, "1")
      if param1 in book_map:
        t.add("1", book_map[param1])
        notes.append("convert '%s' to '%s' in 1= in {{%s}}"
          % (param1, book_map[param1], tn))
      param4 = getparam(t, "4")
      if param4:
        # before="4" keeps passage= in the slot 4= used to occupy.
        t.add("passage", param4, before="4")
        rmparam(t, "4")
        notes.append("4= -> passage= in {{%s}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page(index, pagename, text):
  """Convert {{head|la|<pos>}} to the dedicated Latin headword template
  from pos_to_template, tagging each conversion with FIXME=1 for review.

  Returns (new page text, list of change notes)."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))
  pagemsg("Processing")
  notes = []
  parsed = blib.parse_text(text)
  for template in parsed.filter_templates():
    if tname(template) != "head" or getparam(template, "1") != "la":
      continue
    part_of_speech = getparam(template, "2")
    if part_of_speech not in pos_to_template:
      pagemsg("WARNING: Saw unrecognized part of speech %s: %s"
        % (part_of_speech, unicode(template)))
      continue
    # Extra head info would be lost by the conversion; leave it alone.
    if getparam(template, "3") or getparam(template, "head"):
      pagemsg("WARNING: Saw 3= or head=: %s" % unicode(template))
      continue
    original = unicode(template)
    template.add("1", pagename)
    blib.set_template_name(template, pos_to_template[part_of_speech])
    rmparam(template, "2")
    template.add("FIXME", "1")
    pagemsg("Replaced %s with %s" % (original, unicode(template)))
    notes.append("replace {{head|la|%s}} with {{%s}}"
      % (part_of_speech, tname(template)))
  return unicode(parsed), notes
def investigate_possible_adj(index, adj_pagename, adv, adv_defns):
  # Given an adverb `adv` and a candidate adjective page, look up the
  # adjective's Latin section and log "adv /// adj /// adv defns /// adj
  # defns" lines for offline comparison.
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, adj_pagename, txt))
  pagemsg("Trying for adverb %s" % adv)
  page = pywikibot.Page(site, adj_pagename)
  if not page.exists():
    pagemsg("Doesn't exist for adverb %s" % adv)
    return
  text = unicode(page.text)
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return
  sections, j, secbody, sectail, has_non_latin = retval
  subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsections), 2):
    parsed = blib.parse_text(subsections[k])
    for t in parsed.filter_templates():
      origt = unicode(t)
      tn = tname(t)
      if tn in ["la-adj", "la-part"]:
        adj = lalib.la_get_headword_from_template(
          t, adj_pagename, pagemsg)[0]
        adj_defns = lalib.find_defns(subsections[k])
        msg("%s /// %s /// %s /// %s" % (adv, adj, ";".join(adv_defns),
          ";".join(adj_defns)))
def process_text_on_page(index, pagetitle, text):
  """Drop a redundant 1= (equal to the page title) from {{es-IPA}},
  {{fr-IPA}} and {{it-IPA}} when no respellings 2=..10= are present.

  Returns (new text, notes), or None if no such template can occur."""
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  # Cheap pre-check before parsing the whole page.
  if "es-IPA" not in text and "fr-IPA" not in text and "it-IPA" not in text:
    return
  parsed = blib.parse_text(text)
  for template in parsed.filter_templates():
    template_name = tname(template)
    original = unicode(template)
    if template_name not in ["es-IPA", "fr-IPA", "it-IPA"]:
      continue
    has_extra_respellings = False
    for position in xrange(2, 11):
      if getparam(template, str(position)):
        pagemsg("Template has %s=, not touching: %s" % (position, original))
        has_extra_respellings = True
        break
    if has_extra_respellings:
      continue
    first_param = getparam(template, "1")
    if first_param == pagetitle:
      rmparam(template, "1")
      notes.append("remove redundant 1=%s from {{%s}}" % (first_param, template_name))
    if unicode(template) != original:
      pagemsg("Replaced %s with %s" % (original, unicode(template)))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  """Normalize gender codes (ms/fs/mp/fp -> m-s/f-s/m-p/f-p) in g= of Hindi
  non-lemma headword templates. Returns (new page text, list of notes)."""
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  notes = []
  parsed = blib.parse_text(text)
  # Old-style -> new-style gender codes.
  gender_map = {"ms": "m-s", "fs": "f-s", "mp": "m-p", "fp": "f-p"}
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn in ["hi-noun form", "hi-verb form", "hi-adj form"]:
      g = getparam(t, "g")
      newg = gender_map.get(g)
      # FIXED: previously `if g != newg:` fired whenever g was empty or
      # unrecognized (newg None), clobbering g= with None. Only rewrite
      # when we actually recognized an old-style code.
      if newg is not None and g != newg:
        t.add("g", newg)
        notes.append("fix gender in {{%s}}" % tn)
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page_for_single_word(index, pagename, text, spec):
  # Add a conjugation spec to {{es-verb}} templates marked with attn=1.
  # `spec` is either an angle-bracket spec ("<...>", appended to the
  # pagename in 1=), "*" (accept the default), or a pres= value.
  # Returns (new page text, list of change notes).
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))
  pagemsg("Processing")
  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "es-verb":
      # attn=1 marks templates awaiting conjugation data.
      if not getparam(t, "attn"):
        pagemsg("Didn't see attn=1: %s" % unicode(t))
        continue
      rmparam(t, "attn")
      if "<" in spec:
        t.add("1", "%s%s" % (pagename, spec))
        notes.append("add conjugation %s%s to Spanish verb" % (pagename, spec))
      elif spec == "*":
        notes.append("add conjugation (default) to Spanish verb")
      else:
        t.add("pres", spec)
        notes.append("add conjugation pres=%s to Spanish verb" % spec)
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page_for_full_conj(index, pagename, text, verbs):
  # Add a full conjugation entry (from the `verbs` map) to {{es-verb}}
  # templates marked attn=1, and convert {{head|es|verb}} to {{es-verb}}.
  # The entry is simplified when it matches the default for a multiword
  # verb. Returns (new page text, list of change notes).
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))
  pagemsg("Processing")
  notes = []
  if pagename not in verbs:
    pagemsg("WARNING: Can't find entry, skipping")
    return
  entry = verbs[pagename]
  origentry = entry
  # Default entry for a multiword verb: "first<> [[word2]] [[word3]] ...".
  first, rest = pagename.split(" ", 1)
  restwords = rest.split(" ")
  def_link = "%s<> %s" % (first, " ".join("[[%s]]" % word for word in restwords))
  if def_link == entry:
    pagemsg("Replacing entry '%s' with a blank entry because it's the default" % entry)
    entry = ""
  elif re.sub("<.*?>", "<>", entry) == def_link:
    # Same shape as the default apart from the angle-bracket spec; the
    # links themselves are then redundant.
    newentry = blib.remove_links(entry)
    pagemsg("Replacing entry '%s' with entry without links '%s'" % (entry, newentry))
    entry = newentry
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "es-verb":
      if not getparam(t, "attn"):
        pagemsg("Didn't see attn=1: %s" % unicode(t))
        continue
      rmparam(t, "attn")
      if entry:
        t.add("1", entry)
        notes.append("add conjugation '%s' to Spanish verb" % entry)
      else:
        notes.append("add conjugation (default) to Spanish verb")
    if tn == "head" and getparam(t, "1") == "es" and getparam(t, "2") == "verb":
      head = getparam(t, "head")
      if head:
        pagemsg("WARNING: Removing head=%s compared with entry '%s', original entry '%s': %s"
          % (head, entry, origentry, unicode(t)))
      rmparam(t, "head")
      rmparam(t, "2")
      rmparam(t, "1")
      blib.set_template_name(t, "es-verb")
      if entry:
        t.add("1", entry)
        notes.append("convert {{head|es|verb}} to {{es-verb|%s}}" % entry)
      else:
        notes.append("convert {{head|es|verb}} to {{es-verb}}")
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_page_for_fix(page, index, parsed):
  # Migrate Kurdish entries from code "ku" to "kmr": convert raw [[...]]
  # links to {{l|kmr|...}} and retarget {{l|ku}} / {{rhymes nav|ku}};
  # warn on other ku-language templates. Returns (new text, notes).
  pagename = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagename, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagename, txt))
  pagemsg("Processing")
  notes = []
  text = unicode(page.text)
  newtext = re.sub(r"\[\[(.*?)\]\]", r"{{l|kmr|\1}}", text)
  if newtext != text:
    notes.append("convert raw links to {{l|kmr|...}}")
    text = newtext
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in ["l", "rhymes nav"] and getparam(t, "1") == "ku":
      t.add("1", "kmr")
      notes.append("convert {{%s|ku}} to {{%s|kmr}}" % (tn, tn))
    elif getparam(t, "1") == "ku":
      # Other templates with 1=ku need manual review.
      pagemsg("WARNING: Kurdish-language template of unrecognized name: %s" % unicode(t))
    if origt != unicode(t):
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  text = unicode(parsed)
  return text, notes
def process_text_on_page(index, pagetitle, text):
  # For each {{de-conj}}, expand a parallel props-generating template and
  # log the verb class. Currently report-only: the template itself is not
  # modified. Returns (new text, notes).
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))
  global args
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  pagemsg("Processing")
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    newarg1 = None
    if tn == "de-conj":
      # Rewrite the template call to the props-generating version and
      # expand it server-side.
      generate_template = re.sub(r"^\{\{de-conj(?=[|}])",
        "{{User:Benwing2/de-generate-verb-props", unicode(t))
      result = expand_text(generate_template)
      if not result:
        continue
      forms = blib.split_generate_args(result)
      pagemsg("For %s, class=%s" % (unicode(t), forms["class"]))
    if unicode(t) != origt:
      pagemsg("Replaced <%s> with <%s>" % (origt, unicode(t)))
  return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
  # Sanity-check Latin headword templates: the macron-stripped, link-free
  # headword must match the page title (with Reconstruction: pages
  # compared against "*" + subpage name). Warns on mismatch.
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  if not args.stdin:
    pagemsg("Processing")
  retval = lalib.find_latin_section(text, pagemsg)
  if retval is None:
    return None, None
  sections, j, secbody, sectail, has_non_latin = retval
  parsed = blib.parse_text(secbody)
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in lalib.la_headword_templates:
      for head in lalib.la_get_headword_from_template(
          t, pagetitle, pagemsg):
        no_macrons_head = remove_macrons(blib.remove_links(head))
        if pagetitle.startswith("Reconstruction"):
          unprefixed_title = "*" + re.sub(".*/", "", pagetitle)
        else:
          unprefixed_title = pagetitle
        if no_macrons_head != unprefixed_title:
          pagemsg("WARNING: Bad Latin head: %s" % unicode(t))
  return None, None
def process_text_on_page(index, pagetitle, text):
  """Mark {{de-noun}}/{{de-proper noun}} templates with old=1 when they
  carry none of the old-signature params. Returns (new text, notes)."""
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  notes = []
  parsed = blib.parse_text(text)
  # Params that indicate the template already uses the old signature.
  old_signature_params = [
    "old", "2", "3", "4", "g1", "g2", "g3", "gen1", "gen2", "gen3",
    "pl1", "pl2", "pl3"
  ]
  for template in parsed.filter_templates():
    template_name = tname(template)
    if template_name not in ["de-noun", "de-proper noun"]:
      continue
    def getp(param):
      return getparam(template, param)
    already_old = any(getp(param) for param in old_signature_params)
    if not already_old:
      template.add("old", "1")
      notes.append(
        "add old=1 to {{%s}} because compatible with new signature" % template_name)
  return unicode(parsed), notes
def replace_spenser_fq(m):
  # Regex-replacement helper: reformat a {{RQ:Spenser FQ}} template plus
  # following quote text into a single {{RQ:Spenser Faerie Queene}} with
  # book=/canto= in Roman numerals and the quote inline as passage=.
  # Returns the replacement text, or m.group(0) unchanged on failure.
  template, text = m.groups()
  parsed = blib.parse_text(template)
  t = list(parsed.filter_templates())[0]
  par2 = getparam(t, "2")
  if par2:
    canto = arabic_to_roman(par2)
    if not canto:
      return m.group(0)
    t.add("canto", canto, before="2")
    rmparam(t, "2")
  par1 = getparam(t, "1")
  if par1:
    book = arabic_to_roman(par1)
    if not book:
      return m.group(0)
    t.add("book", book, before="1")
    rmparam(t, "1")
  # Collapse <br> line breaks to " / " and strip a {{quote|en|...}} wrapper.
  text = re.sub(r"\s*<br */?>\s*", " / ", text)
  text = re.sub(r"^\{\{quote\|en\|(.*)\}\}$", r"\1", text)
  t.add("passage", text)
  blib.set_template_name(t, "RQ:Spenser Faerie Queene")
  # NOTE: appends to a `notes` list from the enclosing scope.
  notes.append(
    "reformat {{RQ:Spenser FQ}} into {{RQ:Spenser Faerie Queene}}")
  return unicode(t) + "\n"
def process_page(index, page, save, verbose):
  # Strip sort= from French headword templates and optionally save the
  # page (driver variant of the same fix that returns text elsewhere).
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return
  text = unicode(page.text)
  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    # NOTE: local `name` holds the template name (tname() not used here).
    name = unicode(t.name)
    if name in fr_head_templates:
      rmparam(t, "sort")
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replacing %s with %s" % (origt, newt))
        notes.append("remove sort= from {{%s}}" % name)
  newtext = unicode(parsed)
  if newtext != text:
    # Any text change must be accounted for by a note.
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def undo_one_page_greek_removal(page, index, text):
  # Undo a previous removal of a Greek template param: reconstruct the
  # "from" (param removed) and "to" (param restored) template forms from
  # `template_text` / `removed_param` (taken from the enclosing scope --
  # TODO confirm they are globals set by the caller) and substitute.
  # Returns (new text, changelog string).
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, unicode(page.title()), txt))
  template = blib.parse_text(template_text).filter_templates()[0]
  orig_template = unicode(template)
  if getparam(template, "sc") == "polytonic":
    template.remove("sc")
  to_template = unicode(template)
  param_value = getparam(template, removed_param)
  template.remove(removed_param)
  from_template = unicode(template)
  text = unicode(text)
  found_orig_template = orig_template in text
  newtext = text.replace(from_template, to_template)
  changelog = ""
  if newtext == text:
    if not found_orig_template:
      pagemsg("WARNING: Unable to locate 'from' template when undoing Greek param removal: %s"
        % from_template)
    else:
      pagemsg("Original template found, taking no action")
  else:
    if found_orig_template:
      pagemsg("WARNING: Undid removal, but original template %s already present!"
        % orig_template)
    # Length check detects multiple replacements by str.replace().
    if len(newtext) - len(text) != len(to_template) - len(from_template):
      pagemsg("WARNING: Length mismatch when undoing Greek param removal, may have matched multiple templates: from=%s, to=%s" % (
        from_template, to_template))
    changelog = "Undid removal of %s=%s in %s" % (removed_param, param_value,
      to_template)
    pagemsg("Change log = %s" % changelog)
  return newtext, changelog
def find_head_comp_sup(pagetitle, pagemsg):
  # Fetch the page and return (head, comparative, superlative) from the
  # first {{la-adv}} found, deriving defaults from the adverb ending when
  # comp/sup are not given. Returns (None, None, None) if no {{la-adv}}.
  page = pywikibot.Page(site, pagetitle)
  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    if tname(t) == "la-adv":
      head = getparam(t, "1")
      comp = getparam(t, "comp") or getparam(t, "2")
      sup = getparam(t, "sup") or getparam(t, "3")
      if not comp or not sup:
        # Strip a known adverb suffix to get the stem for default forms.
        for suff in [
            "iter", "nter", "ter", "er", u"iē", u"ē", "im", u"ō"
        ]:
          m = re.search("^(.*?)%s$" % suff, head)
          if m:
            stem = m.group(1)
            if suff == "nter":
              stem += "nt"
            default_comp = stem + "ius"
            default_sup = stem + u"issimē"
            break
        else:
          pagemsg(
            "WARNING: Didn't recognize ending of adverb headword %s"
            % head)
          return head, comp, sup
      # NOTE: when both comp and sup are explicitly given, default_comp/
      # default_sup are unbound but never evaluated thanks to `or`
      # short-circuiting.
      comp = comp or default_comp
      sup = sup or default_sup
      return head, comp, sup
  return None, None, None
def process_page(page, index, parsed):
  # Move head= to 1= in {{ang-adj}}, warning about {{head|ang|adjective*}}
  # and about {{ang-adj}} templates that already have 1=.
  # Returns (parsed wikicode, list of change notes).
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  notes = []
  text = unicode(page.text)
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    if tn == "head" and getparam(t, "1") == "ang" and getparam(
        t, "2") in ["adjective", "adjectives"]:
      pagemsg("WARNING: {{head}} for adjectives, should not occur: %s"
        % unicode(t))
    elif tn == "ang-adj":
      if getparam(t, "1"):
        pagemsg("WARNING: 1= in ang-adj, should not occur: %s" % unicode(t))
      else:
        head = getparam(t, "head")
        rmparam(t, "head")
        if head:
          t.add("1", head)
          notes.append("move head= to 1= in {{ang-adj}}")
    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))
  # NOTE: returns the parsed object, not unicode(parsed), unlike sibling
  # functions in this file.
  return parsed, notes
def process_text_on_page(index, pagetitle, text):
  # Expand each rhymes template (from args.rhymes_templates) and report
  # the [[Category:Rhymes:...]] categories it generates, honoring
  # --skip-langs / --include-langs filters.
  global args
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)
  notes = []
  parsed = blib.parse_text(text)
  # Command-line args arrive as bytes in Python 2; decode them.
  rhymes_templates = args.rhymes_templates.decode("utf-8").split(",")
  if args.skip_langs:
    skip_lang_codes = args.skip_langs.decode("utf-8").split(",")
  else:
    skip_lang_codes = []
  if args.include_langs:
    include_lang_codes = args.include_langs.decode("utf-8").split(",")
  else:
    include_lang_codes = []
  for t in parsed.filter_templates():
    if tname(t) in rhymes_templates:
      langcode = getparam(t, "1")
      if include_lang_codes and getparam(t, "1") not in include_lang_codes:
        continue
      if skip_lang_codes and langcode in skip_lang_codes:
        continue
      expanded = expand_text(unicode(t))
      if not expanded:
        continue
      for cattext in re.findall(r"\[\[Category:Rhymes:.*?\]\]", expanded):
        # Strip the surrounding [[ ]] for the log message.
        pagemsg("Found rhymes category: %s" % cattext[2:-2])
def process_page(index, page):
  # Warn about Russian ===Adjective=== sections lacking an adjective
  # headword template. (Near-duplicate of the earlier process_page with
  # the same body -- TODO confirm whether both copies are needed.)
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  text = unicode(page.text)
  foundrussian = False
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True
      found_headword_template = False
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        # NOTE: local `tname` shadows the module-level tname() helper.
        tname = unicode(t.name)
        if tname == "ru-adj" or (tname == "head" and getparam(t, "1") == "ru"
            and getparam(t, "2") == "adjective form"):
          found_headword_template = True
      if not found_headword_template and "===Adjective===" in sections[j]:
        pagemsg("WARNING: Missing adj headword template")
def get_pl_p_property(index, pagetitle):
  # Return (and memoize in pages_with_pl_p) the {{pl-p}} status of a page:
  # ("pl-p-respelling", [respellings]) when {{pl-p}} has respelling args,
  # ("pl-p-no-respelling", None) when present without them, or
  # ("no-pl-p", None) when absent.
  if pagetitle in pages_with_pl_p:
    return pages_with_pl_p[pagetitle]
  page = pywikibot.Page(site, pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagetext = blib.safe_page_text(page, pagemsg)
  parsed = blib.parse_text(pagetext)
  saw_pl_p = False
  respellings = []
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["pl-p", "pl-pronunciation"]:
      def getp(param):
        return getparam(t, param)
      saw_pl_p = True
      # Collect distinct respellings from positional params 1=..10=.
      for pno in range(1, 11):
        respelling = getp(str(pno))
        if respelling and respelling not in respellings:
          respellings.append(respelling)
  if respellings:
    retval = ("pl-p-respelling", respellings)
  elif saw_pl_p:
    retval = ("pl-p-no-respelling", None)
  else:
    retval = ("no-pl-p", None)
  pages_with_pl_p[pagetitle] = retval
  return retval
def find_noun(pagename, pagemsg, errandpagemsg, expand_text):
  # Find the single noun lemma on `pagename` via {{ru-noun+}} expansion.
  # Returns the lemma string, -1 if the page has an Etymology section
  # (ambiguous), or None on failure/no noun found.
  section = blib.find_lang_section(pagename, "Russian", pagemsg, errandpagemsg)
  if not section:
    return None
  if "==Etymology" in section:
    return -1
  parsed = blib.parse_text(section)
  nouns = []
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun+":
      # Expand the parallel forms-generating template to get the lemma.
      generate_template = re.sub(r"^\{\{ru-noun\+",
        "{{ru-generate-noun-forms", unicode(t))
      generate_result = expand_text(generate_template)
      if not generate_result:
        pagemsg("WARNING: Error generating noun forms")
        return None
      args = blib.split_generate_args(generate_result)
      # Plural-only nouns have no nom_sg; fall back to nom_pl.
      lemma = args["nom_sg"] if "nom_sg" in args else args["nom_pl"]
      if "," in lemma:
        pagemsg("WARNING: Lemma has multiple forms: %s" % lemma)
        return None
      if lemma not in nouns:
        nouns.append(lemma)
  if len(nouns) > 1:
    pagemsg("WARNING: Multiple lemmas for noun: %s" % ",".join(nouns))
  if not nouns:
    return None
  return nouns[0]
def replace_trans(m, newlangcode, newlangname):
  """Regex-replacement helper: rewrite Kurdish ("ku") translation templates
  in a translation line to `newlangcode`/`newlangname`.

  m.groups() is (prefix, translation text); returns the rewritten line.
  Relies on pagemsg/notes/trans_templates from the enclosing scope."""
  prefix, transtext = m.groups()
  parsed = blib.parse_text(transtext)
  for t in parsed.filter_templates():
    origt = unicode(t)
    tn = tname(t)
    if tn in trans_templates:
      if getparam(t, "1") == "ku":
        t.add("1", newlangcode)
        rmparam(t, "sc")
        pagemsg(
          "Replaced %s with %s based on language prefix of translation entry"
          % (origt, unicode(t)))
        notes.append(
          "{{%s|ku}} -> {{%s|%s}} based on language prefix of translation entry"
          % (tn, tn, newlangcode))
    elif tn == "t-simple":
      if getparam(t, "1") == "ku":
        # FIXED: was getparam(t, "langname" != "Kurdish"), which compared
        # the two strings first and looked up a bogus parameter, so the
        # sanity check never triggered correctly.
        if getparam(t, "langname") != "Kurdish":
          pagemsg(
            "WARNING: Something wrong, t-simple|ku without langname=Kurdish: %s"
            % unicode(t))
        else:
          t.add("1", newlangcode)
          t.add("langname", newlangname)
          pagemsg("Replaced %s with %s based on prefix" % (origt, unicode(t)))
          notes.append(
            "{{t-simple|ku|langname=Kurdish}} -> {{t-simple|%s|langname=%s}} based on language prefix"
            % (newlangcode, newlangname))
  transtext = unicode(parsed)
  return prefix + transtext
def test_infer():
  """Drive infer_one_page_decls() over the canned test_templates fixtures,
  logging the resulting text and change comment for manual inspection."""
  class Page:
    # Minimal stand-in for a pywikibot Page: only title() is needed.
    def title(self):
      return "test_infer"
  for sample in test_templates:
    parsed = blib.parse_text(sample)
    result_text, result_comment = infer_one_page_decls(Page(), 1, parsed)
    msg("newtext = %s" % unicode(result_text))
    msg("comment = %s" % result_comment)
def get_form_class(k):
  """Return the verb form class (param 1 of {{ar-verb}}/{{ar-verb-form}})
  found in the etymology section being scanned, or None if none carries one.
  Warns when two templates in the same etymology disagree; the later value
  wins."""
  # NOTE(review): parameter `k` is unused -- the body reads `etymologies[j]`
  # with `j` taken from the enclosing scope. Confirm this is intentional and
  # not a k/j typo.
  formclass = None
  parsed = blib.parse_text(etymologies[j])
  for t in parsed.filter_templates():
    if t.name in ["ar-verb", "ar-verb-form"]:
      newformclass = getparam(t, "1")
      # Conflicting form classes in one etymology indicate a data problem;
      # warn but continue, keeping the most recent value.
      if formclass and newformclass and formclass != newformclass:
        pagemsg("WARNING: Something wrong: Two different verb form classes in same etymology: %s != %s" % (formclass, newformclass))
      formclass = newformclass
  return formclass
def process_page(index, page, save, verbose):
  """Add lang= to {{audio}} templates that lack it, deriving the language
  code from the enclosing L2 (==Language==) section header.

  Codes are resolved via a Lua invoke (getByCanonicalName) and memoized in
  the module-level langs_to_codes dict. When SAVE is true the page is saved
  with a fixed comment; otherwise the would-be save is only logged.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  pagemsg("Processing")
  # Pages with colons are non-mainspace (or otherwise special); skip them.
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return
  text = unicode(page.text)
  notes = []  # NOTE(review): never appended to in this variant.
  # Split on L2 headers, keeping the headers: odd indices are "==Lang==\n"
  # headers, even indices (from 2) are the section bodies.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  for j in xrange(2, len(sections), 2):
    m = re.search("^==(.*?)==\n", sections[j-1])
    lang = m.group(1)
    parsed = blib.parse_text(sections[j])
    for t in parsed.filter_templates():
      if unicode(t.name) == "audio" and not getparam(t, "lang"):
        origt = unicode(t)
        if lang in langs_to_codes:
          langcode = langs_to_codes[lang]
        else:
          # Resolve canonical language name -> code via the wiki's Lua
          # languages module, then cache the answer.
          langcode = expand_text("{{#invoke:languages/templates|getByCanonicalName|%s|getCode}}" % lang)
          if not langcode:
            pagemsg("WARNING: Unable to find code for lang %s" % lang)
            continue
          langs_to_codes[lang] = langcode
        t.add("lang", langcode)
        newt = unicode(t)
        if origt != newt:
          pagemsg("Replaced %s with %s" % (origt, newt))
    sections[j] = unicode(parsed)
  new_text = "".join(sections)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    comment = "add lang code to audio templates"
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Strip gender params (g=, g2=, g3=, g4=) from Russian adjective-form
  headwords, i.e. {{head|ru|adjective form}} templates, in the page's
  single ==Russian== section. Saves with a grouped-notes comment when SAVE
  is true."""
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  # Pages with colons are non-mainspace (or otherwise special); skip them.
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return
  text = unicode(page.text)
  notes = []
  foundrussian = False
  # Split on L2 headers, keeping the headers: odd indices are "==Lang==\n"
  # headers, even indices (from 2) are the section bodies.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      # A page with two Russian sections is malformed; don't guess.
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True
      # Remove gender from adjective forms
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "head" and getparam(t, "1") == "ru" and getparam(t, "2") == "adjective form":
          origt = unicode(t)
          rmparam(t, "g")
          rmparam(t, "g2")
          rmparam(t, "g3")
          rmparam(t, "g4")
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("remove gender from adjective forms")
      sections[j] = unicode(parsed)
  new_text = "".join(sections)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any text change must have produced at least one note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def combine_verbs(m):
  """Regex-replacement callback: merge two adjacent verb conjugation
  templates (match groups 1 and 3, with group 2 the intervening text) into
  a single template whose numbered params are joined with "or".

  Bails out (returning the original match text unchanged) when there is
  intervening text, non-numeric params, a missing verb type, or differing
  verb types. On success returns the merged template text and records a
  note.
  """
  verb1 = m.group(1)
  verb2 = m.group(3)
  # Intervening text would have to be preserved as a footnote; punt.
  if m.group(2):
    pagemsg("WARNING: Would combine verbs but found text '%s' needing to go into a note, skipping: %s and %s" % (m.group(2), verb1, verb2))
    return m.group(0)
  t1 = blib.parse_text(verb1).filter_templates()[0]
  t2 = blib.parse_text(verb2).filter_templates()[0]
  # Named params can't be merged mechanically; only all-numbered templates
  # are safe to combine.
  for t in [t1, t2]:
    for param in t.params:
      if not re.search("^[0-9]+$", unicode(param.name)):
        pagemsg("Verb conjugation has non-numeric args, skipping: %s" % unicode(t))
        return m.group(0)
  params = fetch_numbered_params(t1)
  params.append("or")
  newparams = fetch_numbered_params(t2)
  if len(newparams) < 2:
    pagemsg("WARNING: Something wrong, no verb type in ru-conj: %s" % unicode(t2))
    return m.group(0)
  vt1 = getparam(t1, "1")
  vt2 = getparam(t2, "1")
  if vt1 != vt2:
    pagemsg("WARNING: Can't combine verbs of different verb types: %s and %s" % (verb1, verb2))
    return m.group(0)
  # Drop the second template's (identical) verb type before appending its
  # remaining numbered params after the "or" separator.
  del newparams[0]
  params.extend(newparams)
  blib.set_param_chain(t1, params, "1", "")
  # NOTE(review): these log/note lines read param 1 of t1 *after*
  # set_param_chain rewrote it -- confirm the intended values are logged.
  pagemsg("Combining verb conjugations %s and %s" % (
    getparam(t1, "1"), getparam(t2, "1")))
  pagemsg("Replaced %s with %s" % (m.group(0).replace("\n", r"\n"), unicode(t1)))
  notes.append("combined verb conjugations %s and %s" % (
    getparam(t1, "1"), getparam(t2, "1")))
  return unicode(t1)
def find_old_template_props(template, pagemsg, verbose):
  """Extract {{fr-conj}} verb-property args from TEMPLATE's definition page.

  Fetches (and caches in cached_template_calls) the wikitext of
  Template:<name>, finds the {{fr-conj}} invocation inside it, substitutes
  TEMPLATE's own args into {{{1}}}/{{{2}}}/{{{pp|...}}} placeholders, and
  returns a dict mapping canonicalized property names (dots -> underscores,
  restricted to all_verb_props) to their values, with ".alt" variants
  joined by commas. Returns None when the template page is missing or no
  {{fr-conj}} call is found.
  """
  name = unicode(template.name)
  if name in cached_template_calls:
    template_text = cached_template_calls[name]
  else:
    template_page = pywikibot.Page(site, "Template:%s" % name)
    # BUGFIX: was `if not page.exists():` -- `page` is undefined here; the
    # page just fetched is `template_page`, so the old code raised NameError
    # (or tested the wrong page) on every cache miss.
    if not template_page.exists():
      pagemsg("WARNING: Can't locate template 'Template:%s'" % name)
      return None
    template_text = unicode(template_page.text)
    cached_template_calls[name] = template_text
  if verbose:
    pagemsg("Found template text: %s" % template_text)
  for t in blib.parse_text(template_text).filter_templates():
    # NB: this local shadows the module-level tname() helper; harmless here.
    tname = unicode(t.name).strip() # template name may have spaces
    # `and` binds tighter than `or`: match a direct {{fr-conj}} call, or a
    # module invoke whose first arg is "frconj".
    if tname == "fr-conj" or tname == "#invoke:fr-conj" and getparam(t, "1").strip() == "frconj":
      args = {}
      # Yuck. Template param names sometimes have spaces in them; must strip.
      tparams = [(unicode(param.name.strip()), unicode(param.value.strip()))
          for param in t.params]
      tparamdict = dict(tparams)
      debug_args = []
      def sub_template(val):
        # Substitute the caller's args for the placeholder params used in
        # the template definition.
        val = re.sub(r"\{\{\{1\|?\}\}\}", getparam(template, "1"), val)
        val = re.sub(r"\{\{\{2\|?\}\}\}", getparam(template, "2"), val)
        val = re.sub(r"\{\{\{pp\|(.*?)\}\}\}",
            lambda m: getparam(template, "pp") or m.group(1), val)
        return val
      for pname, pval in tparams:
        canonpname = re.sub(r"\.", "_", pname)
        if canonpname in all_verb_props:
          pval = sub_template(pval)
          # A "<name>.alt" param supplies an alternative form, joined in
          # with a comma below.
          pnamealt = pname + ".alt"
          pvalalt = tparamdict.get(pnamealt, "")
          pvalalt = sub_template(pvalalt)
          if pval in ["N/A", "-"]:
            pval = ""
          if pvalalt in ["N/A", "-"]:
            pvalalt = ""
          vals = [x for x in [pval, pvalalt] if x]
          pval = ",".join(vals)
          # An em-dash marks a nonexistent form; record only real values.
          if pval and not re.search(r"—", pval):
            debug_args.append("%s=%s" % (canonpname, pval))
            args[canonpname] = pval
      pagemsg("Found args: %s" % "|".join(debug_args))
      return args
  pagemsg("WARNING: Can't find {{fr-conj}} in template definition for %s" % unicode(template))
  return None
def process_page(index, page, save, verbose): pagetitle = unicode(page.title()) subpagetitle = re.sub("^.*:", "", pagetitle) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") if ":" in pagetitle: pagemsg("WARNING: Colon in page title, skipping") return def expand_text(tempcall): return blib.expand_text(tempcall, pagetitle, pagemsg, verbose) origtext = page.text parsed = blib.parse_text(origtext) # Find the declension arguments for LEMMA and inflected form INFL, # the WORDINDth word in the expression. Return value is a tuple of # four items: a list of (NAME, VALUE) tuples for the arguments, whether # the word is an adjective, the value of n= (if given), and the value # of a= (if given). def find_decl_args(lemma, infl, wordind): declpage = pywikibot.Page(site, lemma) if ru.remove_accents(infl) == lemma: wordlink = "[[%s]]" % infl else: wordlink = "[[%s|%s]]" % (lemma, infl) if not declpage.exists(): if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma): pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) return [("1", wordlink), ("2", "+")], True, None, None else: pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None parsed = blib.parse_text(declpage.text) decl_templates = [] headword_templates = [] decl_z_templates = [] for t in parsed.filter_templates(): tname = unicode(t.name) if tname in ["ru-noun-table", "ru-decl-adj"]: pagemsg("find_decl_args: Found decl template: %s" % unicode(t)) decl_templates.append(t) if tname in ["ru-noun", "ru-proper noun"]: pagemsg("find_decl_args: Found headword template: %s" % unicode(t)) headword_templates.append(t) if tname in ["ru-decl-noun-z"]: pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t)) decl_z_templates.append(t) if not decl_templates: if decl_z_templates: # {{ru-decl-noun-z|звезда́|f-in|d|ё}} # 
{{ru-decl-noun-z|ёж|m-inan|b}} if len(decl_z_templates) > 1: pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None else: decl_z_template = decl_z_templates[0] headword_template = None pagemsg("find_decl_args: Using z-decl template: %s" % unicode(decl_z_template)) if len(headword_templates) == 0: pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) elif len(headword_templates) > 1: pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template))) else: headword_template = headword_templates[0] pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" % (wordind, lemma, infl, unicode(headword_template), unicode(decl_z_template))) decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg, headword_template=headword_template) decl_templates = [decl_template] elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [ x for x in headword_templates if getparam(x, "3") == "-"]: return [("1", wordlink), ("2", "$")], False, None, None else: pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None if len(decl_templates) == 1: decl_template = decl_templates[0] else: # Multiple decl templates for t in decl_templates: if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma): pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl)) decl_template = t break else: if lemma in use_given_decl: 
overriding_decl = use_given_decl[lemma] pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text(overriding_decl).filter_templates()[0] elif pagetitle in use_given_page_decl: overriding_decl = use_given_page_decl[pagetitle].get(lemma, None) if not overriding_decl: pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return else: pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl)) decl_template = blib.parse_text(overriding_decl).filter_templates()[0] else: pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template)) if unicode(decl_template.name) == "ru-decl-adj": if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U): return [("1", wordlink), ("2", u"+ь")], True, None, None else: return [("1", wordlink), ("2", "+")], True, None, None # ru-noun-table assert unicode(decl_template.name) == "ru-noun-table" # Split out the arg sets in the declension and check the # lemma of each one, taking care to handle cases where there is no lemma # (it would default to the page name). highest_numbered_param = 0 for p in decl_template.params: pname = unicode(p.name) if re.search("^[0-9]+$", pname): highest_numbered_param = max(highest_numbered_param, int(pname)) # Now gather the numbered arguments into arg sets. Code taken from # ru-noun.lua. 
offset = 0 arg_sets = [] arg_set = [] for i in xrange(1, highest_numbered_param + 2): end_arg_set = False val = getparam(decl_template, str(i)) if i == highest_numbered_param + 1: end_arg_set = True elif val == "_" or val == "-" or re.search("^join:", val): pagemsg("WARNING: Found multiword decl during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None elif val == "or": end_arg_set = True if end_arg_set: arg_sets.append(arg_set) arg_set = [] offset = i else: arg_set.append(val) canon_infl = ru.remove_accents(infl).lower() canon_lemma = lemma.lower() ispl = False need_sc1 = False found_gender = None if canon_infl != canon_lemma: for sgend, plend, gender, is_sc1 in pl_data: if sgend: check_sgend = sgend else: check_sgend = consonant_re if re.search(check_sgend + "$", canon_lemma) and canon_infl == re.sub(sgend + "$", plend, canon_lemma): ispl = True found_gender = gender need_sc1 = is_sc1 break else: pagemsg("WARNING: For word#%s, inflection not same as lemma, not recognized as plural, can't handle, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl)) return None # Substitute the wordlink for any lemmas in the declension. # If plural, also add gender and verify special case (1) as necessary. # Concatenate all the numbered params, substituting the wordlink into # the lemma as necessary. 
numbered_params = [] for arg_set in arg_sets: lemma_arg = 0 if len(arg_set) > 0 and runoun.arg1_is_stress(arg_set[0]): lemma_arg = 1 if len(arg_set) <= lemma_arg: arg_set.append("") arglemma = arg_set[lemma_arg] manualtr = "" if "//" in arglemma: arglemma, manualtr = re.search("^(.*?)(//.*?)$", arglemma).groups() if (not arglemma or arglemma.lower() == infl.lower() or ru.is_monosyllabic(infl) and ru.remove_accents(arglemma).lower() == ru.remove_accents(infl).lower() or ispl and ru.remove_accents(arglemma).lower() == lemma.lower() ): arg_set[lemma_arg] = wordlink + manualtr else: pagemsg("WARNING: Can't sub word link %s into decl lemma %s%s" % ( wordlink, arg_set[lemma_arg], ispl and ", skipping" or "")) if ispl: return None if ispl: # Add the gender if len(arg_set) <= lemma_arg + 1: arg_set.append("") declarg = arg_set[lemma_arg + 1] # First, sub in gender m = re.search("(3f|[mfn])", declarg) if found_gender == "mf": if not m: pagemsg(u"WARNING: For singular in -ь and plural in -и, need gender in singular and don't have it, word #%s, skipping: lemma=%s, infl=%s" % (wordinfl, lemma, infl)) return None decl_gender = m.group(1) if decl_gender == "n": pagemsg(u"WARNING: For singular in -ь and plural in -и, can't have neuter gender for word #%s, skipping: lemma=%s, infl=%s" % (wordinfl, lemma, infl)) return None elif decl_gender in ["m", "3f"]: pagemsg(u"Singular in -ь and plural in -и, already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" % (decl_gender, wordind, lemma, infl)) else: assert gender == "f" pagemsg(u"Singular in -ь and plural in -и, replacing f with 3f so singular will be recognized for word #%s: lemma=%s, infl=%s" % (wordind, lemma, infl)) declarg = re.sub("f", "3f", declarg, 1) else: if m: decl_gender = m.group(1) if decl_gender == found_gender: pagemsg("Already found gender %s in decl for word #%s, taking no action: lemma=%s, infl=%s" % (found_gender, wordind, lemma, infl)) else: pagemsg("WARNING: Found wrong gender %s in 
decl for word #%s, forcibly replacing with lemma-form-derived gender %s: lemma=%s, infl=%s" % (decl_gender, wordind, found_gender, lemma, infl)) declarg = re.sub("(3f|[mfn])", found_gender, declarg, 1) else: pagemsg("No gender in decl for word #%s, adding gender %s: lemma=%s, infl=%s" % (wordind, found_gender, lemma, infl)) declarg = found_gender + declarg # Now check special case 1 if need_sc1 != ("(1)" in declarg): if need_sc1: pagemsg("WARNING: Irregular plural calls for special case (1), but not present in decl arg for word #%s, skipping: declarg=%s, lemma=%s, infl=%s" % ( wordind, declarg, lemma, infl)) return None else: pagemsg("WARNING: Special case (1) present in decl arg but plural for word #%s is regular, skipping: declarg=%s, lemma=%s, infl=%s" % ( wordind, declarg, lemma, infl)) return None arg_set[lemma_arg + 1] = declarg if numbered_params: numbered_params.append("or") numbered_params.extend(arg_set) # Now gather all params, including named ones. params = [] params.extend((str(i+1), val) for i, val in zip(xrange(len(numbered_params)), numbered_params)) num = None anim = None for p in decl_template.params: pname = unicode(p.name) val = unicode(p.value) if pname == "a": anim = val elif pname == "n": num = val elif pname == "notes": params.append((pname, val)) elif pname == "title": pagemsg("WARNING: Found explicit title= for word #%s, ignoring: lemma=%s, infl=%s, title=%s" % (wordind, lemma, infl, val)) elif re.search("^[0-9]+$", pname): pass else: keepparam = True if pname == "loc": if pagetitle in keep_locative: pagemsg("Keeping locative for word #%s because page in keep_locative: loc=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) else: pagemsg("WARNING: Discarding locative for word #%s: loc=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) keepparam = False if pname == "par": pagemsg("WARNING: Discarding partitive for word #%s: par=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) keepparam = False if pname == "voc": 
pagemsg("WARNING: Discarding vocative for word #%s: voc=%s, lemma=%s, infl=%s" % ( wordind, val, lemma, infl)) keepparam = False if keepparam: if pname == "loc" and re.search(ur"^(на|в)\b", val, re.U): pagemsg(u"WARNING: на or в found in loc= for word #%s, may not work in multi-word lemma: loc=%s, lemma=%s, infl=%s" % (wordind, val, lemma, infl)) pname += str(wordind) params.append((pname, val))
if plval and plval != "-": if overall_num != "both": pagemsg("WARNING: Proper noun is apparently sg/pl but main noun not, skipping: %s" % headword) return elif overall_num == "both": pagemsg("WARNING: Proper noun has sg/pl main noun underlying it, assuming singular: %s" % headword) overall_num = None elif overall_num == "sg": overall_num = None if overall_num: params.append(("n", overall_num)) generate_template = ( blib.parse_text("{{ru-generate-noun-args}}").filter_templates()[0]) for name, value in params: generate_template.add(name, value) proposed_template_text = unicode(generate_template) if headword_is_proper: proposed_template_text = re.sub(r"^\{\{ru-generate-noun-args", "{{ru-proper noun+", proposed_template_text) else: proposed_template_text = re.sub(r"^\{\{ru-generate-noun-args", "{{ru-noun+", proposed_template_text) proposed_decl = blib.parse_text("{{ru-noun-table}}").filter_templates()[0] for param in generate_template.params: proposed_decl.add(param.name, param.value) def pagemsg_with_proposed(text): pagemsg("Proposed new template (WARNING, omits explicit gender and params to preserve from old template): %s" % proposed_template_text)
def process_page_section(index, page, section, verbose):
  """Clean/sync the {{ru-noun-table}} decl and {{ru-noun+}}/{{ru-proper noun+}}
  headword templates within one page section.

  Removes stray g=/m=/f= params from the decl template, adjusts n= for
  proper nouns (adding n=both, or dropping a redundant n=sg after checking
  via {{ru-generate-noun-args}}), and overwrites the headword template from
  the cleaned decl when they differ (preserving gender/m/f/notrcat chains,
  and copying linked lemmas headword -> decl rather than erasing links).

  Returns None on any skip condition, else a 5-tuple:
  (new section text, table_cleaned, link_copied, ru_noun_changed,
   ru_proper_noun_changed) with the last four as 0/1 flags.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None
  parsed = blib.parse_text(section)
  noun_table_templates = []
  noun_old_templates = []
  # {{ru-decl-noun-see}} redirects elsewhere; nothing to do on this page.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)
  # Only handle the single-decl case; multiples need human attention.
  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" % ", ".join(unicode(x) for x in noun_old_templates))
    return unicode(parsed), 0, 0, 0, 0
  # Old-style headwords ({{ru-noun}}/{{ru-proper noun}}) are out of scope.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      pagemsg("Found ru-noun or ru-proper noun, skipping")
      return None
  headword_templates = []
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      headword_templates.append(t)
  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, 0
  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]
  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))
  # Snapshot originals so we can detect changes at the end.
  orig_headword_template = unicode(headword_template)
  orig_noun_table_template = unicode(noun_table_template)
  # Pull the gender/masculine/feminine chains and notrcat off the headword;
  # they are re-applied after any overwrite from the decl template.
  genders = blib.fetch_param_chain(headword_template, "g", "g")
  masculines = blib.fetch_param_chain(headword_template, "m", "m")
  feminines = blib.fetch_param_chain(headword_template, "f", "f")
  notrcat = getparam(headword_template, "notrcat")
  # Headword minus the g/m/f/notrcat params, for comparison against the decl.
  filtered_headword_params = []
  for param in headword_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
      pass
    else:
      filtered_headword_params.append((param.name, param.value))
  filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
  for name, value in filtered_headword_params:
    filtered_headword_template.add(name, value)
  # Result flags for the returned tuple.
  ru_noun_table_cleaned = 0
  ru_noun_table_link_copied = 0
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  # Strip gender params from the decl template; they belong on the headword.
  new_decl_params = []
  for param in noun_table_template.params:
    name = unicode(param.name)
    if re.search("^[gmf][0-9]*$", name):
      pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" % unicode(noun_table_template))
    else:
      new_decl_params.append((param.name, param.value))
  del noun_table_template.params[:]
  for name, value in new_decl_params:
    noun_table_template.add(name, value)
  if orig_noun_table_template != unicode(noun_table_template):
    ru_noun_table_cleaned = 1
  # Working copy of the cleaned decl, used to build the new headword.
  modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  for param in noun_table_template.params:
    modified_noun_table_template.add(param.name, param.value)
  # If proper noun and n is both then we need to add n=both because
  # proper noun+ defaults to n=sg
  if unicode(headword_template.name) == "ru-proper noun+":
    generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
      unicode(noun_table_template))
    generate_result = expand_text(generate_template)
    if not generate_result:
      pagemsg("WARNING: Error generating noun args, skipping")
      return None
    args = ru.split_generate_args(generate_result)
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
      pagemsg("Adding n=both to headword template")
      modified_noun_table_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
        generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" % existing_n)
        pagemsg("Removing n=sg from headword template")
        rmparam(modified_noun_table_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" % ndef_args["n"])
  # Proposed new headword = cleaned decl with the template name swapped.
  new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
    unicode(modified_noun_table_template))
  existing_filtered_headword_template = unicode(filtered_headword_template)
  change_existing_headword = False
  if existing_filtered_headword_template != new_headword_template:
    # Never erase wiki-links present only in the headword; instead copy the
    # linked headword params into the decl when otherwise identical.
    if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
      if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
        pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
        del noun_table_template.params[:]
        for param in filtered_headword_template.params:
          noun_table_template.add(param.name, param.value)
        ru_noun_table_link_copied = 1
        ru_noun_table_cleaned = 0
      else:
        pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually" % (existing_filtered_headword_template, new_headword_template))
        return None
    else:
      pagemsg("WARNING: Existing headword template %s will be overwritten with %s" % (existing_filtered_headword_template, new_headword_template))
      change_existing_headword = True
  # `lemmas`, when non-empty, restricts which pages may be rewritten.
  if change_existing_headword and (not lemmas or pagetitle in lemmas):
    del headword_template.params[:]
    for param in modified_noun_table_template.params:
      headword_template.add(param.name, param.value)
    blib.set_param_chain(headword_template, genders, "g", "g")
    blib.set_param_chain(headword_template, masculines, "m", "m")
    blib.set_param_chain(headword_template, feminines, "f", "f")
    if notrcat:
      headword_template.add("notrcat", notrcat)
  #genders = runoun.check_old_noun_headword_forms(headword_template, args,
  #  subpagetitle, pagemsg)
  #if genders == None:
  #  return None
  #new_params = []
  #for param in noun_table_template.params:
  #  new_params.append((param.name, param.value))
  #params_to_preserve = runoun.fix_old_headword_params(headword_template,
  #  new_params, genders, pagemsg)
  #if params_to_preserve == None:
  #  return None
  new_noun_table_template = unicode(noun_table_template)
  if new_noun_table_template != orig_noun_table_template:
    pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template, new_noun_table_template))
  new_headword_template = unicode(headword_template)
  if new_headword_template != orig_headword_template:
    pagemsg("Replacing headword %s with %s" % (orig_headword_template, new_headword_template))
    if unicode(headword_template.name) == "ru-noun+":
      ru_noun_changed = 1
    else:
      ru_proper_noun_changed = 1
  return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def find_decl_args(lemma, infl, wordind):
  """Look up declension arguments for LEMMA / inflected form INFL, the
  WORDINDth word of the expression.

  On success returns a 4-tuple: (list of (name, value) decl args, whether
  the word is adjectival, n= value if any, a= value if any). Returns None
  on any skip condition.

  NOTE(review): this copy ends after the ru-decl-adj branch; the
  ru-noun-table case falls through and implicitly returns None. A sibling
  copy of this function continues with full ru-noun-table handling --
  confirm this truncation is intended.
  """
  declpage = pywikibot.Page(site, lemma)
  # Link form: bare [[infl]] when the inflection is just the accented lemma,
  # else a piped [[lemma|infl]] link.
  if ru.remove_accents(infl) == lemma:
    wordlink = "[[%s]]" % infl
  else:
    wordlink = "[[%s|%s]]" % (lemma, infl)
  if not declpage.exists():
    # No entry page: assume adjectival declension for known/likely
    # adjectives (adjective-looking endings), otherwise give up.
    if lemma in is_short_adj or re.search(u"(ий|ый|ой)$", lemma):
      pagemsg("WARNING: Page doesn't exist, assuming word #%s adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl))
      return [("1", wordlink), ("2", "+")], True, None, None
    else:
      pagemsg("WARNING: Page doesn't exist, can't locate decl for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl))
      return None
  parsed = blib.parse_text(declpage.text)
  decl_templates = []
  headword_templates = []
  decl_z_templates = []
  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname in ["ru-noun-table", "ru-decl-adj"]:
      pagemsg("find_decl_args: Found decl template: %s" % unicode(t))
      decl_templates.append(t)
    if tname in ["ru-noun", "ru-proper noun"]:
      pagemsg("find_decl_args: Found headword template: %s" % unicode(t))
      headword_templates.append(t)
    if tname in ["ru-decl-noun-z"]:
      pagemsg("find_decl_args: Found z-decl template: %s" % unicode(t))
      decl_z_templates.append(t)
  if not decl_templates:
    if decl_z_templates:
      # Old-style z-decl templates, e.g.:
      # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
      # {{ru-decl-noun-z|ёж|m-inan|b}}
      if len(decl_z_templates) > 1:
        pagemsg("WARNING: Multiple decl-z templates during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl))
        return None
      else:
        decl_z_template = decl_z_templates[0]
        headword_template = None
        pagemsg("find_decl_args: Using z-decl template: %s" % unicode(decl_z_template))
        # A single headword template assists the z-decl conversion; zero or
        # multiple headwords just produce warnings.
        if len(headword_templates) == 0:
          pagemsg("WARNING: find_decl_args: No headword templates for use with z-decl template conversion during decl lookup for word #%s: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template)))
        elif len(headword_templates) > 1:
          pagemsg("WARNING: find_decl_args: Multiple headword templates for use with z-decl template conversion during decl lookup for word #%s, ignoring: lemma=%s, infl=%s, zdecl=%s" % (wordind, lemma, infl, unicode(decl_z_template)))
        else:
          headword_template = headword_templates[0]
          pagemsg("find_decl_args: For word #%s, lemma=%s, infl=%s, using headword template %s for use with z-decl template %s" % (wordind, lemma, infl, unicode(headword_template), unicode(decl_z_template)))
        # Convert the z-decl into a modern ru-noun-table template.
        decl_template = runoun.convert_zdecl_to_ru_noun_table(decl_z_template,
          subpagetitle, pagemsg, headword_template=headword_template)
        decl_templates = [decl_template]
    elif "[[Category:Russian indeclinable nouns]]" in declpage.text or [
      x for x in headword_templates if getparam(x, "3") == "-"]:
      # Indeclinable noun: "$" declension spec.
      return [("1", wordlink), ("2", "$")], False, None, None
    else:
      pagemsg("WARNING: No decl template during decl lookup for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl))
      return None
  if len(decl_templates) == 1:
    decl_template = decl_templates[0]
  else:
    # Multiple decl templates
    for t in decl_templates:
      if unicode(t.name) == "ru-decl-adj" and re.search(u"(ий|ый|ой)$", lemma):
        pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s, assuming adjectival: lemma=%s, infl=%s" % (wordind, lemma, infl))
        decl_template = t
        break
    else:
      # Not resolvable as adjectival; fall back to the manual override
      # tables (per-lemma, then per-page).
      if lemma in use_given_decl:
        overriding_decl = use_given_decl[lemma]
        pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl))
        decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
      elif pagetitle in use_given_page_decl:
        overriding_decl = use_given_page_decl[pagetitle].get(lemma, None)
        if not overriding_decl:
          pagemsg("WARNING: Missing entry for ambiguous-decl lemma for word #%s, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl))
          # NOTE(review): bare `return` (None) here vs. `return None`
          # elsewhere -- same value, inconsistent style.
          return
        else:
          pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, using overriding declension %s: lemma=%s, infl=%s" % (wordind, overriding_decl, lemma, infl))
          decl_template = blib.parse_text(overriding_decl).filter_templates()[0]
      else:
        pagemsg("WARNING: Multiple decl templates during decl lookup for word #%s and not adjectival, skipping: lemma=%s, infl=%s" % (wordind, lemma, infl))
        return None
  pagemsg("find_decl_args: Using decl template: %s" % unicode(decl_template))
  if unicode(decl_template.name) == "ru-decl-adj":
    # Adjectival: soft-stem (ь) declensions get the "+ь" spec.
    if re.search(ur"\bь\b", getparam(decl_template, "2"), re.U):
      return [("1", wordlink), ("2", u"+ь")], True, None, None
    else:
      return [("1", wordlink), ("2", "+")], True, None, None
def process_page_section(index, page, section, verbose):
  """Convert the {{ru-noun}}/{{ru-proper noun}} headword in one page section
  to the newer {{ru-noun+}}/{{ru-proper noun+}} form, driven by the
  accompanying {{ru-noun-table}} declension template.

  index: numeric index of the page (used only in log messages).
  page: pywikibot Page object (used for title and existence check).
  section: wikitext of the section to process.
  verbose: if true, emit extra diagnostic messages.

  Returns None on any condition that prevents conversion (multiple or
  missing templates, translit conflicts, expansion errors), otherwise a
  5-tuple: (new section text, ru_noun_changed flag, ru_proper_noun_changed
  flag, bian_replaced flag, list holding the frobbed manual translit if any).
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  if not page.exists():
    pagemsg("WARNING: Page doesn't exist, skipping")
    return None
  parsed = blib.parse_text(section)
  noun_table_templates = []
  noun_old_templates = []
  # {{ru-decl-noun-see}} pages point elsewhere for the declension; punt.
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-decl-noun-see":
      pagemsg("Found ru-decl-noun-see, skipping")
      return None
  for t in parsed.filter_templates():
    if unicode(t.name) == "ru-noun-table":
      noun_table_templates.append(t)
    if unicode(t.name) == "ru-noun-old":
      noun_old_templates.append(t)
  # Only handle sections with exactly one decl template of each kind.
  if len(noun_table_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
    return None
  if len(noun_old_templates) > 1:
    pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
    return None
  if len(noun_table_templates) < 1:
    if noun_old_templates:
      pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
          ", ".join(unicode(x) for x in noun_old_templates))
    # Nothing to convert; return section unchanged with zeroed flags.
    return unicode(parsed), 0, 0, 0, []
  # Already-converted sections need no work.
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found ru-noun+ or ru-proper noun+, skipping")
      return None
  headword_templates = []
  for t in parsed.filter_templates():
    if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
      headword_templates.append(t)
  if len(headword_templates) > 1:
    pagemsg("WARNING: Found multiple headword templates, skipping")
    return None
  if len(headword_templates) < 1:
    return unicode(parsed), 0, 0, 0, []
  noun_table_template = noun_table_templates[0]
  noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
  headword_template = headword_templates[0]
  frobbed_manual_translit = []
  decl_templates = [x for x in [noun_table_template, noun_old_template] if x]
  if verbose:
    pagemsg("Found headword template: %s" % unicode(headword_template))
    pagemsg("Found decl template: %s" % unicode(noun_table_template))
    if noun_old_template:
      pagemsg("Found old decl template: %s" % unicode(noun_old_template))

  # Retrieve headword translit and maybe transfer to decl
  headword_tr = getparam(headword_template, "tr")
  if headword_tr:
    if verbose:
      pagemsg("Found headword manual translit tr=%s" % headword_tr)
    if "," in headword_tr:
      pagemsg("WARNING: Comma in headword manual translit, skipping: %s" %
          headword_tr)
      return None
    # Punt if multi-arg-set, can't handle yet
    for decl_template in decl_templates:
      for param in decl_template.params:
        if not param.showkey:
          val = unicode(param.value)
          if val == "or":
            pagemsg("WARNING: Manual translit and multi-decl templates, can't handle, skipping: %s" %
                unicode(decl_template))
            return None
          if val == "-" or val == "_" or val.startswith("join:"):
            pagemsg("WARNING: Manual translit and multi-word templates, can't handle, skipping: %s" %
                unicode(decl_template))
            return None
    # Punt on extra translit params tr2..tr9, which we can't transfer.
    for i in xrange(2, 10):
      if getparam(headword_template, "tr%s" % i):
        pagemsg("WARNING: Headword template has translit param tr%s, can't handle, skipping: %s" % (
            i, unicode(headword_template)))
        return None
    # The lemma is arg 2 if arg 1 is a stress spec, else arg 1.
    # NOTE(review): this uses decl_template left over from the loop above,
    # i.e. the last of decl_templates — confirm that is intended.
    if runoun.arg1_is_stress(getparam(decl_template, "1")):
      lemma_arg = "2"
    else:
      lemma_arg = "1"
    lemmaval = getparam(decl_template, lemma_arg)
    if not lemmaval:
      lemmaval = subpagetitle
    if "//" in lemmaval:
      # Decl template already carries LEMMA//TRANSLIT; it must agree with
      # the headword translit or we bail.
      m = re.search("^(.*?)//(.*)$", lemmaval)
      if m.group(2) != headword_tr:
        pagemsg("WARNING: Found existing manual translit in decl template %s, but doesn't match headword translit %s; skipping" % (
            lemmaval, headword_tr))
        return None
      else:
        pagemsg("Already found manual translit in decl template %s" % lemmaval)
    else:
      # Append the headword translit to the decl lemma as LEMMA//TRANSLIT.
      lemmaval += "//" + headword_tr
      orig_decl_template = unicode(decl_template)
      decl_template.add(lemma_arg, lemmaval)
      pagemsg("Replacing decl %s with %s" % (orig_decl_template,
          unicode(decl_template)))
    frobbed_manual_translit = [headword_tr]

  genders = blib.fetch_param_chain(headword_template, "2", "g")
  bian_replaced = 0
  # Change a=bi in decl to a=ia or a=ai, depending on order of anim/inan in
  # headword template
  for decl_template in decl_templates:
    if getparam(decl_template, "a") in ["b", "bi", "bian", "both"]:
      saw_in = -1
      saw_an = -1
      # Record first positions of inanimate/animate among headword genders.
      for i,g in enumerate(genders):
        if re.search(r"\bin\b", g) and saw_in < 0:
          saw_in = i
        if re.search(r"\ban\b", g) and saw_an < 0:
          saw_an = i
      if saw_in >= 0 and saw_an >= 0:
        orig_decl_template = unicode(decl_template)
        if saw_in < saw_an:
          pagemsg("Replacing a=bi with a=ia in decl template")
          decl_template.add("a", "ia")
          bian_replaced = 1
        else:
          pagemsg("Replacing a=bi with a=ai in decl template")
          decl_template.add("a", "ai")
          bian_replaced = 1
        pagemsg("Replacing decl %s with %s" % (orig_decl_template,
            unicode(decl_template)))

  # Expand the decl as {{ru-generate-noun-args}} to get the full set of
  # generated forms, then verify the old headword's explicit forms match.
  generate_template = re.sub(r"^\{\{ru-noun-table",
      "{{ru-generate-noun-args", unicode(noun_table_template))
  generate_result = expand_text(generate_template)
  if not generate_result:
    pagemsg("WARNING: Error generating noun args, skipping")
    return None
  args = ru.split_generate_args(generate_result)
  genders = runoun.check_old_noun_headword_forms(headword_template, args,
      subpagetitle, pagemsg)
  if genders == None:
    return None
  new_params = []
  for param in noun_table_template.params:
    new_params.append((param.name, param.value))
  orig_headword_template = unicode(headword_template)
  params_to_preserve = runoun.fix_old_headword_params(headword_template,
      new_params, genders, pagemsg)
  if params_to_preserve == None:
    return None
  if unicode(headword_template.name) == "ru-proper noun":
    # If proper noun and n is both then we need to add n=both because
    # proper noun+ defaults to n=sg
    if args["n"] == "b" and not getparam(headword_template, "n"):
      # NOTE(review): "tempate" typo in log message — left as-is.
      pagemsg("Adding n=both to headword tempate")
      headword_template.add("n", "both")
    # Correspondingly, if n is sg then we can usually remove n=sg;
    # but we need to check that the number is actually sg with n=sg
    # removed because of the possibility of plurale tantum lemmas
    if args["n"] == "s":
      generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
      generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
          generate_template_with_ndef)
      generate_result = expand_text(generate_template_with_ndef)
      if not generate_result:
        pagemsg("WARNING: Error generating noun args, skipping")
        return None
      ndef_args = ru.split_generate_args(generate_result)
      if ndef_args["n"] == "s":
        existing_n = getparam(headword_template, "n")
        if existing_n and not re.search(r"^s", existing_n):
          pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
              existing_n)
        else:
          # NOTE(review): "tempate" typo in log message — left as-is.
          pagemsg("Removing n=sg from headword tempate")
          rmparam(headword_template, "n")
      else:
        pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
            ndef_args["n"])
  headword_template.params.extend(params_to_preserve)
  ru_noun_changed = 0
  ru_proper_noun_changed = 0
  # Rename the headword template to its "+" variant.
  if unicode(headword_template.name) == "ru-noun":
    headword_template.name = "ru-noun+"
    ru_noun_changed = 1
  else:
    headword_template.name = "ru-proper noun+"
    ru_proper_noun_changed = 1
  pagemsg("Replacing headword %s with %s" % (orig_headword_template,
      unicode(headword_template)))
  return unicode(parsed), ru_noun_changed, ru_proper_noun_changed, bian_replaced, frobbed_manual_translit
def process_page(index, page, verbose):
  """Scan one page's {{ru-noun+}}/{{ru-proper noun+}} headwords and check
  that every word of every lemma exists on the wiki, recording reference
  counts and nonexistent lemmas.

  index: numeric index of the page (used only in log messages).
  page: pywikibot Page object to scan.
  verbose: passed through to blib.expand_text for diagnostics.

  Side effects: updates the module-level dicts lemma_count,
  nonexistent_lemmas and nonexistent_lemmas_refs (and reads the
  module-level `lemmas` set and `site`). Returns nothing.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)
  origtext = page.text
  parsed = blib.parse_text(origtext)

  def check_lemma(lemma):
    # Count each referenced lemma; on first sight, look it up on the wiki
    # and classify it (redirect / superlative / non-lemma / missing).
    if lemma in lemma_count:
      lemma_count[lemma] += 1
      if lemma in nonexistent_lemmas:
        nonexistent_lemmas_refs[lemma].append(pagetitle)
    else:
      lemma_count[lemma] = 1
      if lemma not in lemmas:
        # Shadows the outer `page` deliberately — we only need it here.
        page = pywikibot.Page(site, lemma)
        try:
          exists = page.exists()
        except pywikibot.exceptions.InvalidTitle as e:
          pagemsg("WARNING: Invalid title: %s" % lemma)
          traceback.print_exc(file=sys.stdout)
          exists = False
        if exists:
          if re.search("#redirect", unicode(page.text), re.I):
            nonexistent_msg = "exists as redirect"
          elif re.search(r"\{\{superlative of", unicode(page.text)):
            nonexistent_msg = "exists as superlative"
          else:
            nonexistent_msg = "exists as non-lemma"
        else:
          nonexistent_msg = "does not exist"
        pagemsg("Referenced lemma %s: %s" % (lemma, nonexistent_msg))
        nonexistent_lemmas[lemma] = nonexistent_msg
        nonexistent_lemmas_refs[lemma] = [pagetitle]

  def process_arg_set(arg_set):
    # One arg set = one declension spec: [STRESS,] LEMMA[//TRANSLIT], ...
    # Extract the lemma, split it into words and check each word.
    if not arg_set:
      return
    offset = 0
    # A leading accent-pattern spec like "b" or "d'" means the lemma is next.
    if re.search(r"^[a-f]'*(,[a-f]'*)*$", arg_set[offset]):
      offset = 1
    if len(arg_set) <= offset:
      return
    # Remove * meaning non-stressed
    lemma = re.sub(r"^\*", "", arg_set[offset])
    # Remove translit
    lemma = re.sub("//.*$", "", lemma)
    if not lemma:
      return
    # Split into alternating separators and words ([[bracketed links]] or
    # runs without space/hyphen); re.split with a capturing group yields
    # separators at even indices, words at odd indices.
    headwords_separators = re.split(r"(\[\[.*?\]\]|[^ \-]+)", lemma)
    if headwords_separators[0] != "" or headwords_separators[-1] != "":
      pagemsg("WARNING: Found junk at beginning or end of headword, skipping: %s" %
          lemma)
      return
    wordind = 0
    for i in xrange(1, len(headwords_separators), 2):
      hword = headwords_separators[i]
      separator = headwords_separators[i+1]
      if i < len(headwords_separators) - 2 and separator != " " and separator != "-":
        pagemsg("WARNING: Separator after word #%s isn't a space or hyphen, can't handle: word=<%s>, separator=<%s>" %
            (wordind + 1, hword, separator))
        continue
      hword = hword.replace("#Russian", "")
      hword = rulib.remove_accents(blib.remove_right_side_links(hword))
      check_lemma(hword)
      wordind += 1

  def process_new_style_headword(htemp):
    # Split out the arg sets in the declension and check the
    # lemma of each one, taking care to handle cases where there is no lemma
    # (it would default to the page name).
    highest_numbered_param = 0
    for p in htemp.params:
      pname = unicode(p.name)
      if re.search("^[0-9]+$", pname):
        highest_numbered_param = max(highest_numbered_param, int(pname))
    # Now split based on arg sets.
    arg_set = []
    for i in xrange(1, highest_numbered_param + 2):
      end_arg_set = False
      val = getparam(htemp, str(i))
      # "or"/"_"/"-"/"join:..." separate arg sets; the sentinel index
      # highest_numbered_param + 1 flushes the final set.
      if (i == highest_numbered_param + 1 or val in ["or", "_", "-"] or
          re.search("^join:", val)):
        end_arg_set = True
      if end_arg_set:
        process_arg_set(arg_set)
        arg_set = []
      else:
        arg_set.append(val)

  for t in parsed.filter_templates():
    tname = unicode(t.name)
    if tname == "ru-decl-noun-see":
      pagemsg("WARNING: Skipping ru-decl-noun-see, can't handle yet: %s" %
          unicode(t))
    elif tname in ["ru-noun+", "ru-proper noun+"]:
      pagemsg("Found %s" % unicode(t))
      process_new_style_headword(t)
    elif tname in ["ru-noun", "ru-proper noun"]:
      pagemsg("WARNING: Skipping ru-noun or ru-proper noun, can't handle yet: %s" %
          unicode(t))
def process_page(index, page, save, verbose):
  """Canonicalize {{inflection of|lang=ru|...}} templates in the Russian
  section of one page, in three passes:

  1. drop blank form codes and normalize param order (lang, 1, tr, 2,
     numbered form codes, remaining named params), dropping nocat=;
  2. convert long form-code names to the short canonical ones
     ('prepositional' -> 'pre', 'singular' -> 's', etc.);
  3. swap number/case order so case precedes number (s|gen -> gen|s).

  index: numeric index of the page (used only in log messages).
  page: pywikibot Page object.
  save: if true, write the page back; otherwise only log what would happen.
  verbose: if true, log the full before/after page text.

  Returns nothing; changes are applied to the page (or just logged).
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return
  text = unicode(page.text)
  notes = []
  foundrussian = False
  # Split on L2 language headers; headers land at odd indices, bodies even.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Remove blank form codes and canonicalize position of lang=, tr=
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          origt = unicode(t)
          # Fetch the numbered params starting with 3, skipping blank ones
          numbered_params = []
          for i in xrange(3,20):
            val = getparam(t, str(i))
            if val:
              numbered_params.append(val)
          # Fetch param 1 and param 2, and non-numbered params except lang=
          # and nocat=.
          param1 = getparam(t, "1")
          param2 = getparam(t, "2")
          tr = getparam(t, "tr")
          nocat = getparam(t, "nocat")
          non_numbered_params = []
          for param in t.params:
            pname = unicode(param.name)
            if not re.search(r"^[0-9]+$", pname) and pname not in ["lang", "nocat", "tr"]:
              non_numbered_params.append((pname, param.value))
          # Erase all params.
          del t.params[:]
          # Put back lang, param 1, tr, param 2, then the replacements for the
          # higher numbered params, then the non-numbered params.
          t.add("lang", "ru")
          t.add("1", param1)
          if tr:
            t.add("tr", tr)
          t.add("2", param2)
          for i, param in enumerate(numbered_params):
            t.add(str(i+3), param)
          for name, value in non_numbered_params:
            t.add(name, value)
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
            if nocat:
              notes.append("removed nocat=")
      sections[j] = unicode(parsed)

      # Convert 'prep' to 'pre', etc.
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          for frm, to in [
              ("nominative", "nom"),
              ("accusative", "acc"),
              ("genitive", "gen"),
              ("dative", "dat"),
              ("instrumental", "ins"),
              ("prep", "pre"),
              ("prepositional", "pre"),
              ("vocative", "voc"),
              ("locative", "loc"),
              ("partitive", "par"),
              ("singular", "s"),
              ("(singular)", "s"),
              ("plural", "p"),
              ("(plural)", "p"),
              ("inanimate", "in"),
              ("animate", "an"),
          ]:
            origt = unicode(t)
            for i in xrange(3,20):
              val = getparam(t, str(i))
              if val == frm:
                t.add(str(i), to)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("converted '%s' form code to '%s'" % (frm, to))
      sections[j] = unicode(parsed)

      # Rearrange order of s|gen, p|nom etc. to gen|s, nom|p etc.
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # Only swap when exactly two form codes: number then case.
          if (getparam(t, "3") in ["s", "p"] and
              getparam(t, "4") in ["nom", "gen", "dat", "acc", "ins", "pre",
                  "voc", "loc", "par"] and
              not getparam(t, "5")):
            origt = unicode(t)
            number = getparam(t, "3")
            case = getparam(t, "4")
            t.add("3", case)
            t.add("4", number)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("converted '%s|%s' to '%s|%s'" % (number, case, case, number))
      sections[j] = unicode(parsed)

  new_text = "".join(sections)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any text change must have produced at least one note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose, fix_missing_plurals):
  """Convert generic {{head|fr|TYPE|...}} headword templates on one page to
  the corresponding French-specific templates ({{fr-noun}},
  {{fr-proper noun}}, {{fr-adj}}, {{fr-verb-form}}, etc.), when the
  template's parameters are fully recognized; otherwise warn and skip.

  index: numeric index of the page (used only in log messages).
  page: pywikibot Page object.
  save: if true, write the page back; otherwise only log what would happen.
  verbose: unused here apart from being part of the common signature.
  fix_missing_plurals: for nouns lacking an explicit plural, assume the
    default plural (with a PLEASE REVIEW warning) instead of skipping.

  Returns nothing; changes are applied to the page (or just logged).
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return
  text = unicode(page.text)
  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    name = unicode(t.name)
    if name == "head" and getparam(t, "1") == "fr":
      headtype = getparam(t, "2")
      fixed_plural_warning = False
      if headtype == "noun":
        head = getparam(t, "head")
        g = getparam(t, "g")
        g2 = getparam(t, "g2")
        plural = ""
        # {{head|fr|noun|plural|PLURAL}} carries the plural in param 4.
        if getparam(t, "3") == "plural":
          plural = getparam(t, "4")
        # Bail out if the template has any param we don't know how to map.
        unrecognized_params = False
        for param in t.params:
          pname = unicode(param.name)
          if pname in ["1", "2", "head", "g", "g2", "sort"] or plural and pname in ["3", "4"]:
            pass
          else:
            unrecognized_params = True
            break
        if unrecognized_params:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping" % unicode(t))
          continue
        if not g:
          pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
          continue
        # A bare feminine noun with a 'feminine noun of' gloss is assumed
        # countable with the default plural.
        found_feminine_noun = False
        if g == "f" and not g2 and not plural:
          for tt in parsed.filter_templates():
            if (unicode(tt.name) == "feminine noun of" and
                getparam(tt, "lang") == "fr"):
              found_feminine_noun = True
        if found_feminine_noun:
          pagemsg("Found 'feminine noun of', assuming countable")
        elif g not in ["m-p", "f-p"] and not plural:
          if fix_missing_plurals:
            pagemsg("WARNING: No plural given in %s, assuming default plural, PLEASE REVIEW" %
                unicode(t))
            fixed_plural_warning = True
          else:
            pagemsg("WARNING: No plural given in %s, skipping" % unicode(t))
            continue
        # Strip the generic params and rebuild as {{fr-noun}}.
        rmparam(t, "4")
        rmparam(t, "3")
        rmparam(t, "2")
        rmparam(t, "1")
        rmparam(t, "head")
        rmparam(t, "g")
        rmparam(t, "g2")
        rmparam(t, "sort")
        t.name = "fr-noun"
        if head:
          t.add("head", head)
        t.add("1", g)
        if g2:
          t.add("g2", g2)
        if plural:
          t.add("2", plural)
      elif headtype in ["proper noun", "proper nouns"]:
        head = getparam(t, "head")
        g = getparam(t, "g")
        g2 = getparam(t, "g2")
        remove_3 = False
        # Some entries put the gender in param 3 instead of g=.
        if not g and getparam(t, "3") in ["m", "f", "m-p", "f-p"]:
          g = getparam(t, "3")
          remove_3 = True
        unrecognized_params = False
        for param in t.params:
          pname = unicode(param.name)
          if pname in ["1", "2", "head", "g", "g2", "sort"] or remove_3 and pname in ["3"]:
            pass
          else:
            unrecognized_params = True
            break
        if unrecognized_params:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping" % unicode(t))
          continue
        if not g:
          pagemsg("WARNING: No gender given in %s, skipping" % unicode(t))
          continue
        # Strip the generic params and rebuild as {{fr-proper noun}}.
        rmparam(t, "3")
        rmparam(t, "2")
        rmparam(t, "1")
        rmparam(t, "head")
        rmparam(t, "g")
        rmparam(t, "g2")
        rmparam(t, "sort")
        t.name = "fr-proper noun"
        if head:
          t.add("head", head)
        t.add("1", g)
        if g2:
          t.add("g2", g2)
      elif headtype in ["adjective", "adjectives"]:
        # Only handle explicitly invariable adjectives -> {{fr-adj|inv=y}}.
        if getparam(t, "3") in ["invariable", "invariant"]:
          params = dict((unicode(p.name), unicode(p.value)) for p in t.params)
          del params["1"]
          del params["2"]
          del params["3"]
          # g=m + g2=f is the invariable default and carries no information.
          if getparam(t, "g") == "m" and getparam(t, "g2") == "f":
            del params["g"]
            del params["g2"]
          if not params:
            rmparam(t, "g2")
            rmparam(t, "g")
            rmparam(t, "3")
            rmparam(t, "2")
            rmparam(t, "1")
            t.name = "fr-adj"
            t.add("inv", "y")
          else:
            pagemsg("WARNING: Unrecognized parameters in %s, skipping" % unicode(t))
        else:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping" % unicode(t))
      elif headtype in ["adjective form", "verb form", "verb forms",
          "interjection", "preposition", "prefix", "prefixes",
          "suffix", "suffixes"]:
        # These map 1:1 onto simple fr-* templates; only a g= param is
        # tolerated (and only for the types that support it).
        headtype_supports_g = headtype in [
            "adjective form", "suffix", "suffixes"]
        head = getparam(t, "head")
        unrecognized_params = False
        for param in t.params:
          pname = unicode(param.name)
          if pname in ["1", "2", "head", "sort"] or headtype_supports_g and pname == "g":
            pass
          else:
            unrecognized_params = True
            break
        if unrecognized_params:
          pagemsg("WARNING: Unrecognized parameters in %s, skipping" % unicode(t))
          continue
        rmparam(t, "sort")
        rmparam(t, "head")
        rmparam(t, "2")
        rmparam(t, "1")
        t.name = ("fr-adj-form" if headtype == "adjective form"
            else "fr-verb-form" if headtype in ["verb form", "verb forms"]
            else "fr-intj" if headtype == "interjection"
            else "fr-prep" if headtype == "preposition"
            else "fr-prefix" if headtype in ["prefix", "prefixes"]
            else "fr-suffix" # if headtype in ["suffix", "suffixes"]
            )
        if head:
          t.add("head", head)
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replacing %s with %s" % (origt, newt))
        notes.append("replaced {{head|fr|%s}} with {{%s}}%s" % (headtype,
            unicode(t.name),
            " (NEEDS REVIEW)" if fixed_plural_warning else ""))
  newtext = unicode(parsed)
  if newtext != text:
    # Any text change must have produced at least one note.
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      blib.try_repeatedly(lambda: page.save(comment=comment), pagemsg,
          "save page")
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Canonicalize Russian short-adjective inflection descriptions on one
  page: rewrite {{inflection of|lang=ru|...}} form codes into the canonical
  order short|GENDER|s (or short|p), and convert raw-text descriptions like
  "masculine singular short form of [[X]]" into 'inflection of' templates.
  Warns about short forms it cannot canonicalize.

  index: numeric index of the page (used only in log messages).
  page: pywikibot Page object.
  save: if true, write the page back; otherwise only log what would happen.
  verbose: if true, log the full before/after page text.

  Returns nothing; changes are applied to the page (or just logged).
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return
  text = unicode(page.text)
  notes = []
  already_canonicalized = False
  found_short_inflection_of = False
  warned_about_short = False
  foundrussian = False
  # Split on L2 language headers; headers land at odd indices, bodies even.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)
  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Try to canonicalize existing 'inflection of'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # Fetch the numbered params starting with 3
          numbered_params = []
          for i in xrange(3,20):
            numbered_params.append(getparam(t, str(i)))
          # Drop trailing blanks.
          while len(numbered_params) > 0 and not numbered_params[-1]:
            del numbered_params[-1]
          # Now canonicalize
          numparamstr = "/".join(numbered_params)
          canon_params = []
          # while True used as a breakable block, not a real loop: each
          # branch breaks out once a match (or non-match) is determined.
          while True:
            # Singular short forms: any ordering of GENDER / s / short.
            m = (re.search(r"^([mfn])/(?:s|\(singular\))/short(?: form|)$", numparamstr) or
                re.search(r"^(?:s|\(singular\))/([mfn])/short(?: form|)$", numparamstr) or
                re.search(r"^short(?: form|)/([mfn])/(?:s|\(singular\))$", numparamstr) or
                re.search(r"^short(?: form|)/(?:s|\(singular\))/([mfn])$", numparamstr) or
                re.search(r"^([mfn])/short(?: form|)/(?:s|\(singular\))$", numparamstr) or
                re.search(r"^(?:s|\(singular\))/short(?: form|)/([mfn])$", numparamstr) or
                re.search(r"^([mfn])/short(?: form|)$", numparamstr) or
                re.search(r"^short(?: form|)/([mfn])$", numparamstr)
                )
            if m:
              found_short_inflection_of = True
              canon_params = ["short", m.group(1), "s"]
              break
            # Plural short forms: p / short in either order.
            m = (re.search(r"^(?:p|\(plural\))/short(?: form|)$", numparamstr) or
                re.search(r"^short(?: form|)/(?:p|\(plural\))$", numparamstr)
                )
            if m:
              found_short_inflection_of = True
              canon_params = ["short", "p"]
              break
            if "short" in numbered_params or "short form" in numbered_params:
              found_short_inflection_of = True
              warned_about_short = True
              pagemsg("WARNING: Apparent short-form 'inflection of' but can't canonicalize: %s" %
                  unicode(t))
            break
          if canon_params:
            origt = unicode(t)
            # Fetch param 1 and param 2. Erase all numbered params.
            # Put back param 1 and param 2 (this will put them after lang=ru),
            # then the replacements for the higher params.
            param1 = getparam(t, "1")
            param2 = getparam(t, "2")
            for i in xrange(19,0,-1):
              rmparam(t, str(i))
            t.add("1", param1)
            t.add("2", param2)
            for i, param in enumerate(canon_params):
              t.add(str(i+3), param)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("canonicalized 'inflection of' for %s" %
                  "/".join(canon_params))
            else:
              pagemsg("Apparently already canonicalized: %s" % newt)
              already_canonicalized = True
      sections[j] = unicode(parsed)

      # Try to add 'inflection of' to raw-specified singular inflection
      def add_sing_inflection_of(m):
        prefix = m.group(1)
        gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
            "neuter":"n", "neutral":"n"}[m.group(2).lower()]
        lemma = m.group(3)
        retval = prefix + "{{inflection of|lang=ru|%s||short|%s|s}}" % (lemma, gender)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for short/%s/s" % gender)
        return retval
      newsec = re.sub(r"(# |\()'*(?:short |)(?:form of |)(masculine|male|feminine|female|neuter|neutral) (?:short |)(?:singular |)(?:short |)(?:form of|of|for)'* '*(?:\[\[|\{\{[lm]\|ru\|)(.*?)(?:\]\]|\}\})'*",
          add_sing_inflection_of, sections[j], 0, re.I)
      if newsec != sections[j]:
        found_short_inflection_of = True
        sections[j] = newsec

      # Leftover raw mentions of "short" that we didn't convert: warn with
      # the first matching line for diagnosis.
      if "short" in sections[j] and not found_short_inflection_of:
        m = re.search("^(.*short.*)$", sections[j], re.M)
        warned_about_short = True
        pagemsg("WARNING: Apparent raw-text short inflection, not converted: %s" %
            (m and m.group(1) or "Can't get line?"))

  new_text = "".join(sections)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any text change must have produced at least one note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
  if not notes and not already_canonicalized:
    pagemsg("Skipping, no short form found%s" % (
        warned_about_short and " (warning issued)" or " (no warning)"))
def process_page(index, page, save, verbose):
  """Remove redundant head=/1= parameters from French headword templates on
  one page: when the (link-expanded) head value equals the (link-expanded)
  page title, the parameter adds nothing and is dropped. Non-redundant
  heads are kept but checked for suspicious apostrophe-only differences.

  index: numeric index of the page (used only in log messages).
  page: pywikibot Page object.
  save: if true, write the page back; otherwise only log what would happen.
  verbose: unused here apart from being part of the common signature.

  Reads the module-level sets fr_head_templates / fr_head_or_1_templates
  and the module-level link_text helper. Returns nothing.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  pagemsg("Processing")
  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return
  text = unicode(page.text)
  def check_bad_head(text, arg):
    # Compare head and page title with Hebrew geresh/right single quote
    # normalized to ASCII apostrophe; warn when they still differ.
    canontext = re.sub(u"[׳’]", "'", blib.remove_links(text))
    canonpagetitle = re.sub(u"[׳’]", "'", pagetitle)
    if canontext != canonpagetitle:
      pagemsg("WARNING: Canonicalized %s=%s not same as canonicalized page title %s (orig %s=%s)" %
          (arg, canontext, canonpagetitle, arg, text))
  notes = []
  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    name = unicode(t.name)
    # Templates that take the headword in head=.
    if name in fr_head_templates:
      head = getparam(t, "head")
      if head:
        linked_pagetitle = link_text(pagetitle)
        linked_head = link_text(head)
        if linked_pagetitle == linked_head:
          pagemsg("Removing redundant head=%s" % head)
          rmparam(t, "head")
          notes.append("remove redundant head= from {{%s}}" % name)
        else:
          pagemsg("Not removing non-redundant head=%s" % head)
          check_bad_head(head, "head")
    # Templates that take the headword in positional param 1.
    if name in fr_head_or_1_templates:
      head = getparam(t, "1")
      if head:
        linked_pagetitle = link_text(pagetitle)
        linked_head = link_text(head)
        if linked_pagetitle == linked_head:
          pagemsg("Removing redundant 1=%s" % head)
          rmparam(t, "1")
          notes.append("remove redundant 1= from {{%s}}" % name)
        else:
          pagemsg("Not removing non-redundant 1=%s" % head)
          check_bad_head(head, "1")
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replacing %s with %s" % (origt, newt))
  newtext = unicode(parsed)
  if newtext != text:
    # Any text change must have produced at least one note.
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def convert_zdecl_to_ru_noun_table(decl_z_template, subpagetitle, pagemsg,
    headword_template=None):
  """Convert a {{ru-decl-noun-z}} template into an equivalent
  {{ru-noun-table}} template, dropping arguments that match the defaults
  (gender, stress, lemma) and canonicalizing the rest (animacy, number,
  case overrides, loc/par/note params).

  decl_z_template: the parsed z-decl template to convert (not modified;
    params are consumed from a copy to detect leftovers).
  subpagetitle: page title without namespace, used to elide a redundant lemma.
  pagemsg: logging callback taking a single message string.
  headword_template: optional parsed headword template used to derive the
    animacy spec and cross-check it against the z-decl's.

  Returns the new ru-noun-table template object, or None if the z-decl
  cannot be interpreted (bad gender/anim spec or empty lemma).
  """
  zdecl = unicode(decl_z_template)
  # Work on a copy; getp() removes params from the copy as they are
  # consumed, so whatever is left at the end is extraneous.
  zdeclcopy = blib.parse_text(zdecl).filter_templates()[0]
  decl_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
  # {{ru-decl-noun-z|звезда́|f-in|d|ё}}
  # {{ru-decl-noun-z|ёж|m-inan|b}}
  def getp(param):
    rmparam(zdeclcopy, param)
    return getparam(decl_z_template, param).strip()
  zlemma = getp("1")
  zgender_anim = getp("2")
  zstress = getp("3")
  # ru-noun-table spells the ё special case as ";ё".
  zspecial = re.sub(u"ё", u";ё", getp("4"))
  m = re.search(r"^([mfn])-(an|in|inan)$", zgender_anim)
  if not m:
    pagemsg("WARNING: Unable to recognize z-decl gender/anim spec, skipping: %s" %
        zgender_anim)
    return None
  zgender, zanim = m.groups()
  if not zlemma:
    pagemsg("WARNING: Empty lemma, skipping: %s" % zdecl)
    return None
  # Remove unnecessary gender
  # Gender must be kept when the lemma ending is ambiguous (-и/-ы, -я/-а
  # with n or m+(1), or -ь).
  need_gender = (re.search(u"[иы]́?$", zlemma) or
      zgender == "n" and re.search(u"[яа]́?$", zlemma) or
      zgender == "m" and re.search(u"[яа]́?$", zlemma) and "(1)" in zspecial or
      zlemma.endswith(u"ь"))
  if not need_gender:
    # Otherwise infer the default gender from the ending and keep the
    # explicit gender only if it disagrees.
    normal_gender = (re.search(u"[оеё]́?$", zlemma) and "n" or
        re.search(u"[ая]́?$", zlemma) and "f" or "m")
    if normal_gender != zgender:
      pagemsg("WARNING: Gender mismatch, normal gender=%s, explicit gender=%s, keeping gender" %
          (normal_gender, zgender))
      need_gender = True
  if need_gender:
    pagemsg("Preserving gender in z-decl: %s" % zdecl)
    zspecial = zgender + zspecial
  else:
    pagemsg("Not preserving gender in z-decl: %s" % zdecl)
  # Remove unnecessary stress
  stressed_lemma = ru.try_to_stress(zlemma)
  def check_defstress(defstr, reason):
    # Log when the explicit stress equals the default; return the default
    # either way so the caller can compare.
    if defstr == zstress:
      pagemsg("Removing stress %s as default because %s: stressed_lemma=%s, template=%s" %
          (defstr, reason, stressed_lemma, zdecl))
    return defstr
  if ru.is_nonsyllabic(stressed_lemma):
    default_stress = check_defstress("b", "nonsyllabic lemma")
  elif re.search(u"([аяоеыи]́|ё́?)$", stressed_lemma):
    default_stress = check_defstress("b", "ending-accented lemma")
    # No need for special-casing for ёнок or а́нин, as they are considered
    # accent a by ru-decl-noun-z
  else:
    default_stress = check_defstress("a", "stem-accented lemma")
  if default_stress == zstress:
    zstress = ""
  else:
    pagemsg("Not removing stress %s: %s" % (zstress, zdecl))
  # Remove unnecessary lemma
  if ru.try_to_stress(subpagetitle) == stressed_lemma:
    pagemsg(u"Removing lemma %s because identical to subpagetitle %s (modulo monosyllabic stress differences): %s" %
        (zlemma, subpagetitle, zdecl))
    zlemma = ""
  # Positional args are stress (if kept), lemma, special spec; then trim
  # trailing empties.
  if zstress:
    decl_template.add("1", zstress)
    offset = 1
  else:
    offset = 0
  decl_template.add(str(1 + offset), zlemma)
  decl_template.add(str(2 + offset), zspecial)
  if not getparam(decl_template, "3"):
    rmparam(decl_template, "3")
  if not getparam(decl_template, "2"):
    rmparam(decl_template, "2")
  if not getparam(decl_template, "1"):
    rmparam(decl_template, "1")
  headword_anim_spec = headword_template and extract_headword_anim_spec(headword_template)
  def anim_mismatch(zdecl_an, allowed_headword_ans):
    # Warn when the z-decl animacy disagrees with the headword-derived one.
    if headword_anim_spec and headword_anim_spec not in allowed_headword_ans:
      pagemsg("WARNING: z-decl anim %s disagrees with headword-derived %s (%s allowed): zdecl=%s, headword=%s" %
          (zdecl_an, headword_anim_spec, ",".join(allowed_headword_ans),
          zdecl, unicode(headword_template)))
  if zanim == "an":
    anim_mismatch(zanim, ["an"])
    pagemsg("Preserving z-decl -an as a=an: %s" % zdecl)
    decl_template.add("a", "an")
  elif zanim == "inan":
    # z-decl "-inan" means bianimate; pick the ordering from the headword
    # when available, else fall back to a=bi.
    anim_mismatch(zanim, ["ai", "ia"])
    if headword_anim_spec in ["ai", "ia"]:
      pagemsg("Converting z-decl -inan to a=%s: %s" %
          (headword_anim_spec, zdecl))
      decl_template.add("a", headword_anim_spec)
    else:
      pagemsg("WARNING: Unable to convert z-decl -inan to a=ai or a=ia, preserving as a=bi: zdecl=%s, headword=%s" %
          (zdecl, unicode(headword_template or "(no headword)")))
      decl_template.add("a", "bi")
  else:
    assert(zanim == "in")
    anim_mismatch(zanim, ["in"])
    # Inanimate is the ru-noun-table default; no a= param needed.
    pagemsg("Dropping z-decl -in as default: %s" % zdecl)
  znum = getp("n")
  if znum:
    if znum == "pl":
      pagemsg("WARNING: Found n=pl in z-decl, should convert manually to plural lemma: %s" %
          zdecl)
    pagemsg("Preserving z-decl n=%s: %s" % (znum, zdecl))
    decl_template.add("n", znum)
  # Explicit case-form overrides carried over verbatim (modulo link
  # canonicalization and the prp_* -> pre_* rename).
  preserve_params = [
    'nom_sg', 'gen_sg', 'dat_sg', 'acc_sg', 'ins_sg', 'prp_sg',
    'nom_pl', 'gen_pl', 'dat_pl', 'acc_pl', 'ins_pl', 'prp_pl',
    'voc'
  ]
  renamed_params = {'prp_sg':'pre_sg', 'prp_pl':'pre_pl'}
  for param in preserve_params:
    val = getp(param)
    if not val:
      continue
    newval = fixup_link(val)
    # Unwrap simple [[link]]s in comma-separated alternatives.
    newvals = re.split(r"\s*,\s*", newval)
    newvals = [re.sub(r"^\[\[([^\[\]|]*)\]\]$", r"\1", x) for x in newvals]
    newval = ",".join(newvals)
    newparam = renamed_params.get(param, param)
    pagemsg("Preserving z-decl override %s=%s%s%s: %s" % (
        newparam, newval,
        "" if newparam == param else "; renamed from %s" % param,
        "" if newval == val else "; canonicalized from %s=%s" % (param, val),
        zdecl))
    decl_template.add(newparam, newval)
  loc = getp("loc")
  if loc:
    # Canonicalize the locative preposition spec to ru-noun-table form.
    if loc == u"в":
      newloc = u"в +"
    elif loc == u"на":
      newloc = u"на +"
    else:
      newloc = u"в/на +"
    pagemsg("Preserving z-decl locative loc=%s (canonicalized from loc=%s): %s" %
        (newloc, loc, zdecl))
    decl_template.add("loc", newloc)
  par = getp("par")
  if par:
    # Any partitive value becomes the generic "+" request.
    newpar = "+"
    pagemsg("Preserving z-decl partitive par=%s (canonicalized from par=%s): %s" %
        (newpar, par, zdecl))
    decl_template.add('par', newpar)
  notes = getp("note")
  if notes:
    pagemsg("WARNING: Found z-decl note=<%s>, converting to notes= but probably needs fixing up with footnote symbol and pltail or similar: %s" %
        (notes, zdecl))
    decl_template.add('notes', notes)
  # Anything left on the copy was never consumed above.
  if zdeclcopy.params:
    pagemsg("WARNING: Extraneous params in z-decl: %s" % unicode(zdeclcopy))
  #pagemsg("Replacing z-decl %s with regular decl %s" %
  #    (zdecl, unicode(decl_template)))
  return decl_template
def split_one_page_etymologies(page, index, pagetext, verbose):
  """Split the Arabic section of a page into one "Etymology N" subsection
  per distinct lemma.

  Existing numbered etymology sections are dissolved and renumbered;
  adjacent verb-form-class-I etymologies are recombined; sections such as
  ===Alternative forms=== are repositioned relative to ===Etymology===.

  page: page object (only .title() is used).
  index: numeric page index, used in log messages.
  pagetext: full wikitext of the page.
  verbose: if true, log the complete before/after text on change.

  Returns (pagetext, comment) where pagetext is the possibly-modified
  wikitext and comment summarizes the changes.
  """
  # Fetch pagename, create pagemsg() fn to output msg with page name included
  pagename = page.title()
  pagetext = unicode(pagetext)
  # FIX: accept (and ignore) simple=, because the summary call at the end
  # passes simple=True; previously that call raised TypeError.
  def pagemsg(text, simple=False):
    msg("Page %s %s: %s" % (index, pagename, text))

  comment = None
  notes = []

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n)(\n*(\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      pagetext, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = pagetext
    pagetail = ""

  # Split into sections
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)

  # Extract off pagehead and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Go through each section in turn, looking for existing Arabic section
  for i in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[i], re.M)
    if not m:
      pagemsg("WARNING: Can't find language name in text: [[%s]]" % (sections[i]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n)(\n*--+\n*)$", sections[i], re.S)
      if mm:
        sections[i:i+1] = [mm.group(1), mm.group(2)]
      elif i < len(sections) - 1:
        pagemsg("WARNING: Arabic language section %s is non-final and missing trailing separator" % i)

      # Flag headers whose leading and trailing equal-sign runs don't match
      for mm in re.finditer("^(==+)[^=\n](==+)$", sections[i], re.M):
        if mm.group(1) != mm.group(2):
          pagemsg("WARNING: Malconstructed header: %s" % mm.group(0))

      subsections = re.split("(^===[^=\n]+=+\n)", sections[i], 0, re.M)
      if len(subsections) < 2:
        pagemsg("WARNING: Page missing any entries")

      etymologies = []
      etymsections = []
      sechead = subsections[0]
      if "\n===Etymology 1=" in sections[i]:
        etyms_were_separate = True
        for j in xrange(1, len(subsections), 2):
          if not re.match("^===Etymology [0-9]+=", subsections[j]):
            pagemsg("WARNING: Non-etymology level-3 header when split etymologies: %s" % subsections[j][0:-1])
        etymsections = [subsections[j] for j in xrange(2, len(subsections), 2)]
        # Reduce indent by one. We will increase it again when we split
        # etymologies.
        for j in xrange(len(etymsections)):
          etymsections[j] = re.sub("^==", "=", etymsections[j], 0, re.M)
      else:
        etyms_were_separate = False
        # FIX: must be a one-element list; a bare string here made the
        # following loop iterate over individual characters.
        etymsections = [''.join(subsections[1:])]

      for etymsection in etymsections:
        subsections = re.split("(^===[^=\n]+=+\n)", etymsection, 0, re.M)
        if len(subsections) < 2:
          pagemsg("WARNING: Section missing any entries")
        split_sections = []
        next_split_section = 0
        # Append header+body pair k of subsections to the current
        # split section, creating empty slots as needed.
        def append_section(k):
          while len(split_sections) <= next_split_section:
            split_sections.append("")
          split_sections[next_split_section] += \
              subsections[k] + subsections[k + 1]

        last_lemma = None
        last_inflection_of_lemma = None
        for j in xrange(1, len(subsections), 2):
          if re.match("^===+(References|Related|See)", subsections[j]):
            pagemsg("Found level-3 section that should maybe be at higher level: %s" % subsections[j][0:-1])
            append_section(j)
          elif re.match("^===+(Alternative|Etymology)", subsections[j]):
            append_section(j)
          else:
            parsed = blib.parse_text(subsections[j + 1])
            lemma = None
            inflection_of_lemma = None
            for t in parsed.filter_templates():
              if t.name in arabic_all_headword_templates:
                if lemma:
                  if t.name not in ["ar-nisba", "ar-noun-nisba", "ar-verb", "ar-verb-form"]:
                    pagemsg("Found multiple headword templates in section %s: %s" % (j, subsections[j][0:-1]))
                # Note: For verbs this is the form class, which we match on
                lemma = reorder_shadda(remove_links(getparam(t, "1")))
              if t.name == "inflection of":
                if inflection_of_lemma:
                  pagemsg("Found multiple 'inflection of' templates in section %s: %s" % (j, subsections[j][0:-1]))
                inflection_of_lemma = remove_diacritics(
                    remove_links(getparam(t, "1")))
            if not lemma:
              pagemsg("Warning: No headword template in section %s: %s" % (j, subsections[j][0:-1]))
              append_section(j)
            else:
              # Start a new etymology whenever the lemma changes, or (for
              # verb forms sharing a form class) when the inflection-of
              # target changes.
              if lemma != last_lemma:
                next_split_section += 1
              elif (inflection_of_lemma and last_inflection_of_lemma and
                  inflection_of_lemma != last_inflection_of_lemma):
                pagemsg("Verb forms have different inflection-of lemmas %s and %s, splitting etym" % (
                    last_inflection_of_lemma, inflection_of_lemma))
                next_split_section += 1
              last_lemma = lemma
              last_inflection_of_lemma = inflection_of_lemma
              append_section(j)
        etymologies += split_sections

      # Combine adjacent etymologies with same verb form class I.
      # FIXME: We might not want to do this; the etymologies might be
      # legitimately split. Need to check each case.
      j = 0
      while j < len(etymologies) - 1:
        def get_form_class(k):
          formclass = None
          # FIX: was etymologies[j], which ignored k and always inspected
          # the first of the two etymologies being compared.
          parsed = blib.parse_text(etymologies[k])
          for t in parsed.filter_templates():
            if t.name in ["ar-verb", "ar-verb-form"]:
              newformclass = getparam(t, "1")
              if formclass and newformclass and formclass != newformclass:
                pagemsg("WARNING: Something wrong: Two different verb form classes in same etymology: %s != %s" % (formclass, newformclass))
              formclass = newformclass
          return formclass
        formclassj = get_form_class(j)
        formclassj1 = get_form_class(j + 1)
        if formclassj == "I" and formclassj1 == "I":
          if not etymologies[j + 1].startswith("="):
            pagemsg("WARNING: Can't combine etymologies with same verb form class because second has etymology text")
          else:
            pagemsg("Combining etymologies with same verb form class I")
            etymologies[j] = etymologies[j].rstrip() + "\n\n" + etymologies[j + 1]
            # FIX: remove the absorbed etymology; without this the loop
            # re-examined the same (still class-I) pair forever.
            del etymologies[j + 1]
            # Cancel out effect of incrementing j below since we combined
            # the following etymology into this one
            j -= 1
        j += 1

      if len(etymologies) > 1:
        for j in xrange(len(etymologies)):
          # Stuff like "===Alternative forms===" that goes before the
          # etymology section should be moved after.
          newetymj = re.sub(r"^(.*?\n)(===Etymology===\n(\n|[^=\n].*?\n)*)",
              r"\2\1", etymologies[j], 0, re.S)
          if newetymj != etymologies[j]:
            pagemsg("Moved ===Alternative forms=== and such after Etymology")
            etymologies[j] = newetymj
          # Remove ===Etymology=== from beginning
          etymologies[j] = re.sub("^===Etymology===\n", "", etymologies[j])
          # Fix up newlines around etymology section
          # FIX: was 'etyomologies' (NameError).
          etymologies[j] = etymologies[j].strip() + "\n\n"
          if etymologies[j].startswith("="):
            etymologies[j] = "\n" + etymologies[j]
        sections[i] = (sechead +
            ''.join(["===Etymology %s===\n" % (j + 1) + etymologies[j]
                for j in xrange(len(etymologies))]))
      elif len(etymologies) == 1:
        if etyms_were_separate:
          # We might need to add an Etymology header at the beginning.
          pagemsg("Combined formerly separate etymologies")
          if not re.match(r"^(=|\{\{wikipedia|\[\[File:)", etymologies[0].strip()):
            etymologies[0] = "===Etymology===\n" + etymologies[0]
            pagemsg("Added Etymology header when previously separate etymologies combined")
          # Put Alternative forms section before Etymology.
          newetym0 = re.sub(r"^((?:\n|[^=\n].*?\n)*)(===Etymology===\n(?:\n|[^=\n].*?\n)*)(===(Alternative.*?)===\n(?:\n|[^=\n].*?\n)*)",
              r"\1\3\2", etymologies[0], 0, re.S)
          if newetym0 != etymologies[0]:
            pagemsg("Moved ===Alternative forms=== and such before Etymology")
            etymologies[0] = newetym0
        sections[i] = sechead + etymologies[0]
      else:
        sections[i] = sechead
      break

  # End of loop over sections in existing page; rejoin sections
  newtext = pagehead + ''.join(sections) + pagetail

  # Don't signal a save if only differences are whitespace at end,
  # since it appears that newlines at end get stripped when saving.
  if pagetext.rstrip() == newtext.rstrip():
    pagemsg("No change in text")
  else:
    if verbose:
      pagemsg("Replacing [[%s]] with [[%s]]" % (pagetext, newtext))
    else:
      pagemsg("Text has changed")
    pagetext = newtext

    # Construct and output comment.
    # NOTE(review): nothing in this function ever appends to notes, so
    # comment stays None and this assert fires on any change — TODO confirm
    # against the original script whether notes was populated elsewhere.
    notestext = '; '.join(notes)
    if notestext:
      if comment:
        comment += " (%s)" % notestext
      else:
        comment = notestext
    assert(comment)
    pagemsg("comment = %s" % comment, simple = True)

  return pagetext, comment
def process_page(index, page, save, verbose):
  """Replace French conjugation templates with {{fr-conj-auto}}.

  For each template on the page whose name is in templates_to_change or
  refl_templates_to_change (module-level sets), compare the old template's
  expansion against {{fr-conj-auto}} via compare_conjugation(); when they
  agree, rewrite the template in place, carrying over only a non-avoir
  auxiliary (aux=) and a refl=yes flag for reflexive templates.

  index: numeric page index, used in log messages.
  page: pywikibot-style page object (.title(), .text, .save()).
  save: if true, actually save the page; otherwise only log.
  verbose: passed through to blib.expand_text.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)
  notes = []

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    name = unicode(t.name)
    if name in templates_to_change or name in refl_templates_to_change:
      refl = name in refl_templates_to_change
      # None = couldn't compare; truthy = differences found (skip with
      # warning); falsy-but-not-None = identical, safe to replace.
      difvals = compare_conjugation(index, page, t, refl, pagemsg, expand_text, verbose)
      if difvals is None:
        pass
      elif difvals:
        difprops = []
        for prop, (oldval, newval) in difvals:
          difprops.append("%s=%s vs. %s" % (prop, oldval or "(missing)", newval or "(missing)"))
        pagemsg("WARNING: Different conjugation when changing template %s to {{fr-conj-auto}}: %s" % (
            unicode(t), "; ".join(difprops)))
      else:
        aux = ""
        for param in t.params:
          pname = unicode(param.name)
          pval = unicode(param.value)
          # Blank params carry no information; ignore them.
          if not pval.strip():
            continue
          if (pname not in ["1", "2", "3", "aux", "sort", "cat"] or
              pname == "3" and pval not in ["avoir", "être", "avoir or être"]):
            pagemsg("WARNING: Found extra param %s=%s in %s" % (pname, pval, unicode(t)))
          if pname == "aux" and pval != "avoir":
            aux = pval
            pagemsg("Found non-avoir auxiliary aux=%s in %s" % (
                pval, unicode(t)))
          # Which positional/named param holds the auxiliary depends on
          # the specific old template.
          auxpname = ("3" if name in ["fr-conj-e-er", "fr-conj-ir (s)"] else
              "aux" if name in ["fr-conj-xx-er", "fr-conj-é-er"] else "2")
          if pname == auxpname and pval != "avoir":
            aux = pval
            pagemsg("Found non-avoir auxiliary %s=%s in %s" % (
                pname, pval, unicode(t)))
        oldt = unicode(t)
        # Wipe all params and rebuild as {{fr-conj-auto}}, keeping only
        # refl= and a non-default aux=.
        del t.params[:]
        t.name = "fr-conj-auto"
        if refl:
          t.add("refl", "yes")
        if aux:
          t.add("aux", aux)
        newt = unicode(t)
        pagemsg("Replacing %s with %s" % (oldt, newt))
        notes.append("replaced {{%s}} with %s" % (name, newt))

  newtext = unicode(parsed)
  if newtext != text:
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Convert {{head|ru|noun form}} to {{ru-noun form}} and canonicalize
  existing {{ru-noun form}} templates (param order: 1=head, 2=gender,
  then head2/g2/g3/tr/tr2).

  index: numeric page index, used in log messages.
  page: pywikibot-style page object (.title(), .text, .save()).
  save: if true, actually save the page; otherwise only log.
  verbose: if true, log the full before/after text on change.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    if unicode(t.name) == "head" and getparam(t, "1") == "ru" and getparam(t, "2") == "noun form":
      # An unexpected positional param means this template doesn't fit the
      # pattern; abort the whole page rather than mangle it.
      if getparam(t, "3"):
        pagemsg("WARNING: Found param 3 in {{head|ru|noun form}}: %s" % unicode(t))
        return
      rmparam(t, "1")
      rmparam(t, "2")
      # Pull off (and remove) every param we know how to carry over.
      head = getrmparam(t, "head")
      head2 = getrmparam(t, "head2")
      tr = getrmparam(t, "tr")
      tr2 = getrmparam(t, "tr2")
      g = getrmparam(t, "g")
      g2 = getrmparam(t, "g2")
      g3 = getrmparam(t, "g3")
      # Anything left over is unrecognized; abort.
      if len(t.params) > 0:
        pagemsg("WARNING: Extra params in noun form template: %s" % unicode(t))
        return
      t.name = "ru-noun form"
      # 1= must be present (possibly empty) whenever 2= (gender) is added.
      if head or g:
        t.add("1", head)
      if head2:
        t.add("head2", head2)
      if g:
        t.add("2", g)
      if g2:
        t.add("g2", g2)
      if g3:
        t.add("g3", g3)
      if tr:
        t.add("tr", tr)
      if tr2:
        t.add("tr2", tr2)
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replaced %s with %s" % (origt, newt))
        notes.append("convert {{head|ru|noun form}} to {{ru-noun form}}")
    elif unicode(t.name) == "ru-noun form":
      # Can't merge when both the positional and named variant of the same
      # value are present; abort.
      if getparam(t, "head") and getparam(t, "1"):
        pagemsg("WARNING: ru-noun form has both params 1= and head=: %s" % unicode(t))
        return
      if getparam(t, "g") and getparam(t, "2"):
        pagemsg("WARNING: ru-noun form has both params 2= and g=: %s" % unicode(t))
        return
      head = getrmparam(t, "1") or getrmparam(t, "head")
      head2 = getrmparam(t, "head2")
      tr = getrmparam(t, "tr")
      tr2 = getrmparam(t, "tr2")
      g = getrmparam(t, "2") or getrmparam(t, "g")
      g2 = getrmparam(t, "g2")
      g3 = getrmparam(t, "g3")
      if len(t.params) > 0:
        pagemsg("WARNING: Extra params in noun form template: %s" % unicode(t))
        return
      # Re-add in canonical order.
      if head or g:
        t.add("1", head)
      if head2:
        t.add("head2", head2)
      if g:
        t.add("2", g)
      if g2:
        t.add("g2", g2)
      if g3:
        t.add("g3", g3)
      if tr:
        t.add("tr", tr)
      if tr2:
        t.add("tr2", tr2)
      newt = unicode(t)
      if origt != newt:
        pagemsg("Replaced %s with %s" % (origt, newt))
        notes.append("canonicalize ru-noun form")

  new_text = unicode(parsed)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Remove redundant feminine/plural params from {{fr-adj}} templates.

  Derives the expected feminine, masculine-plural, feminine-plural and
  common plural from the page title using standard French adjective
  morphology, removes f=/mp=/fp=/p= params matching those defaults, and
  warns about inconsistent combinations (inv= with other params, fp=
  without f=, f= together with 1=mf, etc.).

  index: numeric page index, used in log messages.
  page: pywikibot-style page object (.title(), .text, .save()).
  save: if true, actually save the page; otherwise only log.
  verbose: unused here beyond signature compatibility with blib drivers.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping")
    return

  text = unicode(page.text)
  notes = []

  parsed = blib.parse_text(text)
  for t in parsed.filter_templates():
    origt = unicode(t)
    name = unicode(t.name)
    if unicode(t.name) == "fr-adj":
      # 1= is only meaningful as "mf" (invariant-gender adjective).
      g = getparam(t, "1")
      if g and g != "mf":
        pagemsg("WARNING: Strange value 1=%s, removing: %s" % (g, unicode(t)))
        rmparam(t, "1")
        notes.append("remove bogus 1=%s" % g)
        g = None
      # inv= (invariable) should appear alone; skip further cleanup.
      inv = getparam(t, "inv")
      if inv:
        if inv not in ["y", "yes", "1"]:
          pagemsg("WARNING: Strange value inv=%s: %s" % (inv, unicode(t)))
        if (getparam(t, "1") or getparam(t, "f") or getparam(t, "mp") or
            getparam(t, "fp") or getparam(t, "p")):
          pagemsg("WARNING: Found extraneous params with inv=: %s" % unicode(t))
        continue
      # Multiple alternative forms are too complex to handle here.
      if (getparam(t, "f2") or getparam(t, "mp2") or getparam(t, "fp2") or
          getparam(t, "p2")):
        pagemsg("Skipping multiple feminines or plurals: %s" % unicode(t))
        continue
      # Default masculine plural: unchanged after -s/-x, -al -> -aux,
      # otherwise add -s.
      expected_mp = (pagetitle if re.search("[sx]$", pagetitle) else
          re.sub("al$", "aux", pagetitle) if pagetitle.endswith("al") else
          pagetitle + "s")
      if getparam(t, "mp") == expected_mp:
        rmparam(t, "mp")
        notes.append("remove redundant mp=")
      # Default feminine by suffix, most specific suffix first.
      expected_fem = (pagetitle if pagetitle.endswith("e") else
          pagetitle + "ne" if pagetitle.endswith("en") else
          re.sub("er$", u"ère", pagetitle) if pagetitle.endswith("er") else
          pagetitle + "le" if pagetitle.endswith("el") else
          pagetitle + "ne" if pagetitle.endswith("on") else
          pagetitle + "te" if pagetitle.endswith("et") else
          pagetitle + "e" if pagetitle.endswith("ieur") else
          re.sub("teur$", "trice", pagetitle) if pagetitle.endswith("teur") else
          re.sub("eur$", "euse", pagetitle) if pagetitle.endswith("eur") else
          re.sub("eux$", "euse", pagetitle) if pagetitle.endswith("eux") else
          re.sub("if$", "ive", pagetitle) if pagetitle.endswith("if") else
          re.sub("c$", "que", pagetitle) if pagetitle.endswith("c") else
          pagetitle + "e")
      # These suffixes have irregular feminines, so an explicit f= (or
      # 1=mf) is expected.
      if re.search("(el|on|et|[^i]eur|eux|if|c)$", pagetitle) and not getparam(t, "f") and g != "mf":
        pagemsg("WARNING: Found suffix -el/-on/-et/-[^i]eur/-eux/-if/-c and no f= or 1=mf: %s" % unicode(t))
      if getparam(t, "f") == expected_fem:
        rmparam(t, "f")
        notes.append("remove redundant f=")
      fem = getparam(t, "f") or expected_fem
      if not fem.endswith("e"):
        if not getparam(t, "fp"):
          pagemsg("WARNING: Found f=%s not ending with -e and no fp=: %s" % (fem, unicode(t)))
          continue
      expected_fp = fem + "s"
      if getparam(t, "fp") == expected_fp:
        rmparam(t, "fp")
        notes.append("remove redundant fp=")
      if getparam(t, "fp") and not getparam(t, "f"):
        pagemsg("WARNING: Found fp=%s and no f=: %s" % (getparam(t, "fp"), unicode(t)))
        continue
      if getparam(t, "fp") == fem:
        pagemsg("WARNING: Found fp=%s same as fem=%s: %s" % (getparam(t, "fp"), fem, unicode(t)))
        continue
      # -e adjectives with no explicit feminine forms behave as mf.
      if pagetitle.endswith("e") and not getparam(t, "f") and not getparam(t, "fp"):
        if g == "mf":
          rmparam(t, "1")
          notes.append("remove redundant 1=mf")
        g = "mf"
      if g == "mf":
        # mf adjectives should carry p= only, not f=/mp=/fp=.
        f = getparam(t, "f")
        if f:
          pagemsg("WARNING: Found f=%s and 1=mf: %s" % (f, unicode(t)))
        mp = getparam(t, "mp")
        if mp:
          pagemsg("WARNING: Found mp=%s and 1=mf: %s" % (mp, unicode(t)))
        fp = getparam(t, "fp")
        if fp:
          pagemsg("WARNING: Found fp=%s and 1=mf: %s" % (fp, unicode(t)))
        if f or mp or fp:
          continue
        expected_p = (pagetitle if re.search("[sx]$", pagetitle) else
            re.sub("al$", "aux", pagetitle) if pagetitle.endswith("al") else
            pagetitle + "s")
        if getparam(t, "p") == expected_p:
          rmparam(t, "p")
          notes.append("remove redundant p=")
        elif getparam(t, "p"):
          pagemsg("WARNING: Found unexpected p=%s: %s" % (getparam(t, "p"), unicode(t)))
      # Single words should normally have fully-predictable inflections.
      if not re.search("[ -]", pagetitle) and (getparam(t, "f") or
          getparam(t, "mp") or getparam(t, "fp") or getparam(t, "p")):
        pagemsg("Found remaining explicit feminine or plural in single-word base form: %s" % unicode(t))
    newt = unicode(t)
    if origt != newt:
      pagemsg("Replacing %s with %s" % (origt, newt))

  newtext = unicode(parsed)
  if newtext != text:
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Canonicalize Russian verb-form definitions in the ==Russian== section.

  Performs, in order: rename {{conjugation of}} to {{inflection of}};
  split 'present or future' into separate pres/fut defns; normalize form
  codes (indc->ind, futr->fut, etc.); drop blank form codes and fix the
  position of lang=/tr=; canonicalize imperative/past/pres/fut form-code
  combinations; and convert several raw-text definition patterns
  ("past participle active of ...", "2nd person singular present of ...",
  etc.) into {{inflection of}} templates.

  index: numeric page index, used in log messages.
  page: pywikibot-style page object (.title(), .text, .save()).
  save: if true, actually save the page; otherwise only log.
  verbose: if true, log the full before/after text on change.
  """
  pagetitle = unicode(page.title())
  subpagetitle = re.sub("^.*:", "", pagetitle)
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if ":" in pagetitle:
    pagemsg("WARNING: Colon in page title, skipping page")
    return

  text = unicode(page.text)
  notes = []

  foundrussian = False
  # Even indices are section bodies, odd indices the ==...== headers.
  sections = re.split("(^==[^=]*==\n)", text, 0, re.M)

  for j in xrange(2, len(sections), 2):
    if sections[j-1] == "==Russian==\n":
      if foundrussian:
        pagemsg("WARNING: Found multiple Russian sections, skipping page")
        return
      foundrussian = True

      # Try to canonicalize existing 'conjugation of'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "conjugation of" and getparam(t, "lang") == "ru":
          origt = unicode(t)
          t.name = "inflection of"
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("converted 'conjugation of' to 'inflection of'")
      sections[j] = unicode(parsed)

      # Try to split 'inflection of' containing 'present or future' into two
      # defns
      newsec = re.sub(r"^# \{\{inflection of\|(.*?)\|present or future\|(.*?)\}\}$",
          r"# {{inflection of|\1|pres|\2}}\n# {{inflection of|\1|fut|\2}}",
          sections[j], 0, re.M)
      if newsec != sections[j]:
        notes.append("split 'present or future' form code into two defns with 'pres' and 'fut'")
        sections[j] = newsec

      # Convert 'indc' to 'ind', 'futr' to 'fut', 'perfective' and
      # '(perfective)' to 'pfv', 'imperfective' and '(imperfective)' to 'impfv',
      # 'impr' to 'imp'
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          for frm, to in [
              ("indc", "ind"),
              ("indicative", "ind"),
              ("futr", "fut"),
              ("future", "fut"),
              ("impr", "imp"),
              ("imperative", "imp"),
              ("perfective", "pfv"),
              ("(perfective)", "pfv"),
              ("imperfective", "impfv"),
              ("(imperfective)", "impfv"),
              ("singular", "s"),
              ("(singular)", "s"),
              ("plural", "p"),
              ("(plural)", "p"),
              ("masculine", "m"),
              ("(masculine)", "m"),
              ("feminine", "f"),
              ("(feminine)", "f"),
              ("neuter", "n"),
              ("(neuter)", "n"),
              ("neutral", "n"),
              ("(neutral)", "n"),
          ]:
            origt = unicode(t)
            # Form codes live in positional params 3 and up.
            for i in xrange(3,20):
              val = getparam(t, str(i))
              if val == frm:
                t.add(str(i), to)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("converted '%s' form code to '%s'" % (frm, to))
      sections[j] = unicode(parsed)

      # Remove blank form codes and canonicalize position of lang=, tr=
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          origt = unicode(t)
          # Fetch the numbered params starting with 3, skipping blank ones
          numbered_params = []
          for i in xrange(3,20):
            val = getparam(t, str(i))
            if val:
              numbered_params.append(val)
          # Fetch param 1 and param 2, and non-numbered params except lang=
          # and nocat=.
          param1 = getparam(t, "1")
          param2 = getparam(t, "2")
          tr = getparam(t, "tr")
          nocat = getparam(t, "nocat")
          non_numbered_params = []
          for param in t.params:
            pname = unicode(param.name)
            if not re.search(r"^[0-9]+$", pname) and pname not in ["lang", "nocat", "tr"]:
              non_numbered_params.append((pname, param.value))
          # Erase all params.
          del t.params[:]
          # Put back lang, param 1, param 2, tr, then the replacements for the
          # higher numbered params, then the non-numbered params.
          t.add("lang", "ru")
          t.add("1", param1)
          t.add("2", param2)
          if tr:
            t.add("tr", tr)
          for i, param in enumerate(numbered_params):
            t.add(str(i+3), param)
          for name, value in non_numbered_params:
            t.add(name, value)
          newt = unicode(t)
          if origt != newt:
            pagemsg("Replaced %s with %s" % (origt, newt))
            notes.append("removed any blank form codes and maybe rearranged lang=, tr=")
            if nocat:
              notes.append("removed nocat=")
      sections[j] = unicode(parsed)

      # Try to canonicalize 'inflection of' involving the imperative,
      # present, future
      parsed = blib.parse_text(sections[j])
      for t in parsed.filter_templates():
        if unicode(t.name) == "inflection of" and getparam(t, "lang") == "ru":
          # Fetch the numbered params starting with 3
          numbered_params = []
          for i in xrange(3,20):
            val = getparam(t, str(i))
            if val:
              numbered_params.append(val)
          while len(numbered_params) > 0 and not numbered_params[-1]:
            del numbered_params[-1]
          # Now canonicalize
          numparamstr = "/".join(numbered_params)
          numparamset = set(numbered_params)
          canon_params = []
          # One-shot loop so the regex case can fall through via break.
          while True:
            if numparamset == {'s', 'pfv', 'imp'}:
              canon_params = ['2', 's', 'pfv', 'imp']
            elif numparamset == {'s', 'impfv', 'imp'}:
              canon_params = ['2', 's', 'impfv', 'imp']
            elif numparamset == {'s', 'imp'}:
              canon_params = ['2', 's', 'imp']
            elif numparamset == {'p', 'pfv', 'imp'}:
              canon_params = ['2', 'p', 'pfv', 'imp']
            elif numparamset == {'p', 'impfv', 'imp'}:
              canon_params = ['2', 'p', 'impfv', 'imp']
            elif numparamset == {'p', 'imp'}:
              canon_params = ['2', 'p', 'imp']
            elif numparamset == {'m', 's', 'past'}:
              canon_params = ['m', 's', 'past', 'ind']
            elif numparamset == {'f', 's', 'past'}:
              canon_params = ['f', 's', 'past', 'ind']
            elif numparamset == {'n', 's', 'past'}:
              canon_params = ['n', 's', 'past', 'ind']
            elif numparamset == {'p', 'past'}:
              canon_params = ['p', 'past', 'ind']
            else:
              m = re.search(r"^([123])/([sp])/(pres|fut)$", numparamstr)
              if m:
                canon_params = [m.group(1), m.group(2), m.group(3), "ind"]
            break
          if canon_params:
            origt = unicode(t)
            # Fetch param 1 and param 2. Erase all numbered params.
            # Put back param 1 and param 2 (this will put them after lang=ru),
            # then the replacements for the higher params.
            param1 = getparam(t, "1")
            param2 = getparam(t, "2")
            for i in xrange(19,0,-1):
              rmparam(t, str(i))
            t.add("1", param1)
            t.add("2", param2)
            for i, param in enumerate(canon_params):
              t.add(str(i+3), param)
            newt = unicode(t)
            if origt != newt:
              pagemsg("Replaced %s with %s" % (origt, newt))
              notes.append("canonicalized 'inflection of' for %s" % "/".join(canon_params))
            else:
              pagemsg("Apparently already canonicalized: %s" % newt)
      sections[j] = unicode(parsed)

      # Try to add 'inflection of' to raw-specified participial inflection
      def add_participle_inflection_of(m):
        prefix = m.group(1)
        tense = m.group(2).lower()
        if tense == "present":
          tense = "pres"
        voice = m.group(3).lower()
        if voice == "active":
          voice = "act"
        elif voice == "passive":
          voice = "pass"
        elif voice == "adverbial":
          voice = "adv"
        lemma = m.group(4)
        retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|part}}" % (lemma, tense, voice)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s/%s/part" % (tense, voice))
        return retval
      # Two patterns: "... participle active of" and "... active participle of".
      newsec = re.sub(r"(# |\()'*(present|past) participle (active|passive|adverbial) of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*",
          add_participle_inflection_of, sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*(present|past) (active|passive|adverbial) participle of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*",
          add_participle_inflection_of, newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified past inflection
      def add_past_inflection_of(m):
        prefix = m.group(1)
        gender = {"masculine":"m", "male":"m", "feminine":"f", "female":"f",
            "neuter":"n", "neutral":"n", "plural":"p"}[m.group(2).lower()]
        lemma = m.group(3)
        # Singular forms additionally get the 's' code.
        retval = prefix + "{{inflection of|lang=ru|%s||%s%s|past|ind}}" % (lemma, gender,
            gender != "p" and "|s" or "")
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s%s/past/ind" % (gender,
            gender != "p" and "/s" or ""))
        return retval
      newsec = re.sub(r"(# |\()'*(male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)past (?:tense |form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*",
          add_past_inflection_of, sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*past(?:-tense| tense|) (male|masculine|female|feminine|neutral|neuter|plural) (?:singular |)(?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*",
          add_past_inflection_of, newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified imperative inflection
      def add_imper_inflection_of(m):
        prefix = m.group(1)
        number = {"singular":"s", "plural":"p"}[m.group(2).lower()]
        lemma = m.group(3)
        retval = prefix + "{{inflection of|lang=ru|%s||2|%s|imp}}" % (lemma, number)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for 2/%s/imp" % number)
        return retval
      newsec = re.sub(r"(# |\()'*(singular|plural) imperative (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*",
          add_imper_inflection_of, sections[j], 0, re.I)
      newsec = re.sub(r"(# |\()'*imperative (singular|plural) (?:form |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*",
          add_imper_inflection_of, newsec, 0, re.I)
      sections[j] = newsec

      # Try to add 'inflection of' to raw-specified finite pres/fut inflection
      def add_pres_fut_inflection_of(m):
        prefix = m.group(1)
        # "1st"/"2nd"/"3rd" -> "1"/"2"/"3".
        person = m.group(2)[0]
        number = {"singular":"s", "plural":"p"}[m.group(3).lower()]
        tense = {"present":"pres", "future":"fut"}[m.group(4).lower()]
        lemma = m.group(5)
        retval = prefix + "{{inflection of|lang=ru|%s||%s|%s|%s|ind}}" % (lemma, person, number, tense)
        pagemsg("Replaced <%s> with %s" % (m.group(0), retval))
        notes.append("converted raw to 'inflection of' for %s/%s/%s/ind" % (person, number, tense))
        return retval
      newsec = re.sub(r"(# |\()'*(1st|2nd|3rd)(?:-person| person|) (singular|plural) (present|future) (?:tense |)of'* '*(?:\[\[|\{\{[lm]\|ru\||\{\{term\|)([^|]*?)(?:\]\]|\}\}|\|+lang=ru\}\})'*",
          add_pres_fut_inflection_of, sections[j], 0, re.I)
      sections[j] = newsec

  new_text = "".join(sections)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def create_declension(page, index, save, pos, tempname, decltempname, sgnum,
    removeparams, is_proper=False):
  """Add a ===Declension=== subsection for Arabic entries on PAGE.

  Scans the Arabic language section of PAGE for subsections whose header
  matches POS (e.g. "Noun", "Adjective") and whose headword template is
  TEMPNAME. From that headword template's parameters (minus REMOVEPARAMS
  and with i3rab/definite-article cleanup) it builds a
  {{DECLTEMPNAME|...}} call and inserts it in a new Declension subsection.
  SGNUM is the idafa number value used for modifier constructions ("sg"
  etc.); IS_PROPER enables proper-noun-specific handling. If SAVE is true
  the page is saved, otherwise the change is only logged. INDEX is used
  only in log messages.

  NOTE(review): relies on module-level names (msg, blib, getparam,
  addparam, remove_links, remove_diacritics, reorder_shadda, ar_translit,
  adjectival_phrases, verbose, and the Arabic diacritic constants ALIF,
  A, L, SK, SH, UN, U, IN, I, Y, TAM, ALIF_WASLA, ALIF_ANY,
  DIACRITIC_ANY_BUT_SH) defined elsewhere in this file.
  """
  pagename = page.title()
  # One comment string is accumulated per declension actually added;
  # they are joined into the save comment at the end.
  comments = []
  def pgmsg(text):
    # Page-scoped log message.
    msg("Page %s %s: %s" % (index, pagename, text))

  # Starts with definite article al-
  def starts_with_al(text):
    return re.match(ALIF_ANY + A + "?" + L, text)

  # Like re.sub() but returns "" (falsy) when the pattern doesn't match,
  # so callers can chain alternatives with `or`.
  def sub_if(fr, to, text):
    if re.search(fr, text):
      return re.sub(fr, to, text)
    else:
      return ""

  # Remove definite article al- from text
  def remove_al(text):
    return (sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?(.)" + SH, r"\1", text)
        or sub_if("^" + ALIF_ANY + A + "?" + L + SK + "?", "", text)
        or text)

  # Remove definite article al- from transliterated text
  # (handles sun-letter assimilation, e.g. "aš-š..." -> "š...").
  def remove_al_tr(text):
    return (sub_if(ur"^a?([sšṣtṯṭdḏḍzžẓnrḷ])-\1", r"\1", text)
        or sub_if("^a?l-", "", text)
        or text)

  # Split off interwiki links at end
  m = re.match(r"^(.*?\n+)((\[\[[a-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
      page.text, re.S)
  if m:
    pagebody = m.group(1)
    pagetail = m.group(2)
  else:
    pagebody = page.text
    pagetail = ""

  # Split top-level sections (by language)
  splitsections = re.split("(^==[^=\n]+==\n)", pagebody, 0, re.M)
  # Extract off head and recombine section headers with following text
  pagehead = splitsections[0]
  sections = []
  for i in xrange(1, len(splitsections)):
    if (i % 2) == 1:
      sections.append("")
    sections[-1] += splitsections[i]

  # Look for Arabic section
  for seci in xrange(len(sections)):
    m = re.match("^==([^=\n]+)==$", sections[seci], re.M)
    if not m:
      pgmsg("Can't find language name in text: [[%s]]" % (sections[seci]))
    elif m.group(1) == "Arabic":
      # Extract off trailing separator
      mm = re.match(r"^(.*?\n+)(--+\n*)$", sections[seci], re.S)
      if mm:
        secbody = mm.group(1)
        sectail = mm.group(2)
      else:
        secbody = sections[seci]
        sectail = ""

      # Split into subsections based on headers
      subsections = re.split("(^===+[^=\n]+===+\n)", secbody, 0, re.M)
      # Go through each subsection
      for j in xrange(len(subsections)):
        notes = []
        def add_note(note):
          # Record a note once; notes become the parenthesized part of
          # the per-entry comment.
          if note not in notes:
            notes.append(note)
        # Look for subsections matching the given POS
        if j > 0 and (j % 2) == 0 and re.match("^===+%s===+\n" % pos, subsections[j - 1]):
          # Call reorder_shadda here so the templates we work with have
          # shadda in correct order but we don't mess with other text to
          # avoid unnecessary saving
          parsed = blib.parse_text(reorder_shadda(subsections[j]))
          def pagemsg(text):
            # Log message scoped to this subsection's text.
            pgmsg("%s: [[%s]]" % (text, subsections[j]))

          # Check for various conditions causing us to skip this entry and
          # not try to add a declension table

          # Skip declension if certain templates found in definition.
          # We don't check for {{alternative form of|...}}, because it's
          # used for e.g. different ways of spelling "camera" in Arabic,
          # some with -ā and some with -a, so we still want to create
          # declensions for those.
          altspelling_templates = [temp for temp in parsed.filter_templates()
              if temp.name in ["alternative spelling of"]]
          if len(altspelling_templates) > 0:
            pagemsg("Alternative spelling redirect found in text, skipping")
            continue
          if pos == "Adjective":
            feminine_of_templates = [temp for temp in parsed.filter_templates()
                if temp.name in ["feminine of"]]
            if len(feminine_of_templates) > 0:
              pagemsg("feminine-of template found for adjective, skipping")
              continue

          # Retrieve headword_template, make sure exactly one and it is the right type
          headword_templates = [temp for temp in parsed.filter_templates()
              if temp.name in ["ar-noun", "ar-proper noun", "ar-coll-noun",
                "ar-sing-noun", "ar-noun-pl", "ar-noun-dual", "ar-adj-fem",
                "ar-adj-pl", "ar-noun-inf-cons", "ar-adj-inf-def",
                "ar-adj-dual", "ar-adj", "ar-nisba", "ar-noun-nisba",
                "ar-adj-sound", "ar-adj-in", "ar-adj-an"]]
          if len(headword_templates) == 0:
            pagemsg("WARNING: Can't find headword template in text, skipping")
            continue
          if len(headword_templates) > 1:
            pagemsg("WARNING: Found multiple headword templates in text, skipping")
            continue
          headword_template = headword_templates[0]
          if headword_template.name != tempname:
            pagemsg("Headword template should be '%s' but is '%s', skipping" %
                (tempname, headword_template.name))
            continue
          def getp(param):
            # Fetch a parameter from the headword template.
            return getparam(headword_template, param)
          # NOTE: We physically add and remove parameters from the headword
          # template to get the list of parameters to use in creating the
          # declension template. These changes don't get propagated to the
          # headword template because we don't convert the parsed text back
          # to a string.
          def putp(param, value):
            addparam(headword_template, param, value)
          head = getp("1")
          orighead = head

          # Check for declension already present
          # (either immediately following, or following a Usage section).
          if (j + 1 < len(subsections) and
              re.match("^===+Declension===+\n", subsections[j + 1])
              or j + 3 < len(subsections) and
              re.match("^===+Usage", subsections[j + 1]) and
              re.match("^===+Declension===+\n", subsections[j + 3])
              ):
            pagemsg("Declension already found for head %s, skipping" % head)
            continue

          # Check for cpl
          # FIXME: Convert cpl into pl and fpl
          if getp("cpl"):
            pagemsg("WARNING: Headword template for head %s has cpl param in it, skipping" % (head))
            continue

          # Check for empty head. If w/o explicit translit, skip; else,
          # fetch head from page title.
          if not head:
            if not getp("tr"):
              pagemsg("WARNING: Headword template head is empty and without explicit translit, skipping")
              continue
            else:
              pagemsg("Headword template head is empty but has explicit translit")
              add_note("empty head, using page name")
            head = pagename
            putp("1", head)

          # Try to handle cases with a modifier; we can't handle all of them yet
          headspace = False
          if ' ' in head:
            headspace = True
            words = re.split(r"\s", remove_links(head))
            head = words[0]
            if len(words) > 2:
              pagemsg("WARNING: Headword template head %s has two or more spaces in it, skipping" % orighead)
              continue
            assert(len(words) == 2)

            # Check for params we don't yet know how to handle
            must_continue = False
            for badparam in ["pl2", "pltr", "head2", "sing", "coll"]:
              if getp(badparam):
                # FIXME
                pagemsg("WARNING: Headword template head %s has space in it and param %s, skipping" % (orighead, badparam))
                must_continue = True
                break
            if must_continue:
              continue

            # Now check for various types of construction, all either
            # construct (ʾidāfa) or adjectival

            # Strip a case-ending (i3rab) diacritic from WORD, logging and
            # noting the removal; also converts a leading alif wasla to a
            # plain alif. UNDIA/UDIA are the nunated and short-vowel
            # endings respectively; *TEXT are their log labels.
            def remove_nom_gen_i3rab(word, nomgen, undia, undiatext, udia, udiatext):
              if word.endswith(undia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, undiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, undiatext))
                return re.sub(undia + "$", "", word)
              if word.endswith(udia):
                pagemsg("Removing %s i3rab (%s) from %s" % (nomgen, udiatext, word))
                add_note("removing %s i3rab (%s)" % (nomgen, udiatext))
                return re.sub(udia + "$", "", word)
              if re.search(DIACRITIC_ANY_BUT_SH + "$", word):
                pagemsg("WARNING: Strange diacritic at end of %s %s" % (nomgen, word))
              if word[0] == ALIF_WASLA:
                pagemsg("Changing %s alif wasla to plain alif for %s" % (nomgen, word))
                add_note("changing %s alif wasla to plain alif" % (nomgen))
                word = ALIF + word[1:]
              return word

            def remove_gen_i3rab(word):
              return remove_nom_gen_i3rab(word, "genitive", IN, "IN", I, "I")

            def remove_nom_i3rab(word):
              return remove_nom_gen_i3rab(word, "nominative", UN, "UN", U, "U")

            def remove_gen_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "genitive", "in", "in", "i", "i")

            def remove_nom_i3rab_tr(word):
              return remove_nom_gen_i3rab(word, "nominative", "un", "un", "u", "u")

            idafa = False
            word0al = starts_with_al(words[0])
            word1al = starts_with_al(words[1])
            words[0] = remove_al(words[0])
            words[1] = remove_al(words[1])
            putp("1", words[0])
            putp("mod", words[1])
            if word0al and word1al:
              pagemsg("Headword template head %s has space in it and found definite adjective construction" % (orighead))
              add_note("modifier definite adjective construction")
              putp("state", "def")
            elif word0al and not word1al:
              pagemsg("WARNING: Headword template head %s has space in it and found al-X + Y construction, can't handle, skipping" % (orighead))
              continue
            elif is_proper:
              if words[0].endswith(ALIF) and word1al:
                pagemsg("Proper noun headword template head %s has space in it and found ind-def with definite adjectival modifier" % (orighead))
                add_note("modifier proper noun + definite adjective construction")
                putp("state", "ind-def")
              elif remove_diacritics(words[0]) == u"جمهورية":
                # "republic of ..." names are treated as idafa.
                if word1al:
                  pagemsg("Proper noun headword template head %s has space in it and found definite idafa" % (orighead))
                  add_note("modifier definite idafa construction")
                  idafa = True
                  assert sgnum == "sg"
                  idafaval = "def"
                  putp("idafa", idafaval)
                elif words[1].endswith(ALIF):
                  pagemsg("Proper noun headword template head %s has space in it and found idafa with ind-def modifier" % (orighead))
                  add_note("modifier proper-noun ind-def idafa construction")
                  assert sgnum == "sg"
                  idafaval = "ind-def"
                  putp("idafa", idafaval)
                else:
                  pagemsg("WARNING: Proper noun headword template head %s has space in it and found idafa construction we can't handle, skipping" % (orighead))
                  continue
              else:
                pagemsg("WARNING: Proper noun headword template head %s has space in it and can't determine whether idafa, skipping" % (orighead))
                continue
            elif not word0al and word1al:
              # Found an ʾidāfa construction
              pagemsg("Headword template head %s has space in it and found definite idafa" % (orighead))
              add_note("modifier definite idafa construction")
              idafa = True
              idafaval = "def-" + sgnum
              if idafaval == "def-sg":
                idafaval = "def"
              putp("idafa", idafaval)
            elif words[1].endswith(I + Y):
              pagemsg("WARNING: Headword template head %s has space in it and appears to end in badly formatted nisba, FIXME, skipping" % (orighead))
              continue
            elif words[1].endswith(I + Y + SH):
              pagemsg("Headword template head %s has space in it and found indefinite adjective nisba construction" % (orighead))
              add_note("modifier indefinite nisba adjective construction")
            elif pagename in adjectival_phrases:
              pagemsg("Headword template head %s has space in it, indefinite, and manually specified to be adjectival" % (orighead))
              add_note("modifier indefinite adjective construction")
            else:
              pagemsg("Headword template head %s has space in it, indefinite, and not specified to be adjectival, assuming idafa" % (orighead))
              add_note("modifier indefinite idafa construction")
              idafa = True
              putp("idafa", sgnum)

            # Now remove any i3rab diacritics
            putp("1", remove_nom_i3rab(getp("1")))
            if idafa:
              # Idafa modifier is in the genitive.
              putp("mod", remove_gen_i3rab(getp("mod")))
            else:
              putp("mod", remove_nom_i3rab(getp("mod")))

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s has space in it and is plural" % (orighead))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has space in it and manual translit and is plural, skipping" % (orighead))
                continue
              putp("pl", getp("1"))
              putp("1", "-")
              if not idafa:
                putp("modpl", getp("mod"))
                putp("mod", "-")

            # Now check if lemma has plural specified
            elif getp("pl"):
              pls = re.split(r"\s", remove_links(getp("pl")))
              assert(len(pls) == 2)
              pls[0] = remove_al(pls[0])
              pls[1] = remove_al(pls[1])
              putp("pl", remove_nom_i3rab(pls[0]))
              if not idafa:
                putp("modpl", remove_nom_i3rab(pls[1]))
              else:
                # In an idafa the modifier doesn't pluralize; sanity-check it.
                if pls[1] != getp("mod"):
                  pagemsg("FIXME: Headword template head %s, plural modifier %s not same as singular modifier %s in idafa construction" % (orighead, pls[1], getp("mod")))

            # Now check if there's manual translit. We need to split the
            # manual translit in two and pair up manual translit with
            # corresponding Arabic words. But first remove -t indicating
            # construct state, and check to see if manual translit is
            # same as auto translit, in which case it's unnecessary.
            if getp("tr"):
              pagemsg("Headword template head %s has space in it and manual translit" % (orighead))
              trwords = re.split(r"\s", getp("tr"))
              assert(len(trwords) == 2)
              trwords[0] = remove_nom_i3rab_tr(remove_al_tr(trwords[0]))
              if idafa:
                trwords[1] = remove_gen_i3rab_tr(remove_al_tr(trwords[1]))
              else:
                trwords[1] = remove_nom_i3rab_tr(remove_al_tr(trwords[1]))
              # Remove any extraneous -t from translit, either from construct
              # state of from removal of i3rab in a feminine noun/adj.
              for i in [0, 1]:
                if words[i].endswith(TAM) and trwords[i].endswith("t"):
                  trwords[i] = trwords[i][0:-1]
                if words[i].endswith(ALIF + TAM) and not trwords[i].endswith("h"):
                  trwords[i] += "h"
              if ar_translit.tr(words[0]) != trwords[0]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/manual translit")
                # Keep the manual translit as a /-annotation on the head.
                putp("1", "%s/%s" % (getp("1"), trwords[0]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[0], words[0]))
                add_note("modified head w/ignored manual translit")
              if ar_translit.tr(words[1]) != trwords[1]:
                pagemsg("Headword template head %s has space in it and manual translit %s which is different from auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/manual translit")
                putp("mod", "%s/%s" % (getp("mod"), trwords[1]))
              else:
                pagemsg("Headword template head %s has space in it and manual translit %s which is same as auto-translit of %s" % (orighead, trwords[1], words[1]))
                add_note("modifier w/ignored manual translit")

          else:
            # no space in head, not dealing with a modifier

            # If has link in it, just remove it
            if '[' in head or ']' in head or '|' in head:
              pagemsg("Headword template head %s has link in it" % (head))
              add_note("removed links from head")
              head = remove_links(head)
              putp("1", head)

            # If starts with definite article, remove article from everything,
            # including transliterations, and set state=def
            if starts_with_al(head):
              pagemsg("Headword template head %s starts with definite article" % (head))
              add_note("definite lemma")
              head = remove_al(head)
              putp("1", head)
              putp("state", "def")

              # Also remove al- from remaining head and pl params
              # NOTE(review): `param = remove_links(param)` strips links
              # from the parameter NAME, not its value — presumably the
              # value-stripping below is what was intended; verify.
              def check_for_al(param):
                param = remove_links(param)
                value = getparam(headword_template, param)
                if value:
                  if '[' in value or ']' in value or '|' in value:
                    pagemsg("Param %s value %s has link in it" % (param, value))
                    add_note("removed links from %s" % param)
                    value = remove_links(value)
                  putp(param, remove_al(value))
              params_to_check = ["pl", "sing", "coll", "pauc", "f", "fpl"]
              for param in params_to_check:
                check_for_al(param)
              for i in xrange(2, 10):
                check_for_al("head%s" % i)
                for param in params_to_check:
                  check_for_al("%s%s" % (param, i))

              # Also remove al- from transliteration
              # NOTE(review): check_for_al_tr is defined but never called;
              # the calls below go through check_for_al, which applies the
              # Arabic-script remove_al() to translit values — possibly
              # check_for_al_tr was intended. Preserved as-is.
              def check_for_al_tr(param):
                value = getparam(headword_template, param)
                if value:
                  putp(param, remove_al_tr(value))
              check_for_al("tr")
              for param in params_to_check:
                check_for_al("%str" % param)
              for i in xrange(2, 10):
                check_for_al("tr%s" % i)
                for param in params_to_check:
                  check_for_al("%s%str" % (param, i))
            elif is_proper:
              if head.endswith(ALIF):
                pagemsg(u"Headword template head %s ends in -ā" % (head))
                putp("state", "ind-def")
              else:
                pagemsg(u"WARNING: Headword template head %s is indefinite proper noun, not ending in -ā, skipping" % (head))
                continue

            if head.endswith(UN):
              pagemsg("Headword template head %s ends with explicit i3rab (UN)" % (head))
              add_note("head has explicit i3rab (UN)")
              # We don't continue here because we handle this case below
            elif head.endswith(U):
              pagemsg("Headword template head %s ends with explicit i3rab (U)" % (head))
              add_note("head has explicit i3rab (U)")
              # We don't continue here because we don't need to handle this case

            # Now check if the lemma is plural
            if re.match(r"\bp\b", getp("2")):
              pagemsg("Headword template head %s is plural" % (head))
              add_note("plural lemma")
              if getp("tr"):
                # FIXME (doesn't occur though)
                pagemsg("WARNING: Headword template head %s has manual translit and is plural, skipping" % (head))
                continue
              putp("pl", getp("1"))
              putp("1", "-")

          # Now fetch the parameters from the headword template, removing
          # any that we want to remove, removing the i3rab -UN ending, and
          # adding any specified manual translit as a / annotation.

          # True if this headword-template parameter should not be copied
          # into the declension template.
          def param_should_be_removed(param):
            name = unicode(param.name)
            if name == "sc" and unicode(param.value) == "Arab":
              return True
            if name.endswith("tr"):
              return True
            for remove in removeparams:
              if name == remove:
                return True
              # A purely alphabetic removeparam also removes its numbered
              # variants (e.g. "pl" removes "pl2", "pl3", ...).
              if re.match("^[a-z]+$", remove) and re.match("^%s([0-9]+)?$" % remove, name):
                return True
            return False

          # Strip a trailing -UN i3rab ending from the full param text.
          def remove_i3rab(param):
            text = unicode(param)
            if text.endswith(UN):
              pgmsg("Removing i3rab from %s: %s" % (text, unicode(headword_template)))
              add_note("removing i3rab")
            return re.sub(UN + "$", "", text)

          # Map a param name to its corresponding translit param name.
          def trparam(name):
            if name == "1":
              return "tr"
            elif name.startswith("head"):
              return name.replace("head", "tr")
            else:
              return name + "tr"

          # Render one headword param for inclusion in the declension
          # template, normalizing special +/- values and attaching translit.
          def process_param(param):
            arabic = remove_i3rab(param)
            # Value of + is used in ar-nisba, ar-noun-nisba, ar-adj-in
            # to signal the strong plural.
            if arabic.endswith("=+"):
              newarabic = re.sub(r"=\+$", "=sp", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
              arabic = newarabic
            # Value of - is used in ar-adj-in to signal an unknown
            # feminine plural.
            if arabic.endswith("=-"):
              newarabic = re.sub(r"=-$", "=?", arabic)
              pgmsg("Converting %s to %s: %s" % (arabic, newarabic, unicode(headword_template)))
              arabic = newarabic
            # Don't process translit in modifier constructions, where the
            # translit is also processed.
            if not headspace:
              tr = getparam(headword_template, trparam(unicode(param.name)))
              if tr:
                return arabic + "/" + tr
            return arabic

          params = '|'.join([process_param(param) for param in headword_template.params
              if not param_should_be_removed(param)])
          # For templates that automatically supply the masculine plural,
          # supply it here, too if not overridden.
          if tempname in ["ar-nisba", "ar-noun-nisba", "ar-adj-sound", "ar-adj-an"] and not getp("pl"):
            params += '|pl=sp'

          # Separate off any [[Category: Foo]] declarators, insert before them
          m = re.match(r"^(.*?\n+)((\[\[[A-Za-z0-9_\-]+:[^\]]+\]\]\n*)*)$",
              subsections[j], re.S)
          if m:
            body = m.group(1)
            tail = m.group(2)
          else:
            body = subsections[j]
            tail = ""
          # Make sure there are two trailing newlines
          if body.endswith("\n\n"):
            pass
          elif body.endswith("\n"):
            body += "\n"
          else:
            body += "\n\n"
          # Reuse the POS header line with POS swapped for "=Declension="
          # (yields a one-level-deeper ===...Declension...=== header).
          body += (subsections[j - 1].replace(pos, "=Declension=") +
              "{{%s|%s}}\n\n" % (decltempname, params))
          subsections[j] = body + tail
          comment = "added declension for %s %s" % (tempname,
              remove_links(orighead) or "%s/%s" % (pagename, getp("tr")))
          note = ', '.join(notes)
          if note:
            comment = "%s (%s)" % (comment, note)
          comments.append(comment)
          sections[seci] = ''.join(subsections) + sectail
  newtext = pagehead + ''.join(sections) + pagetail
  comment = '; '.join(comments)
  # Text changed if and only if at least one comment was recorded.
  assert((not comment) == (newtext == page.text))
  if newtext != page.text:
    if verbose:
      msg("Replacing [[%s]] with [[%s]]" % (page.text, newtext))
    page.text = newtext
    msg("For page %s, comment = %s" % (pagename, comment))
    if save:
      page.save(comment = comment)
def process_page(index, page, save, verbose):
  """Replace manual past-passive-participle params in {{ru-conj}} with a +p variant.

  For each {{ru-conj}} template on PAGE that specifies manual PPP forms
  (past_pasv_part*/ppp*), tries the candidate variant suffixes ("+p", and
  where applicable "+pё"/"+pжд") by expanding {{ru-generate-verb-forms}};
  if the auto-generated PPPs exactly match the manual ones, the manual
  params are dropped and the variant suffix is appended to the
  conjugation-type argument (param 2). If SAVE is true the page is saved,
  otherwise the would-be change is only logged. INDEX is used in log
  messages.

  NOTE(review): depends on module-level helpers (msg, blib, rulib,
  getparam, rmparam) defined elsewhere in this file.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def expand_text(tempcall):
    # Expand a template call server-side in the context of this page.
    return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

  pagemsg("Processing")

  manual_ppp_forms = ["past_pasv_part", "past_pasv_part2", "past_pasv_part3",
    "past_pasv_part4", "ppp", "ppp2", "ppp3", "ppp4"]
  text = unicode(page.text)
  parsed = blib.parse(page)
  notes = []
  for t in parsed.filter_templates():
    origt = unicode(t)
    tname = unicode(t.name)
    if tname == "ru-conj":
      # Collect the manually specified PPPs ("-" means explicitly absent).
      manual_ppps = []
      for form in manual_ppp_forms:
        ppp = getparam(t, form)
        if ppp and ppp != "-":
          manual_ppps.append(ppp)
      if not manual_ppps:
        continue
      # Multi-argument conjugations (joined with "or") aren't handled.
      if [x for x in t.params if unicode(x.value) == "or"]:
        pagemsg("WARNING: Skipping multi-arg conjugation: %s" % unicode(t))
        continue
      curvariant = getparam(t, "2")
      if "+p" in curvariant or "(7)" in curvariant or "(8)" in curvariant:
        pagemsg("WARNING: Found both manual PPP and PPP variant, something wrong: %s" % unicode(t))
        continue
      # Work on a scratch copy of the template so trial variants don't
      # touch the real page text.
      t2 = blib.parse_text(unicode(t)).filter_templates()[0]
      for form in manual_ppp_forms:
        rmparam(t2, form)
      variants_to_try = ["+p"]
      # If the stem (suffix ённый removed) contains ё, also try the
      # ё-preserving variant; likewise the жд-alternation variant.
      if u"ё" in re.sub(u"ённый$", "", manual_ppps[0]):
        variants_to_try.append(u"+pё")
      if u"жденный" in manual_ppps[0] or u"ждённый" in manual_ppps[0]:
        variants_to_try.append(u"+pжд")
      notsamemsgs = []
      for variant in variants_to_try:
        t2.add("2", curvariant + variant)
        # Expand the generator twin of {{ru-conj}} to get all forms.
        tempcall = re.sub(r"\{\{ru-conj", "{{ru-generate-verb-forms", unicode(t2))
        result = expand_text(tempcall)
        if not result:
          pagemsg("WARNING: Error generating forms, skipping")
          continue
        args = rulib.split_generate_args(result)
        if "past_pasv_part" not in args:
          pagemsg("WARNING: Something wrong, no past passive participle generated: %s" % unicode(t))
          continue
        auto_ppps = []
        for form in manual_ppp_forms:
          if form in args:
            for ppp in re.split(",", args[form]):
              if ppp and ppp != "-":
                auto_ppps.append(ppp)
        if manual_ppps == auto_ppps:
          # Variant reproduces the manual PPPs exactly; switch the real
          # template over to it.
          pagemsg("Manual PPP's %s same as auto-generated PPP's, switching to auto" % ",".join(manual_ppps))
          for form in manual_ppp_forms:
            rmparam(t, form)
          t.add("2", curvariant + variant)
          notes.append("replaced manual PPP's with variant %s" % variant)
          break
        else:
          notsamemsgs.append("WARNING: Manual PPP's %s not same as auto-generated PPP's %s: %s" %
              (",".join(manual_ppps), ",".join(auto_ppps), unicode(t)))
      else: # no break in for loop
        # No variant matched; emit the accumulated mismatch warnings.
        for m in notsamemsgs:
          pagemsg(m)

    newt = unicode(t)
    if origt != newt:
      pagemsg("Replaced %s with %s" % (origt, newt))

  new_text = unicode(parsed)
  if new_text != text:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, new_text))
    # Any textual change must be accompanied by at least one note.
    assert notes
    comment = "; ".join(notes)
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = new_text
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)
def process_page(index, page, save, verbose):
  """Convert legacy reference-*/\*-usenet citation templates on PAGE.

  Inside ==References== sections (and <ref> tags), reference-journal/
  reference-news become cite-journal and reference-book becomes cite-book;
  elsewhere they become quote-journal/quote-book, and cite-usenet/
  quote-usenet become quote-newsgroup. Template parameters are normalized
  along the way (page/pages, ISBN, origyear/year, usenet params, etc.).
  If SAVE is true the page is saved; otherwise the change is only logged.
  INDEX is used in log messages.

  NOTE(review): depends on module-level names (msg, blib, mw, getparam,
  rmparam, set_template_name, simple_replace) defined elsewhere in this
  file.
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")

  if not page.exists():
    pagemsg("WARNING: Page doesn't exist")
    return

  # Only process main-namespace pages and a whitelist of colon-bearing
  # namespaces.
  if ":" in pagetitle and not re.search(
      "^(Citations|Appendix|Reconstruction|Transwiki|Talk|Wiktionary|[A-Za-z]+ talk):", pagetitle):
    pagemsg("WARNING: Colon in page title and not a recognized namespace to include, skipping page")
    return

  text = unicode(page.text)
  notes = []

  # Even-indexed entries are section bodies; odd-indexed are the ==...==
  # header lines captured by the split.
  subsections = re.split("(^==.*==\n)", text, 0, re.M)

  newtext = text

  # Rename param FR to TO on template T, optionally transforming the value
  # through FROB_FROM (returning falsy from FROB_FROM aborts the move).
  # Blank source params are simply deleted.
  def move_param(t, fr, to, frob_from=None):
    if t.has(fr):
      oldval = getparam(t, fr)
      if not oldval.strip():
        rmparam(t, fr)
        pagemsg("Removing blank param %s" % fr)
        return
      if frob_from:
        newval = frob_from(oldval)
        if not newval or not newval.strip():
          return
      else:
        newval = oldval
      if getparam(t, to).strip():
        pagemsg("WARNING: Would replace %s= -> %s= but %s= is already present: %s" %
            (fr, to, to, unicode(t)))
      elif oldval != newval:
        rmparam(t, to) # in case of blank param
        # If either old or new name is a number, use remove/add to automatically set the
        # showkey value properly; else it's safe to just change the name of the param,
        # which will preserve its location.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          tfr = t.get(fr)
          tfr.name = to
          tfr.value = newval
        pagemsg("%s=%s -> %s=%s" % (fr, oldval.replace("\n", r"\n"), to,
          newval.replace("\n", r"\n")))
      else:
        rmparam(t, to) # in case of blank param
        # See comment above.
        if re.search("^[0-9]+$", fr) or re.search("^[0-9]+$", to):
          rmparam(t, fr)
          t.add(to, newval)
        else:
          t.get(fr).name = to
        pagemsg("%s -> %s" % (fr, to))

  # Normalize page=/pages=: strip leading "p."/"pp.", and swap the two
  # params when the value shape (single number vs. number-dash) indicates
  # the wrong one was used. Returns True if T changed.
  def fix_page_params(t):
    origt = unicode(t)
    for param in ["page", "pages"]:
      pageval = getparam(t, param)
      if re.search(r"^\s*pp?\.\s*", pageval):
        pageval = re.sub(r"^(\s*)pp?\.\s*", r"\1", pageval)
        t.add(param, pageval)
        notes.append("remove p(p). from %s=" % param)
        pagemsg("remove p(p). from %s=" % param)
    if re.search(r"^[0-9]+$", getparam(t, "pages").strip()):
      move_param(t, "pages", "page")
    if re.search(r"^[0-9]+[-–—]$", getparam(t, "page").strip()):
      move_param(t, "page", "pages")
    return origt != unicode(t)

  # Normalize book-cite params: origyear/year -> year/year_published,
  # origdate/origmonth renames, id= -> isbn= (with prefix stripping), and
  # page params. Returns True if T changed.
  def fix_cite_book_params(t):
    origt = unicode(t)
    if getparam(t, "origyear").strip() and getparam(t, "year").strip():
      if getparam(t, "year_published"):
        pagemsg("WARNING: Would set year_published= but is already present: %s" % unicode(t))
      else:
        rmparam(t, "year_published") # in case of blank param
        t.get("year").name = "year_published"
        t.get("origyear").name = "year"
        pagemsg("year -> year_published, origyear -> year")
    move_param(t, "origdate", "date")
    move_param(t, "origmonth", "month")
    # Strip an "ISBN"-style prefix from an id= value; return None (abort
    # the move) if the value doesn't look like an ISBN at all.
    def frob_isbn(idval):
      isbn_re = r"^(\s*)(10-ISBN +|ISBN-13 +|ISBN:? +|ISBN[-=] *)"
      if re.search(isbn_re, idval, re.I):
        return re.sub(isbn_re, r"\1", idval, 0, re.I)
      elif re.search(r"^[0-9]", idval.strip()):
        return idval
      else:
        pagemsg("WARNING: Would replace id= -> isbn= but id=%s doesn't begin with 'ISBN '" % idval.replace("\n", r"\n"))
        return None
    move_param(t, "id", "isbn", frob_isbn)
    fix_page_params(t)
    return origt != unicode(t)

  # Normalize cite-usenet params. Returns True if T changed.
  def fix_cite_usenet_params(t):
    origt = unicode(t)
    move_param(t, "group", "newsgroup")
    move_param(t, "link", "url")
    return origt != unicode(t)

  # Normalize quote-usenet params, merging monthday/year into date= and
  # mapping the legacy positional params. Returns True if T changed.
  def fix_quote_usenet_params(t):
    origt = unicode(t)
    monthday = getparam(t, "monthday").strip()
    year = getparam(t, "year").strip()
    if monthday and year:
      if getparam(t, "date"):
        pagemsg("WARNING: Would set date= but is already present: %s" % unicode(t))
      else:
        rmparam(t, "date") # in case of blank param
        param = t.get("monthday")
        param.name = "date"
        if re.search("^[0-9]+/[0-9]+$", monthday):
          param.value = "%s/%s" % (monthday, year)
        else:
          param.value = "%s %s" % (monthday, year)
        rmparam(t, "year")
        pagemsg("monthday/year -> date")
    move_param(t, "group", "newsgroup")
    move_param(t, "text", "passage")
    move_param(t, "6", "passage")
    move_param(t, "5", "url")
    move_param(t, "4", "newsgroup")
    move_param(t, "3", "title")
    move_param(t, "2", "author")
    move_param(t, "1", "date")
    return origt != unicode(t)

  # Inside reference context (References section or <ref> tag), turn
  # reference-journal/news into cite-journal and reference-book into
  # cite-book. IN_WHAT is only used for log messages.
  def replace_in_reference(parsed, in_what):
    for t in parsed.filter_templates():
      tname = unicode(t.name)
      origt = unicode(t)
      if tname.strip() in ["reference-journal", "reference-news"]:
        set_template_name(t, "cite-journal", tname)
        pagemsg("%s -> cite-journal" % tname.strip())
        notes.append("%s -> cite-journal" % tname.strip())
        fix_page_params(t)
        pagemsg("Replacing %s with %s in %s" % (origt, unicode(t), in_what))
      if tname.strip() == "reference-book":
        set_template_name(t, "cite-book", tname)
        pagemsg("reference-book -> cite-book")
        fixed_params = fix_cite_book_params(t)
        notes.append("reference-book -> cite-book%s" % (
          fixed_params and " and fix book cite params" or ""))
        pagemsg("Replacing %s with %s in %s" % (origt, unicode(t), in_what))

  # Walk section bodies (even indexes); subsections[j-1] is the header.
  for j in xrange(0, len(subsections), 2):
    parsed = blib.parse_text(subsections[j])
    if j > 0 and re.search(r"^===*References===*\n", subsections[j-1]):
      replace_in_reference(parsed, "==References== section")
      subsections[j] = unicode(parsed)
    else:
      # Outside References sections, templates inside <ref>...</ref> tags
      # still count as reference context.
      for t in parsed.filter_tags():
        if unicode(t.tag) == "ref":
          tagparsed = mw.wikicode.Wikicode([t])
          replace_in_reference(tagparsed, "<ref>")
          subsections[j] = unicode(parsed)
      need_to_replace_double_quote_prefixes = False
      for t in parsed.filter_templates():
        tname = unicode(t.name)
        origt = unicode(t)
        for fr, to in simple_replace:
          if tname.strip() == fr:
            set_template_name(t, to, tname)
            pagemsg("%s -> %s" % (fr, to))
            notes.append("%s -> %s" % (fr, to))
            fix_page_params(t)
            pagemsg("Replacing %s with %s" % (origt, unicode(t)))
        if tname.strip() in ["reference-journal", "reference-news"]:
          set_template_name(t, "quote-journal", tname)
          pagemsg("%s -> quote-journal" % tname.strip())
          notes.append("%s -> quote-journal" % tname.strip())
          fix_page_params(t)
          pagemsg("Replacing %s with %s outside of reference section" % (origt, unicode(t)))
        if tname.strip() == "reference-book":
          set_template_name(t, "quote-book", tname)
          # NOTE(review): log/notes text says "cite-book" although the
          # template is renamed to quote-book here — looks like a
          # copy-paste of the reference-section branch; confirm intent
          # before changing the recorded messages.
          pagemsg("reference-book -> cite-book")
          fixed_params = fix_cite_book_params(t)
          notes.append("reference-book -> cite-book%s" % (
            fixed_params and " and fix book cite params" or ""))
          pagemsg("Replacing %s with %s outside of reference section" % (origt, unicode(t)))
        if tname.strip() in ["cite-usenet", "quote-usenet"]:
          if tname.strip() == "cite-usenet":
            fixed_params = fix_cite_usenet_params(t)
          else:
            fixed_params = fix_quote_usenet_params(t)
          set_template_name(t, "quote-newsgroup", tname)
          pagemsg("%s -> quote-newsgroup" % tname.strip())
          # A prefix= of "#" or "#*" becomes literal "#* " text before the
          # template; other values are left alone with a warning.
          prefix = getparam(t, "prefix").strip()
          removed_prefix = False
          if prefix:
            if prefix in ["#", "#*"]:
              parsed.insert_before(t, "#* ")
              rmparam(t, "prefix")
              pagemsg("remove prefix=%s, insert #* before template" % prefix)
              need_to_replace_double_quote_prefixes = True
              removed_prefix = True
            else:
              pagemsg("WARNING: Found prefix=%s, not # or #*: %s" % (prefix, unicode(t)))
          notes.append("%s -> quote-newsgroup%s%s" % (tname.strip(),
            removed_prefix and ", remove prefix=%s, insert #* before template" % prefix or "",
            fixed_params and ", fix params" or ""))
          pagemsg("Replacing %s with %s" % (origt, unicode(t)))
      subsections[j] = unicode(parsed)
      # Inserting "#* " before a template that already had one on the line
      # produces "#* #* "; collapse those back down.
      if need_to_replace_double_quote_prefixes:
        newval = re.sub("^#\* #\* ", "#* ", subsections[j], 0, re.M)
        if newval != subsections[j]:
          notes.append("remove double #* prefix")
          pagemsg("Removed double #* prefix")
          subsections[j] = newval

  newtext = "".join(subsections)

  if text != newtext:
    if verbose:
      pagemsg("Replacing <%s> with <%s>" % (text, newtext))
    # Any textual change must be accompanied by at least one note.
    assert notes
    comment = "; ".join(blib.group_notes(notes))
    if save:
      pagemsg("Saving with comment = %s" % comment)
      page.text = newtext
      page.save(comment=comment)
    else:
      pagemsg("Would save with comment = %s" % comment)