def process_page(index, page, save, verbose): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] for t in parsed.filter_templates(): origt = unicode(t) if unicode(t.name) == "ru-adj": comps = blib.fetch_param_chain(t, "2", "comp") newcomps = [] for comp in comps: if re.search(u"е́?й$", comp): regcomp = re.sub(u"(е́?)й$", ur"\1е", comp) if regcomp in newcomps: pagemsg("Skipping informal form %s" % comp) notes.append("remove informal comparative %s" % comp) else: pagemsg("WARNING: Found informal form %s without corresponding regular form") newcomps.append(comp) else: newcomps.append(comp) if comps != newcomps: blib.set_param_chain(t, newcomps, "2", "comp") newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) new_text = unicode(parsed) if new_text != text: if verbose: pagemsg("Replacing <%s> with <%s>" % (text, new_text)) assert notes comment = "; ".join(notes) if save: pagemsg("Saving with comment = %s" % comment) page.text = new_text page.save(comment=comment) else: pagemsg("Would save with comment = %s" % comment)
def process_page(page, index, parsed):
    """Rewrite old {{#invoke:form of...}} calls to their newer spellings.

    Handles three cases on each template of `parsed`:
      * {{#invoke:form of/templates|template_tags}} gets 1=tagged_form_of_t.
      * {{#invoke:form of|form_of_t}} / {{#invoke:form of|alt_form_of_t}}
        are renamed to {{#invoke:form of/templates|...}}, with any
        ignorelist= chain folded into ignore= as "<value>:list".
      * alt_form_of_t additionally moves text= to 2=, inverts nocap=/nodot=
        into withcap=/withdot=, and becomes form_of_t.

    Returns:
      (new page text, list of change notes).
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def invert_flag(templ, old_flag, new_flag):
        # Presence of the old negative flag means "off"; its absence
        # becomes an explicit positive flag.
        if templ.has(old_flag):
            rmparam(templ, old_flag)
        else:
            templ.add(new_flag, "1")

    pagemsg("Processing")
    notes = []
    for t in parsed.filter_templates():
        before = unicode(t)
        tn = tname(t)

        if tn == "#invoke:form of/templates":
            if getparam(t, "1") == "template_tags":
                t.add("1", "tagged_form_of_t")
                notes.append("Rewrite {{#invoke:form of/templates|template_tags}} with {{#invoke:form of/templates|tagged_form_of_t}}")
        elif tn == "#invoke:form of":
            func = getparam(t, "1")
            if func in ("form_of_t", "alt_form_of_t"):
                listed = blib.fetch_param_chain(t, "ignorelist", "ignorelist")
                if listed:
                    merged = blib.fetch_param_chain(t, "ignore", "ignore")
                    merged.extend([entry + ":list" for entry in listed])
                    blib.set_param_chain(t, merged, "ignore", "ignore",
                                         before="ignorelist")
                    blib.remove_param_chain(t, "ignorelist", "ignorelist")
                blib.set_template_name(t, "#invoke:form of/templates")
                notes.append("Rewrite {{#invoke:form of|%s}} with {{#invoke:form of/templates|form_of_t}}" % func)
            if func == "alt_form_of_t":
                t.add("2", getparam(t, "text"), before="text")
                rmparam(t, "text")
                invert_flag(t, "nocap", "withcap")
                invert_flag(t, "nodot", "withdot")
                t.add("1", "form_of_t")

        after = unicode(t)
        if after != before:
            pagemsg("Replaced <%s> with <%s>" % (before, after))

    return unicode(parsed), notes
def combine_verbs(m):
    """Regex-substitution callback merging two adjacent verb conjugations.

    Group 1 and group 3 of the match hold the two conjugation templates;
    group 2 holds any text between them. The second template's numbered
    params (minus its verb type) are appended to the first, separated by
    "or". The original matched text is returned unchanged whenever the
    merge cannot be done.
    """
    whole = m.group(0)
    first_src = m.group(1)
    second_src = m.group(3)
    between = m.group(2)

    if between:
        pagemsg("WARNING: Would combine verbs but found text '%s' needing to go into a note, skipping: %s and %s" %
                (between, first_src, second_src))
        return whole

    t1 = blib.parse_text(first_src).filter_templates()[0]
    t2 = blib.parse_text(second_src).filter_templates()[0]

    # Only purely positional templates can be merged.
    for templ in (t1, t2):
        for param in templ.params:
            if re.search("^[0-9]+$", unicode(param.name)) is None:
                pagemsg("Verb conjugation has non-numeric args, skipping: %s" %
                        unicode(templ))
                return whole

    merged = fetch_numbered_params(t1)
    merged.append("or")
    second_params = fetch_numbered_params(t2)
    if len(second_params) < 2:
        pagemsg("WARNING: Something wrong, no verb type in ru-conj: %s" %
                unicode(t2))
        return whole

    if getparam(t1, "1") != getparam(t2, "1"):
        pagemsg("WARNING: Can't combine verbs of different verb types: %s and %s" %
                (first_src, second_src))
        return whole

    # Skip the second template's verb type; it equals the first's.
    merged.extend(second_params[1:])
    blib.set_param_chain(t1, merged, "1", "")

    verb_types = (getparam(t1, "1"), getparam(t2, "1"))
    result = unicode(t1)
    pagemsg("Combining verb conjugations %s and %s" % verb_types)
    pagemsg("Replaced %s with %s" % (whole.replace("\n", r"\n"), result))
    notes.append("combined verb conjugations %s and %s" % verb_types)
    return result
def process_page(page, index, parsed): pagetitle = unicode(page.title()) def pagemsg(txt): msg("Page %s %s: %s" % (index, pagetitle, txt)) pagemsg("Processing") text = unicode(page.text) parsed = blib.parse(page) notes = [] for t in parsed.filter_templates(): origt = unicode(t) if unicode(t.name) == "ru-adj": comps = blib.fetch_param_chain(t, "2", "comp") newcomps = [] for comp in comps: if re.search(u"е́?й$", comp): regcomp = re.sub(u"(е́?)й$", ur"\1е", comp) if regcomp in newcomps: pagemsg("Skipping informal form %s" % comp) notes.append("remove informal comparative %s" % comp) else: pagemsg( "WARNING: Found informal form %s without corresponding regular form" ) newcomps.append(comp) else: newcomps.append(comp) if comps != newcomps: blib.set_param_chain(t, newcomps, "2", "comp") newt = unicode(t) if origt != newt: pagemsg("Replaced %s with %s" % (origt, newt)) return unicode(parsed), notes
def handle_multiform(firstparam, restparam, form, formtr, declparam=None):
    """Store a possibly multi-valued headword form as a param chain on `t`.

    Relies on names from the enclosing scope: `t`, `declt`, `origt`,
    `origdeclt`, `split_form`, `pagemsg`.

    Args:
      firstparam: Name of the first param of the chain to set.
      restparam: Prefix of the remaining params of the chain; also used to
        derive the transliteration param name.
      form: Raw headword form value (split with split_form), or empty.
      formtr: Manual transliteration for the form, or empty.
      declparam: Optional declension-template param to cross-check
        against; "-" stands for the literal form list ["-"].
    """
    if form:
        form = split_form(form)
    if declparam:
        if declparam == "-":
            declforms = ["-"]
        else:
            declforms = split_form(getparam(declt, declparam))
        if not form:
            # No headword form given: fall back to the declension's forms.
            form = declforms
        elif set(form) != set(declforms):
            pagemsg("WARNING: For %s=, headword form(s) %s disagree with decl form(s) %s: headword=%s, decl=%s" %
                    (restparam, ",".join(form), ",".join(declforms), origt,
                     origdeclt))
    if form:
        blib.set_param_chain(t, form, firstparam, restparam)
    if formtr:
        trparam = ("" if restparam == "head" else restparam) + "tr"
        if not form:
            # Report the actual param name; the literal string "trparam"
            # was being logged here instead of the variable's value.
            pagemsg("WARNING: Saw %s=%s but no %s=: %s" %
                    (trparam, formtr, restparam, origt))
        elif len(form) > 1:
            pagemsg("WARNING: Saw %s=%s and multiple %ss %s: %s" %
                    (trparam, formtr, restparam, ",".join(form), origt))
        t.add(trparam, formtr)
def handle_mf(mf, mf_full, make_mf):
    """Simplify explicit m=/f= (and mpl=/fpl=) params of an {{es-noun}}.

    Relies on names from the enclosing scope: `t`, `lemma`, `notes`,
    `pagemsg`, `all_specials`, `make_plural`.

    Args:
      mf: Param prefix, "m" or "f".
      mf_full: Human-readable name used in messages ("masculine" or
        "feminine").
      make_mf: Callable producing the default form from the lemma,
        optionally taking a special indicator as second argument.
    """
    mfs = blib.fetch_param_chain(t, mf, mf)
    mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
    if mfs and not any(x.startswith("+") for x in mfs):
        # Case 1: the explicit form is exactly the default form.
        defmf = make_mf(lemma)
        if set(mfs) == {defmf}:
            defpls = make_plural(defmf)
            ok = False
            if not mfpls or set(mfpls) == set(defpls):
                ok = True
            elif set(mfpls) < set(defpls):
                # Was "%pl=%s ...": "%p" is not a valid conversion and
                # raised ValueError as soon as this branch was reached.
                pagemsg(
                    "WARNING: %spl=%s subset of default=%s, allowing" %
                    (mf, ",".join(mfpls), ",".join(defpls)))
                ok = True
            if ok:
                notes.append(
                    "replace %s=%s with '+' in {{es-noun}}" %
                    (mf, ",".join(mfs)))
                blib.set_param_chain(t, ["+"], mf, mf)
                blib.remove_param_chain(t, mf + "pl", mf + "pl")
                return

        # Case 2: the explicit form matches what a special indicator yields.
        actual_special = None
        for special in all_specials:
            special_mf = make_mf(lemma, special)
            if special_mf is None:
                continue
            if mfs == [special_mf]:
                pagemsg("Found special=%s with special_mf=%s" %
                        (special, special_mf))
                actual_special = special
                break
        if actual_special:
            if not mfpls:
                pagemsg(
                    "WARNING: Explicit %s=%s matches special=%s but no %s plural" %
                    (mf, ",".join(mfs), actual_special, mf_full))
            else:
                special_mfpl = make_plural(special_mf, actual_special)
                if special_mfpl:
                    if len(special_mfpl) > 1 and set(mfpls) < set(
                            special_mfpl):
                        pagemsg(
                            "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing" %
                            (mf, ",".join(mfs), actual_special, mf,
                             ",".join(mfpls), mf, ",".join(special_mfpl)))
                    elif set(mfpls) == set(special_mfpl):
                        pagemsg(
                            "Found %s=%s and special=%s, %spls=%s matches special_%spl" %
                            (mf, ",".join(mfs), actual_special, mf,
                             ",".join(mfpls), mf))
                    else:
                        pagemsg(
                            "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s" %
                            (mf, ",".join(mfs), actual_special, mf,
                             ",".join(mfpls), mf, ",".join(special_mfpl)))
                        # Plurals disagree: fall through to case 3.
                        actual_special = None
                if actual_special:
                    notes.append(
                        "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural" %
                        (mf_full, ",".join(mfs), actual_special, mf_full))
                    blib.set_param_chain(t, ["+%s" % actual_special], mf, mf)
                    blib.remove_param_chain(t, mf + "pl", mf + "pl")

        # Case 3: replace individual default forms with '+' and try to
        # simplify the explicit plurals.
        if not actual_special:
            defmf = make_mf(lemma)
            mfs_with_def = ["+" if x == defmf else x for x in mfs]
            if mfs_with_def != mfs:
                notes.append(
                    "replace default %s %s with '+' in {{es-noun}}" %
                    (mf_full, defmf))
                blib.set_param_chain(t, mfs_with_def, mf, mf)
            if mfpls:
                defpl = [x for y in mfs for x in (make_plural(y) or [])]
                ok = False
                if set(defpl) == set(mfpls):
                    ok = True
                elif len(defpl) > 1 and set(mfpls) < set(defpl):
                    pagemsg(
                        "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing" %
                        (mf, ",".join(mfs), mf, ",".join(mfpls),
                         ",".join(defpl)))
                    ok = True
                if ok:
                    pagemsg(
                        "Found %s=%s, %spl=%s matches default pl" %
                        (mf, ",".join(mfs), mf, ",".join(mfpls)))
                    notes.append(
                        "remove redundant explicit %s plural %s in {{es-noun}}" %
                        (mf_full, ",".join(mfpls)))
                    blib.remove_param_chain(t, mf + "pl", mf + "pl")
                else:
                    for special in all_specials:
                        defpl = [
                            x for y in mfs
                            for x in (make_plural(y, special) or [])
                        ]
                        if set(defpl) == set(mfpls):
                            pagemsg(
                                "Found %s=%s, %spl=%s matches special=%s" %
                                (mf, ",".join(mfs), mf, ",".join(mfpls),
                                 special))
                            notes.append(
                                "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}" %
                                (mf_full, ",".join(mfpls), special))
                            blib.set_param_chain(
                                t, ["+%s" % special], mf + "pl", mf + "pl")
def process_page_section(index, page, section, verbose):
    """Synchronize a Russian noun headword template with its declension table.

    Looks in `section` for exactly one {{ru-noun-table}} and exactly one
    {{ru-noun+}} / {{ru-proper noun+}}, and rewrites the headword so its
    arguments mirror the declension template (keeping the headword's g=,
    m=, f= chains and notrcat=).

    Args:
      index: Page index, used only in log messages.
      page: Page object; must provide title() and exists().
      section: Wikitext of the section to process.
      verbose: If true, log the templates found; also passed to
        blib.expand_text.

    Returns:
      None if the section must be skipped, else a 5-tuple
      (new section text, ru_noun_table_cleaned, ru_noun_table_link_copied,
      ru_noun_changed, ru_proper_noun_changed) where the last four are
      0/1 flags.
    """
    pagetitle = unicode(page.title())
    subpagetitle = re.sub("^.*:", "", pagetitle)

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    noun_table_templates = []
    noun_old_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-decl-noun-see":
            pagemsg("Found ru-decl-noun-see, skipping")
            return None

    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-noun-table":
            noun_table_templates.append(t)
        if unicode(t.name) == "ru-noun-old":
            noun_old_templates.append(t)

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if len(noun_table_templates) < 1:
        if noun_old_templates:
            pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
                ", ".join(unicode(x) for x in noun_old_templates))
        # Nothing to do: return the section unchanged with all flags zero.
        return unicode(parsed), 0, 0, 0, 0

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun", "ru-proper noun"]:
            pagemsg("Found ru-noun or ru-proper noun, skipping")
            return None

    headword_templates = []

    for t in parsed.filter_templates():
        if unicode(t.name) in ["ru-noun+", "ru-proper noun+"]:
            headword_templates.append(t)

    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if len(headword_templates) < 1:
        return unicode(parsed), 0, 0, 0, 0

    noun_table_template = noun_table_templates[0]
    noun_old_template = noun_old_templates[0] if len(noun_old_templates) == 1 else None
    headword_template = headword_templates[0]
    decl_templates = [x for x in [noun_table_template, noun_old_template] if x]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))

    orig_headword_template = unicode(headword_template)
    orig_noun_table_template = unicode(noun_table_template)

    # Remember the headword-only params (g=, m=, f= chains and notrcat=)
    # and build a copy of the headword without them, for comparison with
    # the declension template's arguments.
    genders = blib.fetch_param_chain(headword_template, "g", "g")
    masculines = blib.fetch_param_chain(headword_template, "m", "m")
    feminines = blib.fetch_param_chain(headword_template, "f", "f")
    notrcat = getparam(headword_template, "notrcat")
    filtered_headword_params = []
    for param in headword_template.params:
        name = unicode(param.name)
        if re.search("^[gmf][0-9]*$", name) or name == "notrcat":
            pass
        else:
            filtered_headword_params.append((param.name, param.value))
    filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
    for name, value in filtered_headword_params:
        filtered_headword_template.add(name, value)

    ru_noun_table_cleaned = 0
    ru_noun_table_link_copied = 0
    ru_noun_changed = 0
    ru_proper_noun_changed = 0

    # Strip stray g=/m=/f= params from the declension template.
    new_decl_params = []
    for param in noun_table_template.params:
        name = unicode(param.name)
        if re.search("^[gmf][0-9]*$", name):
            pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
                unicode(noun_table_template))
        else:
            new_decl_params.append((param.name, param.value))
    del noun_table_template.params[:]
    for name, value in new_decl_params:
        noun_table_template.add(name, value)
    if orig_noun_table_template != unicode(noun_table_template):
        ru_noun_table_cleaned = 1

    # Working copy of the declension params; this is what the headword
    # will be rebuilt from, possibly with n= adjusted below.
    modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
    for param in noun_table_template.params:
        modified_noun_table_template.add(param.name, param.value)

    # For proper nouns, expand {{ru-generate-noun-args}} with the
    # declension's arguments to find the actual number, so that n= can be
    # adjusted for the headword.
    if unicode(headword_template.name) == "ru-proper noun+":
        generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
            unicode(noun_table_template))
        generate_result = expand_text(generate_template)
        if not generate_result:
            pagemsg("WARNING: Error generating noun args, skipping")
            return None
        args = ru.split_generate_args(generate_result)

        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
            pagemsg("Adding n=both to headword template")
            modified_noun_table_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            generate_template_with_ndef = generate_template.replace("}}", "|ndef=sg}}")
            generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "",
                generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = ru.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg("WARNING: Something wrong: Found n=%s, not singular" %
                        existing_n)
                pagemsg("Removing n=sg from headword template")
                rmparam(modified_noun_table_template, "n")
            else:
                pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
                    ndef_args["n"])

    # The headword the declension would produce; it is always spelled as
    # ru-noun+ here so it can be compared with filtered_headword_template.
    new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
        unicode(modified_noun_table_template))
    existing_filtered_headword_template = unicode(filtered_headword_template)
    change_existing_headword = False
    if existing_filtered_headword_template != new_headword_template:
        if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
            if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
                pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
                del noun_table_template.params[:]
                for param in filtered_headword_template.params:
                    noun_table_template.add(param.name, param.value)
                ru_noun_table_link_copied = 1
                ru_noun_table_cleaned = 0
            else:
                pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
                    % (existing_filtered_headword_template, new_headword_template))
                return None
        else:
            pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
                % (existing_filtered_headword_template, new_headword_template))
            change_existing_headword = True

    # `lemmas` comes from the enclosing/module scope; when it is non-empty
    # only the pages it contains get their headword overwritten.
    if change_existing_headword and (not lemmas or pagetitle in lemmas):
        del headword_template.params[:]
        for param in modified_noun_table_template.params:
            headword_template.add(param.name, param.value)
        blib.set_param_chain(headword_template, genders, "g", "g")
        blib.set_param_chain(headword_template, masculines, "m", "m")
        blib.set_param_chain(headword_template, feminines, "f", "f")
        if notrcat:
            headword_template.add("notrcat", notrcat)

    #genders = runoun.check_old_noun_headword_forms(headword_template, args,
    #    subpagetitle, pagemsg)
    #if genders == None:
    #    return None

    #new_params = []
    #for param in noun_table_template.params:
    #    new_params.append((param.name, param.value))

    #params_to_preserve = runoun.fix_old_headword_params(headword_template,
    #    new_params, genders, pagemsg)
    #if params_to_preserve == None:
    #    return None

    new_noun_table_template = unicode(noun_table_template)
    if new_noun_table_template != orig_noun_table_template:
        pagemsg("Replacing noun table %s with %s" % (orig_noun_table_template,
            new_noun_table_template))

    new_headword_template = unicode(headword_template)
    if new_headword_template != orig_headword_template:
        pagemsg("Replacing headword %s with %s" % (orig_headword_template,
            new_headword_template))
        if unicode(headword_template.name) == "ru-noun+":
            ru_noun_changed = 1
        else:
            ru_proper_noun_changed = 1

    return unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied, ru_noun_changed, ru_proper_noun_changed
def process_text_on_page(index, pagetitle, text):
    """Modernize Spanish {{es-noun}} and old-style adjective templates.

    Depending on command-line flags in the global `args`:
      * args.remove_redundant_noun_args: replace explicit plurals and
        masculine/feminine forms of {{es-noun}} that equal the default (or
        a "special" variant) with '+' / '+<special>', or remove them.
      * args.make_multiword_plural_explicit: add explicit plurals to
        multiword {{es-noun}} lemmas and multiword m=/f= forms.
    Independently of flags, templates named `old_adj_template` are
    converted to {{es-adj}} / {{es-adj-inv}}.

    Args:
      index: Page index, used only in log messages.
      pagetitle: Title of the page.
      text: Wikitext of the page.

    Returns:
      None if the page has no relevant template or the title contains
      ":"; otherwise (new page text, list of change notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    notes = []

    if old_adj_template not in text and "es-noun" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)

        if tn == "es-noun" and args.remove_redundant_noun_args:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            if not getparam(t, "2") and (getparam(t, "pl2") or getparam(t, "pl3")):
                pagemsg("WARNING: Saw pl2= or pl3= without 2=: %s" % unicode(t))
                continue
            g = getparam(t, "1")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = any(" " in m for m in ms)
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in m=%s and old default noun algorithm applying" %
                    ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            space_in_f = any(" " in f for f in fs)
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls and not g.endswith("-p"):
                pagemsg(
                    "WARNING: Space in f=%s and old default noun algorithm applying" %
                    ",".join(fs))
            pls = blib.fetch_param_chain(t, "2", "pl")
            if not pls and not g.endswith("-p"):
                # No explicit plural to simplify; just flag multiword lemmas.
                if " " in lemma:
                    pagemsg(
                        "WARNING: Space in headword and old default noun algorithm applying"
                    )
                continue
            pls_with_def = []
            defpl = make_plural(lemma)
            if not defpl:
                continue
            if len(defpl) > 1:
                if set(pls) == set(defpl):
                    pls_with_def = ["+"]
                elif set(pls) < set(defpl):
                    pagemsg(
                        "WARNING: pls=%s subset of defpls=%s, replacing with default" %
                        (",".join(pls), ",".join(defpl)))
                    pls_with_def = ["+"]
                else:
                    pls_with_def = pls
            else:
                for pl in pls:
                    if pl == defpl[0]:
                        pls_with_def.append("+")
                    else:
                        pls_with_def.append(pl)

            actual_special = None
            for special in all_specials:
                special_pl = make_plural(lemma, special)
                if special_pl is None:
                    continue
                if len(special_pl) > 1 and set(pls) < set(special_pl):
                    pagemsg(
                        "WARNING: for special=%s, pls=%s subset of special_pl=%s, allowing" %
                        (special, ",".join(pls), ",".join(special_pl)))
                    actual_special = special
                    break
                if set(pls) == set(special_pl):
                    pagemsg("Found special=%s with special_pl=%s" %
                            (special, ",".join(special_pl)))
                    actual_special = special
                    break

            if pls_with_def == ["+"]:
                notes.append("remove redundant plural%s %s from {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls)))
                blib.remove_param_chain(t, "2", "pl")
            elif actual_special:
                notes.append("replace plural%s %s with +%s in {{es-noun}}" %
                             ("s" if len(pls) > 1 else "", ",".join(pls),
                              actual_special))
                blib.set_param_chain(t, ["+" + actual_special], "2", "pl")
            elif pls_with_def != pls:
                notes.append(
                    "replace default plural %s with '+' in {{es-noun}}" %
                    ",".join(defpl))
                blib.set_param_chain(t, pls_with_def, "2", "pl")

            def handle_mf(mf, mf_full, make_mf):
                """Simplify the explicit m=/f= chain `mf` and its plurals.

                `mf_full` is the name used in messages; `make_mf` builds
                the default (or special) form from the lemma.
                """
                mfs = blib.fetch_param_chain(t, mf, mf)
                mfpls = blib.fetch_param_chain(t, mf + "pl", mf + "pl")
                if mfs and not any(x.startswith("+") for x in mfs):
                    # Case 1: the explicit form is exactly the default.
                    defmf = make_mf(lemma)
                    if set(mfs) == {defmf}:
                        defpls = make_plural(defmf)
                        ok = False
                        if not mfpls or set(mfpls) == set(defpls):
                            ok = True
                        elif set(mfpls) < set(defpls):
                            # Was "%pl=%s ...": "%p" is not a valid
                            # conversion and raised ValueError here.
                            pagemsg(
                                "WARNING: %spl=%s subset of default=%s, allowing" %
                                (mf, ",".join(mfpls), ",".join(defpls)))
                            ok = True
                        if ok:
                            notes.append(
                                "replace %s=%s with '+' in {{es-noun}}" %
                                (mf, ",".join(mfs)))
                            blib.set_param_chain(t, ["+"], mf, mf)
                            blib.remove_param_chain(t, mf + "pl", mf + "pl")
                            return

                    # Case 2: the explicit form matches a special indicator.
                    actual_special = None
                    for special in all_specials:
                        special_mf = make_mf(lemma, special)
                        if special_mf is None:
                            continue
                        if mfs == [special_mf]:
                            pagemsg("Found special=%s with special_mf=%s" %
                                    (special, special_mf))
                            actual_special = special
                            break
                    if actual_special:
                        if not mfpls:
                            pagemsg(
                                "WARNING: Explicit %s=%s matches special=%s but no %s plural" %
                                (mf, ",".join(mfs), actual_special, mf_full))
                        else:
                            special_mfpl = make_plural(special_mf,
                                                       actual_special)
                            if special_mfpl:
                                if len(special_mfpl) > 1 and set(mfpls) < set(
                                        special_mfpl):
                                    pagemsg(
                                        "WARNING: for %s=%s and special=%s, %spls=%s subset of special_%spl=%s, allowing" %
                                        (mf, ",".join(mfs), actual_special,
                                         mf, ",".join(mfpls), mf,
                                         ",".join(special_mfpl)))
                                elif set(mfpls) == set(special_mfpl):
                                    pagemsg(
                                        "Found %s=%s and special=%s, %spls=%s matches special_%spl" %
                                        (mf, ",".join(mfs), actual_special,
                                         mf, ",".join(mfpls), mf))
                                else:
                                    pagemsg(
                                        "WARNING: for %s=%s and special=%s, %spls=%s doesn't match special_%spl=%s" %
                                        (mf, ",".join(mfs), actual_special,
                                         mf, ",".join(mfpls), mf,
                                         ",".join(special_mfpl)))
                                    # Plurals disagree: fall through to case 3.
                                    actual_special = None
                            if actual_special:
                                notes.append(
                                    "replace explicit %s %s with special indicator '+%s' in {{es-noun}} and remove explicit %s plural" %
                                    (mf_full, ",".join(mfs), actual_special,
                                     mf_full))
                                blib.set_param_chain(
                                    t, ["+%s" % actual_special], mf, mf)
                                blib.remove_param_chain(
                                    t, mf + "pl", mf + "pl")

                    # Case 3: per-form '+' replacement and plural cleanup.
                    if not actual_special:
                        defmf = make_mf(lemma)
                        mfs_with_def = ["+" if x == defmf else x for x in mfs]
                        if mfs_with_def != mfs:
                            notes.append(
                                "replace default %s %s with '+' in {{es-noun}}" %
                                (mf_full, defmf))
                            blib.set_param_chain(t, mfs_with_def, mf, mf)
                        if mfpls:
                            defpl = [
                                x for y in mfs
                                for x in (make_plural(y) or [])
                            ]
                            ok = False
                            if set(defpl) == set(mfpls):
                                ok = True
                            elif len(defpl) > 1 and set(mfpls) < set(defpl):
                                pagemsg(
                                    "WARNING: for %s=%s, %spl=%s subset of default pl %s, allowing" %
                                    (mf, ",".join(mfs), mf, ",".join(mfpls),
                                     ",".join(defpl)))
                                ok = True
                            if ok:
                                pagemsg(
                                    "Found %s=%s, %spl=%s matches default pl" %
                                    (mf, ",".join(mfs), mf, ",".join(mfpls)))
                                notes.append(
                                    "remove redundant explicit %s plural %s in {{es-noun}}" %
                                    (mf_full, ",".join(mfpls)))
                                blib.remove_param_chain(
                                    t, mf + "pl", mf + "pl")
                            else:
                                for special in all_specials:
                                    defpl = [
                                        x for y in mfs for x in (
                                            make_plural(y, special) or [])
                                    ]
                                    if set(defpl) == set(mfpls):
                                        pagemsg(
                                            "Found %s=%s, %spl=%s matches special=%s" %
                                            (mf, ",".join(mfs), mf,
                                             ",".join(mfpls), special))
                                        notes.append(
                                            "replace explicit %s plural %s with special indicator '+%s' in {{es-noun}}" %
                                            (mf_full, ",".join(mfpls),
                                             special))
                                        blib.set_param_chain(
                                            t, ["+%s" % special], mf + "pl",
                                            mf + "pl")

            handle_mf("f", "feminine", make_feminine)
            handle_mf("m", "masculine", make_masculine)

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

        if tn == "es-noun" and args.make_multiword_plural_explicit:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)

            def expand_text(tempcall):
                return blib.expand_text(tempcall, pagetitle, pagemsg,
                                        args.verbose)

            if " " in lemma and not getparam(t, "2"):
                g = getparam(t, "1")
                if not g.endswith("-p"):
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|%s|true}}" %
                        (lemma, g))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to multiword noun, make_plural_noun returned an empty string"
                        )
                        continue
                    plurals = explicit_pl.split(",")
                    blib.set_param_chain(t, plurals, "2", "pl")
                    notes.append("add explicit plural to multiword noun")
            ms = blib.fetch_param_chain(t, "m", "m")
            space_in_m = any(" " in m for m in ms)
            mpls = blib.fetch_param_chain(t, "mpl", "mpl")
            if space_in_m and not mpls:
                mpls = []
                for m in ms:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|m|true}}" %
                        (blib.remove_links(m)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to m=%s, make_plural_noun returned an empty string" %
                            m)
                        continue
                    this_mpls = explicit_pl.split(",")
                    mpls.extend(this_mpls)
                blib.set_param_chain(t, mpls, "mpl", "mpl")
                notes.append("add explicit plural to m=%s" % ",".join(ms))
            fs = blib.fetch_param_chain(t, "f", "f")
            space_in_f = any(" " in f for f in fs)
            fpls = blib.fetch_param_chain(t, "fpl", "fpl")
            if space_in_f and not fpls:
                fpls = []
                for f in fs:
                    explicit_pl = expand_text(
                        "{{#invoke:es-headword|make_plural_noun|%s|f|true}}" %
                        (blib.remove_links(f)))
                    if not explicit_pl:
                        pagemsg(
                            "WARNING: Unable to add explicit plural to f=%s, make_plural_noun returned an empty string" %
                            f)
                        continue
                    this_fpls = explicit_pl.split(",")
                    fpls.extend(this_fpls)
                blib.set_param_chain(t, fpls, "fpl", "fpl")
                notes.append("add explicit plural to f=%s" % ",".join(fs))
            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))

        if tn == old_adj_template:
            origt = unicode(t)
            lemma = blib.remove_links(getparam(t, "head") or pagetitle)
            deff = make_feminine(pagetitle)
            defmpl = make_plural(pagetitle)

            # fs holds feminines with defaults abbreviated ('+' = default,
            # '#' = same as lemma); fullfs keeps the unabbreviated forms.
            fs = []
            fullfs = []
            f = getparam(t, "f") or pagetitle
            fullfs.append(f)
            if f == deff:
                f = "+"
            elif f == lemma:
                f = "#"
            fs.append(f)
            f2 = getparam(t, "f2")
            if f2:
                fullfs.append(f2)
                if f2 == deff:
                    # Was `f2 == "+"`, a comparison with no effect, so a
                    # default second feminine was never abbreviated.
                    f2 = "+"
                fs.append(f2)

            mpls = []
            mpl = getparam(t, "mpl") or getparam(t, "pl") or pagetitle + "s"
            mpls.append(mpl)
            mpl2 = getparam(t, "mpl2") or getparam(t, "pl2")
            if mpl2:
                mpls.append(mpl2)
            fullmpls = mpls
            # should really check for subsequence but it never occurs
            if set(mpls) == set(defmpl):
                mpls = ["+"]
            elif set(mpls) < set(defmpl):
                pagemsg(
                    "WARNING: mpls=%s subset of defmpl=%s, replacing with default" %
                    (",".join(mpls), ",".join(defmpl)))
                mpls = ["+"]
            mpls = ["#" if x == lemma else x for x in mpls]

            deffpl = [x for fem in fullfs for x in make_plural(fem)]
            fpls = []
            fpl = getparam(t, "fpl") or getparam(
                t, "pl") or (getparam(t, "f") or pagetitle) + "s"
            fpls.append(fpl)
            fpl2 = getparam(t, "fpl2") or getparam(t, "pl2")
            if fpl2:
                fpls.append(fpl2)
            fullfpls = fpls
            # should really check for subsequence but it never occurs
            if set(fpls) == set(deffpl):
                fpls = ["+"]
            elif set(fpls) < set(deffpl):
                pagemsg(
                    "WARNING: fpls=%s subset of deffpl=%s, replacing with default" %
                    (",".join(fpls), ",".join(deffpl)))
                fpls = ["+"]
            fpls = ["#" if x == lemma else x for x in fpls]

            # See whether a single special indicator reproduces all forms.
            actual_special = None
            for special in all_specials:
                deff = make_feminine(pagetitle, special)
                if deff is None:
                    continue
                defmpl = make_plural(pagetitle, special)
                deffpl = make_plural(deff, special)
                deff = [deff]
                if fullfs == deff and fullmpls == defmpl and fullfpls == deffpl:
                    actual_special = special
                    break

            head = getparam(t, "head")

            must_continue = False
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                if pn == "1" and pv in ["m", "mf"]:
                    pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" %
                            (pn, pv, unicode(t)))
                    continue
                if pn not in [
                        "head", "f", "f2", "pl", "pl2", "mpl", "mpl2", "fpl",
                        "fpl2"
                ]:
                    pagemsg("WARNING: Saw unrecognized param %s=%s in %s" %
                            (pn, pv, unicode(t)))
                    must_continue = True
                    break
            if must_continue:
                continue

            # Rebuild the template from scratch in the new format.
            del t.params[:]
            if head:
                t.add("head", head)
            if fullfs == [pagetitle] and fullmpls == [
                    pagetitle
            ] and fullfpls == [pagetitle]:
                blib.set_template_name(t, "es-adj-inv")
            else:
                blib.set_template_name(t, "es-adj")
                if actual_special:
                    t.add("sp", actual_special)
                else:
                    if fs != ["+"]:
                        blib.set_param_chain(t, fs, "f", "f")

                    if mpls == fpls and ("+" not in mpls or defmpl == deffpl):
                        # masc and fem pl the same
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "pl", "pl")
                    else:
                        if mpls != ["+"]:
                            blib.set_param_chain(t, mpls, "mpl", "mpl")
                        if fpls != ["+"]:
                            blib.set_param_chain(t, fpls, "fpl", "fpl")

            if origt != unicode(t):
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
                notes.append("convert {{%s}} to new {{%s}} format" %
                             (old_adj_template, tname(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))

    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Simplify respellings in {{it-IPA}} templates.

    Each respelling in params 1..10 whose phonemic output (obtained by
    expanding Module:it-pronunciation's to_phonemic_bot) equals that of the
    page title is replaced by '+'; one that is equivalent to its single
    accented vowel is replaced by that vowel. If everything collapses to a
    single '+', the numbered params are removed.

    Args:
      index: Page index, used only in log messages.
      pagetitle: Title of the page.
      text: Wikitext of the page.

    Returns:
      None if the text does not mention "it-IPA"; otherwise
      (new page text, list of change notes).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def getpron(pron):
        return expand_text("{{#invoke:it-pronunciation|to_phonemic_bot|%s}}" %
                           pron)

    notes = []

    if "it-IPA" not in text:
        return

    parsed = blib.parse_text(text)
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        if tn in ["it-IPA"]:
            pagemsg("Saw %s" % unicode(t))
            # Phonemic form of the page title; computed lazily on first need.
            default_pron_phonemic = None
            prons = []
            for i in xrange(1, 11):
                pron = getparam(t, str(i))
                if pron:
                    prons.append(pron)
            if not prons:
                # Was `prons == ["+"]`, a comparison with no effect; a
                # template without respellings means the default.
                prons = ["+"]

            defaulted_prons = []

            def add(prn):
                # Append while suppressing duplicates.
                if prn not in defaulted_prons:
                    defaulted_prons.append(prn)

            for pron in prons:
                if pron == "+" or pron == pagetitle:
                    add("+")
                elif len(pron) == 1:
                    # vowel only
                    add(pron)
                else:
                    # full pronun
                    pron_phonemic = None
                    if default_pron_phonemic is None:
                        default_pron_phonemic = getpron(pagetitle)
                    if default_pron_phonemic:
                        pron_phonemic = getpron(pron)
                        if not pron_phonemic:
                            # Expansion failed; keep the respelling as is.
                            add(pron)
                            continue
                        if default_pron_phonemic == pron_phonemic:
                            pron = "+"
                    if pron != "+":
                        if pron_phonemic is None:
                            pron_phonemic = getpron(pron)
                            if not pron_phonemic:
                                add(pron)
                                continue
                        single_vowel_spec = re.sub(u"[^àèéìòúù]", "", pron)
                        if len(single_vowel_spec) == 1:
                            single_vowel_pron_phonemic = getpron(
                                single_vowel_spec)
                            if single_vowel_pron_phonemic == pron_phonemic:
                                pron = single_vowel_spec
                    add(pron)

            if defaulted_prons == ["+"]:
                blib.remove_param_chain(t, "1", "")
                if unicode(t) != origt:
                    notes.append(
                        "remove redundant respelling(s) from {{it-IPA}}")
            else:
                blib.set_param_chain(t, defaulted_prons, "1", "")
                if unicode(t) != origt:
                    notes.append(
                        "replace default respelling(s) with single-vowel spec or '+' in {{it-IPA}}"
                    )
        if unicode(t) != origt:
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes
def process_page(page, index, parsed):
  """Normalize Bulgarian noun headword templates on one page.

  Two kinds of template are rewritten:
    * {{head|bg|noun(s)}} / {{head|bg|proper noun(s)}} become {{bg-noun}} /
      {{bg-proper noun}}, with head= moved to 1=, g=/g2=/g3= moved to the
      2=/g2=... chain and "masculine"/"feminine" equivalents moved to
      m=/f= chains.
    * Existing {{bg-noun}} / {{bg-proper noun}} get head=/sg= moved to 1=
      and their gender params moved to the 2=/g2=... chain.

  The `parsed` argument is ignored: the page text is re-parsed here.

  Returns a tuple (parsed wikicode, notes).
  """
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  pagemsg("Processing")
  notes = []

  parsed = blib.parse_text(unicode(page.text))

  simple_genders = ["m", "f", "n", "m-p", "f-p", "n-p", "p"]
  # Expansion of every gender spec accepted on {{bg-noun}}: simple specs map
  # to themselves, compound specs to their components in a fixed order.
  gender_expansions = dict((spec, [spec]) for spec in simple_genders)
  for spec in ["mf", "fm"]:
    gender_expansions[spec] = ["m", "f"]
  for spec in ["mn", "nm"]:
    gender_expansions[spec] = ["m", "n"]
  for spec in ["fn", "nf"]:
    gender_expansions[spec] = ["f", "n"]
  for spec in ["mfn", "fmn", "mnf", "nmf", "fnm", "nfm"]:
    gender_expansions[spec] = ["m", "f", "n"]

  head_allowed_params = ["1", "2", "head", "g", "g2", "g3", "3", "4", "5",
    "6", "7", "8", "9", "10"]

  for t in parsed.filter_templates():
    tn = tname(t)
    origt = unicode(t)
    is_bg_noun_head = (tn == "head" and getparam(t, "1") == "bg" and
      getparam(t, "2") in ["noun", "nouns", "proper noun", "proper nouns"])
    if is_bg_noun_head:
      pos = getparam(t, "2")

      # Find the first param (if any) that we don't know how to convert.
      offending = None
      for param in t.params:
        name = unicode(param.name).strip()
        value = unicode(param.value).strip()
        if name not in head_allowed_params:
          offending = (name, value)
        elif name == "3" and value not in ["masculine", "feminine"]:
          offending = (name, value)
        elif name in ["5", "7", "9"] and value != "or":
          offending = (name, value)
        if offending is not None:
          break

      if offending is not None:
        pagemsg("WARNING: head|bg|%s with extra param %s=%s: %s" %
          (pos, offending[0], offending[1], origt))
      else:
        rmparam(t, "1")
        rmparam(t, "2")
        head = getparam(t, "head")
        rmparam(t, "head")

        genders = []
        for gender_param in ["g", "g2", "g3"]:
          gender = getparam(t, gender_param)
          if gender:
            if gender in simple_genders:
              genders.append(gender)
            else:
              pagemsg("WARNING: Unrecognized gender '%s'" % gender)
          rmparam(t, gender_param)

        # 3=masculine/feminine is followed by the equivalent in 4=, then by
        # any number of "or" + equivalent pairs starting at 5=.
        masculines = []
        feminines = []
        equivalents_by_label = {"masculine": masculines, "feminine": feminines}
        label = getparam(t, "3")
        if label in equivalents_by_label:
          equivalents = equivalents_by_label[label]
          equivalents.append(getparam(t, "4"))
          rmparam(t, "3")
          rmparam(t, "4")
          or_index = 5
          while getparam(t, str(or_index)) == "or":
            equivalents.append(getparam(t, str(or_index + 1)))
            rmparam(t, str(or_index))
            rmparam(t, str(or_index + 1))
            or_index += 2

        newtn = "bg-noun" if pos in ["noun", "nouns"] else "bg-proper noun"
        blib.set_template_name(t, newtn)
        t.add("1", head or pagetitle)
        blib.set_param_chain(t, genders, "2", "g")
        if masculines:
          blib.set_param_chain(t, masculines, "m", "m")
        if feminines:
          blib.set_param_chain(t, feminines, "f", "f")
        notes.append("convert {{head|bg|%s}} into {{%s}}" % (pos, newtn))

    elif tn in ["bg-noun", "bg-proper noun"]:
      # Old-style templates carry the gender in 1=; anything else containing
      # Latin letters in 1= is left alone.
      positional_gender = None
      first_param = getparam(t, "1")
      if first_param in ["m", "f"]:
        positional_gender = first_param
      elif re.search("[a-zA-Z]", first_param):
        pagemsg("WARNING: Saw Latin in 1=%s in %s" % (first_param, origt))
        continue

      head = getparam(t, "head") or getparam(t, "sg")
      rmparam(t, "head")
      rmparam(t, "sg")

      genders = []
      def record_gender(spec):
        expansion = gender_expansions.get(spec)
        if expansion is None:
          pagemsg("WARNING: Unrecognized gender '%s'" % spec)
        else:
          genders.extend(expansion)

      if positional_gender:
        record_gender(positional_gender)
        rmparam(t, "1")
      second_param = getparam(t, "2")
      if second_param:
        record_gender(second_param)
      for gender_param in ["g", "g2", "g3"]:
        gender = getparam(t, gender_param)
        if gender:
          record_gender(gender)
        rmparam(t, gender_param)

      # Remember every remaining non-empty param so it can be re-added after
      # the new leading params.
      kept_params = []
      for param in t.params:
        name = unicode(param.name).strip()
        value = unicode(param.value).strip()
        if value:
          kept_params.append((name, value, param.showkey))

      # Erase all params.
      del t.params[:]
      # Put back new params.
      t.add("1", rulib.remove_monosyllabic_accents(head or pagetitle))
      blib.set_param_chain(t, genders, "2", "g")
      for name, value, showkey in kept_params:
        t.add(name, value, showkey=showkey, preserve_spacing=False)
      if origt != unicode(t):
        notes.append("move head=/sg= to 1=, g= to 2= in {{%s}}" % tn)

    if unicode(t) != origt:
      pagemsg("Replaced %s with %s" % (origt, unicode(t)))

  return parsed, notes
def process_page(page, index, adverb):
  """Record an adverb in the adv= chain of a Latin adjective/participle head.

  The page is searched for {{la-adj}} and {{la-part}} templates; the first
  {{la-adj}} is preferred, otherwise the first {{la-part}}.  If an existing
  adv= value equals `adverb` apart from macrons, it is replaced with
  `adverb`; if none matches, `adverb` is appended to the chain.

  Parameters:
    page: page object providing title() and text.
    index: page index, used only in log messages.
    adverb: the adverb (with macrons) to record.

  Returns:
    (None, None) when the page has neither template; otherwise
    (new_text, notes).
  """
  # The body refers to the adverb as `adv`; bind it explicitly so the
  # function does not depend on a same-named global existing.
  adv = adverb
  pagetitle = unicode(page.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  pagemsg("Processing")

  text = unicode(page.text)
  parsed = blib.parse_text(text)
  adj_template = None
  part_template = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn == "la-adj":
      if adj_template:
        pagemsg("WARNING: Saw multiple adjective templates: %s and %s" % (unicode(adj_template), unicode(t)))
      else:
        adj_template = t
    if tn == "la-part":
      if part_template:
        pagemsg("WARNING: Saw multiple participle templates: %s and %s" % (unicode(part_template), unicode(t)))
      else:
        part_template = t
  if adj_template and part_template:
    pagemsg("Saw both %s and %s, modifying adjective" % (unicode(adj_template), unicode(part_template)))
  if adj_template:
    template_to_fix = adj_template
  elif part_template:
    template_to_fix = part_template
  else:
    pagemsg("WARNING: Didn't see adjective or participle template")
    return None, None

  existing_advs = blib.fetch_param_chain(template_to_fix, "adv", "adv")
  changed = False
  for i, existing_adv in enumerate(existing_advs):
    if lalib.remove_macrons(existing_adv) == lalib.remove_macrons(adv):
      if existing_adv != adv:
        pagemsg("Updating macrons of %s -> %s in %s" % (existing_adv, adv, unicode(template_to_fix)))
        existing_advs[i] = adv
        changed = True
        # Report the value that was replaced, not the slot we just overwrote.
        notes.append("update macrons of adv=, changing %s -> %s" % (existing_adv, adv))
      else:
        pagemsg("Already saw %s: %s" % (adv, unicode(template_to_fix)))
      break
  else: # no break
    existing_advs.append(adv)
    changed = True
    notes.append("add adv %s to adjective" % adv)
  if changed:
    origt = unicode(template_to_fix)
    blib.set_param_chain(template_to_fix, existing_advs, "adv", "adv")
    pagemsg("Replaced %s with %s" % (origt, unicode(template_to_fix)))

  return unicode(parsed), notes
def add_rel_adj_or_dim_to_noun_page(nounpage, index, new_adj_or_dims, param, desc):
  """Add terms to a Russian noun headword and drop them from term lists.

  The terms in `new_adj_or_dims` that are not yet present are appended to the
  `param` chain of the single Russian noun headword template on the page.
  Every term in the resulting chain is then removed from the
  "Derived terms" / "Related terms" subsections where it appears as
  {{l|ru|...}} or {{m|ru|...}}, with leftover commas, empty bullets and
  emptied subsections cleaned up.

  Parameters:
    nounpage: page object providing title() and text.
    index: page index, used only in log messages.
    new_adj_or_dims: list of terms to add.
    param: name of the headword param chain to extend.
    desc: human-readable description of the terms, used in messages/notes.

  Returns:
    None (bare return) if the Russian section or a unique headword template
    cannot be found; otherwise (new_page_text, notes).
  """
  notes = []
  pagetitle = unicode(nounpage.title())
  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  text = unicode(nounpage.text)
  retval = blib.find_modifiable_lang_section(text, "Russian", pagemsg)
  if retval is None:
    pagemsg("WARNING: Couldn't find Russian section for noun of %s %s" % (
      desc, ",".join(new_adj_or_dims)))
    return
  sections, j, secbody, sectail, has_non_lang = retval
  parsed = blib.parse_text(secbody)
  head = None
  for t in parsed.filter_templates():
    tn = tname(t)
    if tn in ["ru-noun+", "ru-proper noun+", "ru-noun", "ru-proper noun"]:
      if head:
        pagemsg("WARNING: Saw multiple heads %s and %s for noun of %s %s, not modifying" %
            (unicode(head), unicode(t), desc, ",".join(new_adj_or_dims)))
        return
      head = t
  if not head:
    pagemsg("WARNING: Couldn't find head for noun of %s %s" % (desc, ",".join(new_adj_or_dims)))
    return
  orig_adjs_or_dims = blib.fetch_param_chain(head, param, param)
  # Work on a copy so the original chain stays available for comparison.
  adjs_or_dims = list(orig_adjs_or_dims)
  added_adjs_or_dims = []
  for adj_or_dim in new_adj_or_dims:
    if adj_or_dim in adjs_or_dims:
      pagemsg("Already saw %s %s in head %s" % (desc, adj_or_dim, unicode(head)))
    else:
      adjs_or_dims.append(adj_or_dim)
      added_adjs_or_dims.append(adj_or_dim)
  if adjs_or_dims != orig_adjs_or_dims:
    orighead = unicode(head)
    blib.set_param_chain(head, adjs_or_dims, param, param)
    pagemsg("Replaced %s with %s" % (orighead, unicode(head)))
    notes.append("add %s=%s to Russian noun" % (param, ",".join(added_adjs_or_dims)))
    secbody = unicode(parsed)
  # Splitting on a captured header pattern yields
  # [preamble, header1, body1, header2, body2, ...]; bodies sit at even
  # indices >= 2 with their header at the preceding index.
  subsecs = re.split("(^==.*==\n)", secbody, 0, re.M)
  for k in xrange(2, len(subsecs), 2):
    if "==Derived terms==" in subsecs[k - 1] or "==Related terms==" in subsecs[k - 1]:
      header = re.sub("=", "", subsecs[k - 1]).strip()
      for adj_or_dim in adjs_or_dims:
        def note_removed_text(m):
          # Group 1 holds any extra template params (e.g. a gloss); log when
          # we are discarding them along with the link.
          if m.group(1):
            pagemsg("Removed '%s' term with gloss for noun of %s %s: %s" %
                (header, desc, adj_or_dim, m.group(0)))
          return ""
        # Escape the term so regex metacharacters in it are matched literally.
        newsubsecsk = re.sub(r"\{\{[lm]\|ru\|%s((?:\|[^{}\n]*)?)\}\}" % re.escape(adj_or_dim),
            note_removed_text, subsecs[k])
        if newsubsecsk != subsecs[k]:
          notes.append("remove %s %s from %s" % (desc, adj_or_dim, header))
        subsecs[k] = newsubsecsk
      subsecs[k] = re.sub(", *,", ",", subsecs[k])
      # Repeat in case adjacent terms removed (unlikely though).
      subsecs[k] = re.sub(", *,", ",", subsecs[k])
      # Trailing comma at end of line, leading comma after a bullet, and
      # bullets left with no content.
      subsecs[k] = re.sub(" *, *$", "", subsecs[k], 0, re.M)
      subsecs[k] = re.sub(r"^\* *, *", "* ", subsecs[k], 0, re.M)
      subsecs[k] = re.sub(r"^\* *(\n|$)", "", subsecs[k], 0, re.M)
      if re.search(r"^\s*$", subsecs[k]):
        # Nothing left in the subsection: drop it along with its header.
        subsecs[k] = ""
        subsecs[k - 1] = ""
  secbody = "".join(subsecs)
  secj = secbody + sectail
  newsecj = re.sub(r"\n\n\n+", "\n\n", secj)
  if newsecj != secj and not notes:
    notes.append("eliminate sequences of 3 or more newlines")
  secj = newsecj
  sections[j] = secj
  return "".join(sections), notes
def process_text_on_page(index, pagetitle, text):
  """Check and fix macrons in Latin non-lemma headwords and their lemmas.

  For every recognized non-lemma headword in the Latin section (noun,
  proper noun, verb, adjective, participle and numeral forms), the
  {{inflection of}} templates beneath it are compared against the
  inflections returned by lookup_inflection() for the referenced lemma.
  Up to three stages are tried in order (see the long comment in the body);
  depending on the stage, the lemma param of an {{inflection of}} template,
  the headword form(s) and the pronunciation template(s) may be updated.

  Parameters:
    index: page index, used only in log messages.
    pagetitle: page title; a "Reconstruction:Latin/" prefix is replaced
      with "*".
    text: wikitext of the page.

  Returns:
    (None, None) if the text has no "==Latin==" header or
    lalib.find_heads_and_defns() returns None; otherwise (new_text, notes).
  """
  global args

  if pagetitle.startswith("Reconstruction:Latin/"):
    pagetitle = re.sub("^Reconstruction:Latin/", "*", pagetitle)

  def pagemsg(txt):
    msg("Page %s %s: %s" % (index, pagetitle, txt))
  def errandpagemsg(txt):
    errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

  notes = []

  if not args.stdin:
    pagemsg("Processing")

  # Greatly speed things up when --stdin by ignoring non-Latin pages
  if "==Latin==" not in text:
    return None, None

  retval = lalib.find_heads_and_defns(text, pagemsg)
  if retval is None:
    return None, None

  (sections, j, secbody, sectail, has_non_latin, subsections,
   parsed_subsections, headwords, pronun_sections, etym_sections) = retval

  for headword in headwords:
    ht = headword['head_template']
    tn = tname(ht)
    # Select, per part of speech, the tag groups, the set of valid slots and
    # the lemma headword/inflection template names passed to
    # lookup_inflection(). Unrecognized headwords are skipped.
    if tn == "la-noun-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "noun form":
      pos = "noun"
      tag_set_groups = lalib.noun_tag_groups
      possible_slots = lalib.la_noun_decl_overrides
      expected_headtemps = ["la-noun"]
      expected_infltemps = ["la-ndecl"]
    elif tn == "la-proper noun-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "proper noun form":
      pos = "pn"
      tag_set_groups = lalib.noun_tag_groups
      possible_slots = lalib.la_noun_decl_overrides
      expected_headtemps = ["la-proper noun"]
      expected_infltemps = ["la-ndecl"]
    #elif tn == "la-pronoun-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "pronoun form":
    #  pos = "pronoun"
    #  tag_set_groups = lalib.adj_tag_groups
    #  possible_slots = lalib.la_adj_decl_overrides
    #  expected_headtemp = ???
    elif tn == "la-verb-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "verb form":
      pos = "verb"
      tag_set_groups = lalib.verb_tag_groups
      possible_slots = lalib.la_verb_overrides
      expected_headtemps = ["la-verb"]
      expected_infltemps = ["la-conj"]
    elif tn == "la-adj-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "adjective form":
      pos = "adj"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-adj", "la-adj-comp", "la-adj-sup"]
      expected_infltemps = ["la-adecl"]
    elif tn == "la-part-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "participle form":
      pos = "part"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-part"]
      expected_infltemps = ["la-adecl"]
    #elif tn == "la-suffix-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "suffix form":
    #  pos = "suffix"
    elif tn == "la-num-form" or tn == "head" and getparam(ht, "1") == "la" and getparam(ht, "2") == "numeral form":
      pos = "numadj"
      tag_set_groups = lalib.adj_tag_groups
      possible_slots = lalib.la_adj_decl_overrides
      expected_headtemps = ["la-num-adj"]
      expected_infltemps = ["la-adecl"]
    else:
      continue

    #
    # We have the following:
    #
    # 1. The non-lemma headword, with one or (potentially but unlikely) more
    #    than one headword form.
    # 2. Under the headword, multiple {{inflection of}} templates, each of
    #    which specifies a single lemma under which the non-lemma form
    #    belongs, and one or more corresponding tag sets.
    # 3. The lemma page corresponding to the lemma specified in an
    #    {{inflection of}} template may have one or more lemmas of the right
    #    part of speech. Each lemma specifies one or (potentially but
    #    unlikely) more than one lemma form. Some, all or none of the lemmas
    #    might match the lemma specified in the {{inflection of}} template
    #    in macrons (i.e. there's an exact match between the lemma in the
    #    {{inflection of}} template and one of the actual lemma forms of a
    #    lemma on the page).
    # 4. Under each lemma on the lemma page is one or more inflection
    #    templates specifying the inflections of the lemma. Each inflection
    #    template specifies the non-lemma form(s) (potentially more than one)
    #    for each slot.
    #
    # When looking up a given {{inflection of}} template, the ideal case is
    # that the specified lemma matches one of the actual lemmas, and all
    # corresponding specified non-lemma forms match the corresponding actual
    # non-lemma form(s) for all tag sets. (If there are multiple specified
    # non-lemma forms, they may match across inflection templates if there's
    # more than one, e.g. the first matches the first inflection template and
    # the second matches the second inflection template.)
    #
    # What if there are mismatches?
    #
    # 1. If the specified non-lemma forms are a subset of the actual
    #    non-lemma forms for a given {{inflection of}} template and lemma,
    #    this is still considered a match but we make a note of it (not a
    #    warning).
    # 2. If a single {{inflection of}} template has multiple tag sets in it
    #    and for some but not all tag sets the specified non-lemma forms
    #    match, we consider this a match but issue a warning. (In the future,
    #    we might consider removing the bad tag sets, conditioned on a
    #    separate command-line flag.)
    # 3. If the specified lemma of a given {{inflection of}} template
    #    doesn't match any actual lemmas, we look at all actual lemmas that
    #    match except for macrons and see if, for any of them, the specified
    #    non-lemma forms match the actual non-lemma forms per (1) and (2).
    #    If so, we gather the set of lemma forms for all such lemmas. If
    #    there's only one, we can update the specified lemma in the
    #    {{inflection of}} template (and issue a warning). If there are
    #    multiple, we issue a warning and don't update the specified lemma.
    # 4. We first loop through all {{inflection of}} templates for the given
    #    specified non-lemma forms and check for matches according to
    #    (1), (2) and (3). If some but not all templates match, we issue
    #    a warning and we're done with this non-lemma headword.
    # 5. If there are no matches per (4), we look for the set of actual forms
    #    that match all tag sets of all {{inflection of}} templates when
    #    ignoring macron differences. If there is such a non-empty set,
    #    we can update the specified non-lemma forms in the non-lemma
    #    headword (and issue a warning). When doing so, we may need to
    #    update the corresponding pronunciation template(s), according to
    #    logic still to be determined (FIXME), but similar to or identical to
    #    existing logic in clean_latin_long_vowels.py.
    # 6. If there are no matches per (5), we first look at the possible
    #    assignments of actual lemmas to each possible {{inflection of}}
    #    template (ignoring macron differences). If there's only one such
    #    assignment (i.e. each {{inflection of}} template can be assigned to
    #    only one actual lemma), then for that assignment, we find the
    #    actual forms that match the non-lemma pagename except in macrons and
    #    are common among all the sets of inflections, and update the
    #    specified non-lemma forms in the non-lemma headword using those
    #    forms (and issue a warning). When doing so, we may need to update
    #    the corresponding pronunciation template(s) as in (5). If there are
    #    no forms in common, issue a warning and do nothing.
    # 7. If there are multiple assignments of actual lemmas to
    #    {{inflection of}} templates, we loop over all possible assignments.
    #    For each assignment, we find the set of actual common non-lemma
    #    forms as in (6). If there is more than one assignment with a
    #    non-empty set of actual common non-lemma forms, or no assignment,
    #    we issue a warning and do nothing. Otherwise, we update the
    #    specified non-lemma forms in the non-lemma headword (and
    #    corresponding pronunciation template(s)) as in (6).

    # Collect the headword forms, keeping only those that equal the page
    # title once macrons are removed, without duplicates.
    headword_forms = lalib.la_get_headword_from_template(ht, pagetitle, pagemsg)
    matching_headword_forms = []
    for headword_form in headword_forms:
      if "[" in headword_form or "|" in headword_form:
        pagemsg("WARNING: Bracket or pipe symbol in non-lemma headword form, should not happen: %s" % unicode(ht))
      headword_form = blib.remove_links(headword_form)
      if lalib.remove_macrons(headword_form) != pagetitle:
        pagemsg("WARNING: Bad headword form %s, doesn't match page title: %s" % (headword_form, unicode(ht)))
      elif headword_form in matching_headword_forms:
        pagemsg("WARNING: Duplicate headword form %s: %s" % (headword_form, unicode(ht)))
      else:
        matching_headword_forms.append(headword_form)
    headword_forms = matching_headword_forms

    # Stage 1 checks the templates as they are (possibly fixing lemma
    # macrons); stage 2 tries to fix the headword form(s) using exactly
    # matching lemmas; stage 3 additionally allows macron differences in the
    # lemmas. A `break` out of this loop ends processing of the headword.
    for stage in [1, 2, 3]:

      def stagemsg(txt):
        pagemsg("Stage %s: %s" % (stage, txt))
      def errandstagemsg(txt):
        errandpagemsg("Stage %s: %s" % (stage, txt))

      def yield_infl_of_templates_and_properties():
        # Generator over the usable {{inflection of}} templates of this
        # headword. Yields (template, index of the lemma param, lemma,
        # result of lookup_inflection(), list of tag sets). Templates for
        # another language, with a link in the lemma, or whose lemma lookup
        # returns None are skipped with a warning.
        for t in headword['infl_of_templates']:
          # With lang= the lemma is in 1=; otherwise the language is in 1=
          # and the lemma in 2=.
          lang = getparam(t, "lang")
          if lang:
            lemma_param = 1
          else:
            lang = getparam(t, "1")
            lemma_param = 2
          if lang != "la":
            errandstagemsg("WARNING: In Latin section, found {{inflection of}} for different language %s: %s" % (lang, unicode(t)))
            continue
          lemma = getparam(t, str(lemma_param))
          if "[" in lemma or "|" in lemma:
            stagemsg("WARNING: Link in lemma %s, skipping: %s" % (lemma, unicode(t)))
            continue
          inflargs_sets = lookup_inflection(lalib.remove_macrons(lemma), pos,
              expected_headtemps, expected_infltemps, stagemsg, errandstagemsg)
          if inflargs_sets is None:
            stagemsg("WARNING: Lemma %s doesn't exist or has no %s heads" % (lemma, pos))
            continue

          # fetch tags
          # (non-empty numbered params starting two positions after the
          # lemma param)
          tags = []
          for param in t.params:
            pname = unicode(param.name).strip()
            pval = unicode(param.value).strip()
            if re.search("^[0-9]+$", pname):
              if int(pname) >= lemma_param + 2:
                if pval:
                  tags.append(pval)
          # split tags into tag sets (which may be multipart) and further
          # split any multipart tag sets into component tag sets
          tag_sets = [
            tag_set
            for maybe_multipart_tag_set in lalib.split_tags_into_tag_sets(tags)
            for tag_set in lalib.split_multipart_tag_set(maybe_multipart_tag_set)
          ]
          yield t, lemma_param, lemma, inflargs_sets, tag_sets

      def merge_forms_for_slot(slot, this_inflargs):
        # Merge the forms of all inflection templates under the given
        # lemma headword
        #
        # Returns a 4-tuple of lists: all link-free forms for the slot; the
        # same plus "syncopated" variants of verb forms; and each of those
        # two lists restricted to forms equal to the page title once macrons
        # are removed.
        all_valid_forms = []
        all_valid_forms_with_syncopated = []
        for inflargs in this_inflargs:
          if slot not in inflargs:
            continue
          # NOTE(review): this binds a local of merge_forms_for_slot(); the
          # same-named variable in check_for_tag_set_match() is a different
          # local and is not affected by this assignment.
          saw_slot_in_inflargs = True
          forms = inflargs[slot].split(",")
          valid_forms = [form for form in forms if "[" not in form and "|" not in form]
          for form in valid_forms:
            if form not in all_valid_forms:
              all_valid_forms.append(form)
            if form not in all_valid_forms_with_syncopated:
              all_valid_forms_with_syncopated.append(form)
            if pos == "verb" and re.search(u"v[eiē]", form):
              # The greedy (.*) makes this drop the last "v" + e/i/ē in the
              # form.
              syncopated_form = re.sub(u"^(.*)v[eiē]", r"\1", form)
              if syncopated_form not in all_valid_forms_with_syncopated:
                all_valid_forms_with_syncopated.append(syncopated_form)
        all_matchable_forms = [
          form for form in all_valid_forms
          if lalib.remove_macrons(form) == pagetitle
        ]
        all_matchable_forms_with_syncopated = [
          form for form in all_valid_forms_with_syncopated
          if lalib.remove_macrons(form) == pagetitle
        ]
        return (all_valid_forms, all_valid_forms_with_syncopated,
            all_matchable_forms, all_matchable_forms_with_syncopated)

      if stage == 1:
        # Becomes True as soon as any {{inflection of}} template has at least
        # one matching tag set; in that case later stages are skipped.
        matched_infl_of_templates = False
        for t, lemma_param, lemma, inflargs_sets, tag_sets in yield_infl_of_templates_and_properties(
        ):

          def check_for_tag_set_match(tag_set, allow_lemma_mismatch):
            # Return the list of actual lemmas under which the headword
            # form(s) match the forms of the slot named by `tag_set`. With
            # allow_lemma_mismatch, actual lemmas are compared to the
            # template's lemma ignoring macrons; otherwise exactly.
            slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
            if slot is None:
              # Already issued warning
              return []
            if slot not in possible_slots:
              stagemsg("WARNING: Unrecognized slot %s from tag set: %s" % (slot, unicode(t)))
              return []
            # NOTE(review): never set to True in this function (see the note
            # in merge_forms_for_slot()), so the `not saw_slot_in_inflargs`
            # test below always succeeds.
            saw_slot_in_inflargs = False
            matching_actual_lemmas = []
            for actual_lemmas, this_inflargs in inflargs_sets:
              saw_matching_lemma = False
              for actual_lemma in actual_lemmas:
                actual_lemma = blib.remove_links(actual_lemma)
                if (lalib.remove_macrons(lemma) == lalib.remove_macrons(actual_lemma)
                    if allow_lemma_mismatch else lemma == actual_lemma):
                  saw_matching_lemma = True
              if not saw_matching_lemma:
                continue
              (all_valid_forms, all_valid_forms_with_syncopated,
               all_matchable_forms, all_matchable_forms_with_syncopated) = (
                merge_forms_for_slot(slot, this_inflargs))
              # Try, in order: exact match, subset match, then the same two
              # against the lists that include syncopated forms.
              matched_form = False
              if set(headword_forms) == set(all_matchable_forms):
                stagemsg("Matched headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) %s)" %
                  (",".join(headword_forms), slot, lemma, ",".join(all_valid_forms)))
                matched_form = True
              elif set(headword_forms) <= set(all_matchable_forms):
                stagemsg("Matched headword form(s) %s as subset of all matchable slot form(s) %s (slot %s, lemma %s, all valid slot forms(s) %s)" %
                  (",".join(headword_forms), ",".join(all_matchable_forms), slot, lemma, ",".join(all_valid_forms)))
                matched_form = True
              elif set(headword_forms) == set(all_matchable_forms_with_syncopated):
                stagemsg("Matched syncopated headword form(s) %s exactly (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)" %
                  (",".join(headword_forms), slot, lemma, ",".join(all_valid_forms_with_syncopated))
                )
                matched_form = True
              elif set(headword_forms) <= set(all_matchable_forms_with_syncopated):
                stagemsg("Matched syncopated headword form(s) %s as subset of all matchable slot form(s) + syncopation %s (slot %s, lemma %s, all valid slot forms(s) + syncopation %s)" %
                  (",".join(headword_forms), ",".join(all_matchable_forms_with_syncopated), slot, lemma,
                   ",".join(all_valid_forms_with_syncopated)))
                matched_form = True
              if matched_form:
                for actual_lemma in actual_lemmas:
                  if actual_lemma not in matching_actual_lemmas:
                    matching_actual_lemmas.append(actual_lemma)
            if not matching_actual_lemmas:
              if not saw_slot_in_inflargs:
                if "pasv" in slot:
                  stagemsg("WARNING: For headword forms %s, didn't see passive slot %s in inflections of lemma %s, probably need to delete passive forms of verb" %
                    (",".join(headword_forms), slot, lemma))
                else:
                  stagemsg("WARNING: For headword forms %s, didn't see slot %s in inflections of lemma %s" %
                    (",".join(headword_forms), slot, lemma))
            return matching_actual_lemmas

          # Does the template's lemma match some actual lemma exactly
          # (macrons included)?
          saw_matching_lemma = False
          for actual_lemmas, this_inflargs in inflargs_sets:
            if lemma in [blib.remove_links(x) for x in actual_lemmas]:
              saw_matching_lemma = True
              break
          if saw_matching_lemma:
            tag_set_matches = []
            tag_set_mismatches = []
            for tag_set in tag_sets:
              matching_lemmas = check_for_tag_set_match(tag_set, allow_lemma_mismatch=False)
              if matching_lemmas:
                tag_set_matches.append(tag_set)
              else:
                tag_set_mismatches.append(tag_set)
            if len(tag_set_matches) > 0:
              matched_infl_of_templates = True
              if len(tag_set_mismatches) > 0:
                stagemsg("WARNING: Matched tag sets %s but not %s, counting as a match: %s" %
                  (",".join("|".join(tag_set) for tag_set in tag_set_matches),
                   ",".join("|".join(tag_set) for tag_set in tag_set_mismatches),
                   unicode(t)))
            else:
              stagemsg("WARNING: Couldn't match any tag sets: %s" % unicode(t))
          else:
            stagemsg("WARNING: Couldn't match lemma %s among potential lemmas %s, trying without lemma matches: %s" %
              (lemma, ",".join(
                actual_lemma for actual_lemmas, this_inflargs in inflargs_sets
                for actual_lemma in actual_lemmas), unicode(t)))
            tag_set_matches = []
            tag_set_mismatches = []
            all_matching_lemmas = []
            for tag_set in tag_sets:
              matching_lemmas = check_for_tag_set_match(tag_set, allow_lemma_mismatch=True)
              if matching_lemmas:
                tag_set_matches.append(tag_set)
                for matching_lemma in matching_lemmas:
                  if matching_lemma not in all_matching_lemmas:
                    all_matching_lemmas.append(matching_lemma)
              else:
                tag_set_mismatches.append(tag_set)
            if len(tag_set_matches) > 0:
              matched_infl_of_templates = True
              if len(all_matching_lemmas) == 1:
                # Exactly one candidate lemma: rewrite the lemma param of the
                # {{inflection of}} template with it.
                notes.append("fix macrons in lemma of '%s' (stage 1): %s -> %s" %
                  (tname(t), lemma, all_matching_lemmas[0]))
                if len(tag_set_mismatches) > 0:
                  stagemsg("WARNING: Fixing macrons in lemma %s -> %s despite only some tag sets %s but not %s matching, counting as a match: %s" %
                    (lemma, all_matching_lemmas[0],
                     ",".join("|".join(tag_set) for tag_set in tag_set_matches),
                     ",".join("|".join(tag_set) for tag_set in tag_set_mismatches),
                     unicode(t)))
                else:
                  stagemsg("WARNING: Fixing macrons in lemma %s -> %s; all tag sets match: %s" %
                    (lemma, all_matching_lemmas[0], unicode(t)))
                origt = unicode(t)
                t.add(str(lemma_param), all_matching_lemmas[0])
                stagemsg("Replaced %s with %s" % (origt, unicode(t)))
              else:
                if len(tag_set_mismatches) > 0:
                  stagemsg("WARNING: Multiple possible lemmas %s match some tag sets %s but not %s, counting as a match but not updating lemma %s: %s" %
                    (",".join(all_matching_lemmas),
                     ",".join("|".join(tag_set) for tag_set in tag_set_matches),
                     ",".join("|".join(tag_set) for tag_set in tag_set_mismatches),
                     lemma, unicode(t)))
                else:
                  stagemsg("WARNING: Multiple possible lemmas %s match tag sets, with all tag sets matching, counting as a match but not updating lemma %s: %s" %
                    (",".join(all_matching_lemmas), lemma, unicode(t)))
            else:
              stagemsg("WARNING: Couldn't match any tag sets even when allowing macron mismatches with lemma %s: %s" %
                (lemma, unicode(t)))

        if matched_infl_of_templates:
          break

      elif stage == 2:
        # Intersect, over all tag sets of all templates, the actual forms
        # that equal the page title apart from macrons, using only actual
        # lemmas that equal the template's lemma exactly. common_forms stays
        # None until the first tag set with matching inflections is seen.
        common_forms = None
        no_common_forms = False
        for t, lemma_param, lemma, inflargs_sets, tag_sets in yield_infl_of_templates_and_properties(
        ):
          for tag_set in tag_sets:
            slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
            if slot is None or slot not in possible_slots:
              # Already issued warning
              no_common_forms = True
              break
            this_tag_set_matching_forms = []
            combined_this_inflargs = []
            for actual_lemmas, this_inflargs in inflargs_sets:
              for actual_lemma in actual_lemmas:
                actual_lemma = blib.remove_links(actual_lemma)
                if lemma == actual_lemma:
                  combined_this_inflargs.extend(this_inflargs)
                  break
            if not combined_this_inflargs:
              continue
            (all_valid_forms, all_valid_forms_with_syncopated,
             all_matchable_forms, all_matchable_forms_with_syncopated) = (
              merge_forms_for_slot(slot, combined_this_inflargs))
            for form in all_matchable_forms:
              if form not in this_tag_set_matching_forms:
                this_tag_set_matching_forms.append(form)
            if common_forms is None:
              common_forms = this_tag_set_matching_forms
              if len(common_forms) == 0:
                no_common_forms = True
                break
            else:
              new_common_forms = []
              for form in common_forms:
                if form in this_tag_set_matching_forms:
                  new_common_forms.append(form)
              common_forms = new_common_forms
              if len(common_forms) == 0:
                no_common_forms = True
                break
          if no_common_forms:
            break
        if no_common_forms or common_forms is None:
          stagemsg("WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets, not changing headword form(s) but trying again allowing macron differences in lemmas: %s" %
            (pagetitle, unicode(ht)))
        else:
          notes.append("fix macrons in forms of '%s' (stage 2): %s -> %s" %
            (tname(ht), ",".join(headword_forms), ",".join(common_forms)))
          oright = unicode(ht)
          # {{head}} keeps its forms in head=/head2=...; the la-*-form
          # templates keep the first form in 1= and the rest in head2=...
          if tname(ht) == "head":
            blib.set_param_chain(ht, common_forms, "head", "head")
          else:
            blib.set_param_chain(ht, common_forms, "1", "head")
          stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
          if len(common_forms) > 1:
            stagemsg("WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s" %
              ",".join(common_forms))
          else:
            assert len(common_forms) == 1
            # '%%s' leaves a literal %s in the note text passed on to
            # process_pronun_templates().
            clean_latin_long_vowels.process_pronun_templates(
              headword['pronun_section'], common_forms[0], stagemsg, notes,
              "fix macrons in pronun of '%%s' (stage 2): %s -> %s" %
              (",".join(headword_forms), ",".join(common_forms)))
          break

      else:
        assert stage == 3
        # For each usable template, collect the actual lemmas equal to its
        # lemma apart from macrons; every combination of one such lemma per
        # template is an "assignment".
        # NOTE(review): multiple_assignments is set below but not read
        # anywhere in this function.
        multiple_assignments = False
        infl_of_assignments = []
        for t, lemma_param, lemma, inflargs_sets, tag_sets in yield_infl_of_templates_and_properties(
        ):
          matching_lemmas = []
          for actual_lemmas, this_inflargs in inflargs_sets:
            for actual_lemma in actual_lemmas:
              actual_lemma = blib.remove_links(actual_lemma)
              if lalib.remove_macrons(lemma) == lalib.remove_macrons(actual_lemma):
                if actual_lemma not in matching_lemmas:
                  matching_lemmas.append(actual_lemma)
          if len(matching_lemmas) > 1:
            stagemsg("WARNING: Multiple actual lemmas %s match {{inflection of}} lemma %s, hence multiple assignments, doing things the hard way: %s" %
              (",".join(matching_lemmas), lemma, unicode(t)))
            multiple_assignments = True
          infl_of_assignments.append(matching_lemmas)

        cur_assignment = None
        cur_common_forms = None
        for assignment in itertools.product(*infl_of_assignments):
          # Same intersection as in stage 2, but using the lemma chosen by
          # this assignment for each template.
          common_forms = None
          no_common_forms = False
          for actual_lemma, (t, lemma_param, lemma, inflargs_sets, tag_sets) in zip(
              assignment, yield_infl_of_templates_and_properties()):
            for tag_set in tag_sets:
              slot = lalib.tag_set_to_slot(tag_set, tag_set_groups, stagemsg)
              if slot is None or slot not in possible_slots:
                # Already issued warning
                no_common_forms = True
                break
              this_tag_set_matching_forms = []
              combined_this_inflargs = []
              for actual_lemmas, this_inflargs in inflargs_sets:
                if actual_lemma in actual_lemmas:
                  combined_this_inflargs.extend(this_inflargs)
              (all_valid_forms, all_valid_forms_with_syncopated,
               all_matchable_forms, all_matchable_forms_with_syncopated) = (
                merge_forms_for_slot(slot, combined_this_inflargs))
              for form in all_matchable_forms:
                if form not in this_tag_set_matching_forms:
                  this_tag_set_matching_forms.append(form)
              if common_forms is None:
                common_forms = this_tag_set_matching_forms
                if len(common_forms) == 0:
                  no_common_forms = True
                  break
              else:
                new_common_forms = []
                for form in common_forms:
                  if form in this_tag_set_matching_forms:
                    new_common_forms.append(form)
                common_forms = new_common_forms
                if len(common_forms) == 0:
                  no_common_forms = True
                  break
            if no_common_forms:
              break
          if not no_common_forms and common_forms is not None:
            # NOTE(review): when a second assignment with common forms is
            # found, only a warning is issued; the first assignment stays in
            # cur_assignment and is still applied below despite the message
            # saying "not changing" -- confirm this is intended.
            if cur_assignment:
              stagemsg("WARNING: Multiple assignments of lemmas have common forms, at least %s -> %s and %s -> %s, not changing: %s" %
                (",".join(cur_assignment), ",".join(cur_common_forms),
                 ",".join(assignment), ",".join(common_forms), unicode(ht)))
            else:
              cur_assignment = assignment
              cur_common_forms = common_forms
        if cur_assignment is None:
          stagemsg("WARNING: No forms match pagetitle %s across all {{inflection of}} tags and tag sets when allowing macron differences in lemmas, not changing headword form(s): %s" %
            (pagetitle, unicode(ht)))
        else:
          # Update the lemma of each {{inflection of}} template, then the
          # headword form(s), then the pronunciation.
          for actual_lemma, (t, lemma_param, lemma, inflargs_sets, tag_sets) in zip(
              cur_assignment, yield_infl_of_templates_and_properties()):
            notes.append("fix macrons in lemma of '%s' (stage 3): %s -> %s" %
              (tname(t), lemma, actual_lemma))
            stagemsg("WARNING: found common forms %s, updating lemma %s to %s: %s" %
              (",".join(cur_common_forms), lemma, actual_lemma, unicode(t)))
            origt = unicode(t)
            t.add(str(lemma_param), actual_lemma)
            stagemsg("Replaced %s with %s" % (origt, unicode(t)))
          notes.append("fix macrons in forms of '%s' (stage 3): %s -> %s" %
            (tname(ht), ",".join(headword_forms), ",".join(cur_common_forms)))
          oright = unicode(ht)
          if tname(ht) == "head":
            blib.set_param_chain(ht, cur_common_forms, "head", "head")
          else:
            blib.set_param_chain(ht, cur_common_forms, "1", "head")
          stagemsg("Replaced %s with %s" % (oright, unicode(ht)))
          if len(cur_common_forms) > 1:
            stagemsg("WARNING: FIXME: No support yet for pronunciation for multiple headword forms %s" %
              ",".join(cur_common_forms))
          else:
            assert len(cur_common_forms) == 1
            clean_latin_long_vowels.process_pronun_templates(
              headword['pronun_section'], cur_common_forms[0], stagemsg, notes,
              "fix macrons in pronun of '%%s' (stage 3): %s -> %s" %
              (",".join(headword_forms), ",".join(cur_common_forms)))
          break

  # Reassemble the Latin section from the (possibly modified) parsed
  # subsections and splice it back into the page.
  secbody = "".join(unicode(x) for x in parsed_subsections)
  sections[j] = secbody + sectail
  return "".join(sections), notes
def process_page(page, index, parsed):
    """Modernize Bulgarian noun-form templates on one page.

    Three passes are made over the templates of ``parsed``:

    1. ``{{bg-noun-form}}`` is rewritten as ``{{head|bg|noun form}}``.
    2. ``{{bg-noun form of}}`` is rewritten as ``{{inflection of|bg|...}}``;
       then, for every inflection template whose slot is recognized, the
       lemma is replaced by the accented lemma returned by
       ``snarf_noun_accents_and_forms`` and the accented form(s) matching the
       page title are stored in ``head=`` of the preceding head template.
    3. Templates named in ``template_to_infl_codes`` are rewritten as
       ``{{inflection of}}`` with the listed inflection codes.

    Args:
        page: page object; only ``page.title()`` is used here.
        index: page index, used only in log messages.
        parsed: parsed wikicode of the page; modified in place.

    Returns:
        A ``(new_text, notes)`` tuple, where ``notes`` lists the changes made.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def has_unrecognized_param(t, allowed, origt):
        # Warn about (only) the first parameter outside `allowed`.
        for param in t.params:
            if pname(param) not in allowed:
                pagemsg("WARNING: Saw unrecognized param %s=%s: %s" % (pname(param), unicode(param.value), origt))
                return True
        return False

    notes = []

    pagemsg("Processing")

    # Pass 1: {{bg-noun-form}} -> {{head|bg|noun form}}.
    for t in parsed.filter_templates():
        if tname(t) != "bg-noun-form":
            continue
        origt = unicode(t)
        if has_unrecognized_param(t, ["1", "2", "3", "head"], origt):
            continue
        rmparam(t, "1")
        rmparam(t, "2")
        head = getparam(t, "head")
        rmparam(t, "head")
        g = getparam(t, "3")
        rmparam(t, "3")
        blib.set_template_name(t, "head")
        t.add("1", "bg")
        t.add("2", "noun form")
        if head:
            t.add("head", head)
        elif bglib.needs_accents(pagetitle):
            pagemsg("WARNING: Can't add head= to {{bg-noun-form}} missing it because pagetitle is multisyllabic: %s" % unicode(t))
        else:
            t.add("head", pagetitle)
        if g:
            t.add("g", g)
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("replace {{bg-noun-form}} with {{head|bg|noun form}}")

    # Pass 2: convert {{bg-noun form of}} and propagate accents to the head.
    headt = None
    saw_infl_after_head = False
    saw_headt = False
    saw_inflt = False
    for t in parsed.filter_templates():
        tn = tname(t)
        origt = unicode(t)
        saw_infl = False
        already_fetched_forms = False
        if tn == "head" and getparam(t, "1") == "bg" and getparam(t, "2") == "noun form":
            saw_headt = True
            if headt and not saw_infl_after_head:
                pagemsg("WARNING: Saw two head templates %s and %s without intervening inflection" % (
                    unicode(headt), origt))
            saw_infl_after_head = False
            headt = t
        if tn == "bg-noun form of":
            saw_inflt = True
            if not headt:
                pagemsg("WARNING: Saw {{bg-noun form of}} without head template: %s" % origt)
                continue
            if has_unrecognized_param(t, ["1", "2", "3", "noun"], origt):
                continue
            saw_infl_after_head = True
            noun = getparam(t, "noun")
            if not noun:
                pagemsg("WARNING: Didn't see noun=: %s" % origt)
                continue
            infls = []
            param2 = getparam(t, "2")
            if param2 == "indefinite":
                infls.append("indef")
            elif param2 == "definite":
                infls.append("def")
            elif param2 == "vocative":
                infls.append("voc")
            elif param2:
                pagemsg("WARNING: Saw unrecognized 2=%s: %s" % (param2, origt))
                continue
            param3 = getparam(t, "3")
            if param3 == "subject":
                infls.append("sbjv")
            elif param3 == "object":
                infls.append("objv")
            elif param3:
                pagemsg("WARNING: Saw unrecognized 3=%s: %s" % (param3, origt))
                continue
            param1 = getparam(t, "1")
            if param1 == "singular":
                infls.append("s")
            elif param1 == "plural":
                infls.append("p")
            elif param1 == "count":
                infls.extend(["count", "form"])
            elif param1 == "vocative":
                infls.extend(["voc", "s"])
            else:
                pagemsg("WARNING: Saw unrecognized 1=%s: %s" % (param1, origt))
                continue
            blib.set_template_name(t, "inflection of")
            del t.params[:]
            t.add("1", "bg")
            lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
            if not lemma:
                pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
                t.add("2", noun)
            else:
                t.add("2", lemma)
            t.add("3", "")
            for i, infl in enumerate(infls):
                t.add(str(i + 4), infl)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("convert {{bg-noun form of}} to {{inflection of}}")
            tn = tname(t)
            # The slot has to land in `saw_infl`: that is the variable the
            # head-update step below tests, and `already_fetched_forms`
            # tells that step the lemma/forms lookup was done here.
            saw_infl = infls_to_slot(infls)
            already_fetched_forms = True
            if not saw_infl:
                pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
            elif not lemma:
                # Lookup failed (already warned above): there are no forms
                # from which to derive an accented head.
                saw_infl = False
        elif tn == "inflection of" and getparam(t, "1") == "bg":
            saw_inflt = True
            infls = []
            i = 4
            while True:
                infl = getparam(t, str(i))
                if not infl:
                    break
                infls.append(infl)
                i += 1
            saw_infl = infls_to_slot(infls)
            if not saw_infl:
                if "vnoun" in infls:
                    pagemsg("Skipping verbal noun inflection %s: %s" % ("|".join(infls), origt))
                elif "part" in infls:
                    pagemsg("Skipping participle inflection %s: %s" % ("|".join(infls), origt))
                else:
                    pagemsg("WARNING: Unrecognized inflections %s: %s" % ("|".join(infls), origt))
        elif tn == "definite singular of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "def_sg"
        elif tn == "indefinite plural of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "ind_pl"
        elif tn == "definite plural of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "def_pl"
        elif tn == "vocative singular of" and getparam(t, "1") == "bg":
            saw_inflt = True
            saw_infl = "voc_sg"

        if not saw_infl:
            continue

        if not already_fetched_forms:
            noun = getparam(t, "2")
            lemma, forms = snarf_noun_accents_and_forms(noun, pagemsg)
            if not lemma:
                pagemsg("WARNING: Unable to find accented equivalent of %s: %s" % (noun, origt))
                continue
            t.add("2", lemma)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("replace lemma with accented %s in {{%s}}" % (lemma, tn))
        if saw_infl == "def_sg":
            # "def_sg" is only usable when subject and object forms agree.
            def_sub_sg = forms.get("def_sub_sg", None)
            def_obj_sg = forms.get("def_obj_sg", None)
            if def_sub_sg != def_obj_sg:
                pagemsg("WARNING: Inflection is def_sg but def_sub_sg %s != def_obj_sg %s" % (
                    def_sub_sg, def_obj_sg))
                continue
            form = def_sub_sg
        else:
            form = forms.get(saw_infl, None)
        if not form:
            pagemsg("WARNING: Inflection is %s but couldn't find form among forms: %s" %
                (saw_infl, format_forms(forms)))
            continue
        form = form.split(",")
        filtered_form = [f for f in form if bglib.remove_accents(f) == pagetitle]
        if not filtered_form:
            pagemsg("WARNING: No forms among %s=%s match page title" % (saw_infl, ",".join(form)))
            continue
        form = filtered_form
        if not headt:
            # Without a preceding head template there is nothing to update.
            pagemsg("WARNING: Saw inflection template without head template: %s" % origt)
            continue
        existing_form = blib.fetch_param_chain(headt, "head", "head")
        if existing_form:
            mismatched = False
            for f in existing_form:
                if bglib.remove_accents(f) != pagetitle:
                    pagemsg("WARNING: Existing head %s doesn't match page title: %s" % (
                        f, unicode(headt)))
                    mismatched = True
                    break
            if mismatched:
                continue
            needs_accents = [bglib.needs_accents(f) for f in existing_form]
            if any(needs_accents) and not all(needs_accents):
                pagemsg("WARNING: Some but not all existing heads missing accents: %s" %
                    unicode(headt))
                continue
            if not any(needs_accents):
                # Heads are already accented: report disagreement, never
                # overwrite.
                if existing_form != form:
                    pagemsg("WARNING: For inflection %s, existing form(s) %s != new form(s) %s" % (
                        saw_infl, ",".join(existing_form), ",".join(form)))
                continue
        origheadt = unicode(headt)
        blib.set_param_chain(headt, form, "head", "head")
        pagemsg("Replaced %s with %s" % (origheadt, unicode(headt)))
        notes.append("add accented form %s=%s to {{head|bg|noun form}}" % (saw_infl, ",".join(form)))
    if saw_headt and not saw_inflt:
        pagemsg("WARNING: Saw head template %s but no inflection template" % unicode(headt))

    # Pass 3: specific "... of" templates -> {{inflection of}}.
    for t in parsed.filter_templates():
        origt = unicode(t)
        tn = tname(t)
        if tn in template_to_infl_codes and getparam(t, "1") == "bg":
            if has_unrecognized_param(t, ["1", "2"], origt):
                continue
            infl_codes = template_to_infl_codes[tn]
            blib.set_template_name(t, "inflection of")
            t.add("3", "")
            for i, infl in enumerate(infl_codes):
                t.add(str(i + 4), infl)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            notes.append("convert {{%s}} to {{inflection of}}" % tn)

    return unicode(parsed), notes
def process_text_on_page(index, pagetitle, text):
    """Convert ``{{es-verb}}`` headword templates to the new parameter format.

    Explicit present/preterite/participle forms that equal the defaults from
    ``get_def_forms`` are replaced by ``+`` (or a vowel-alternation code such
    as ``+ie``) and dropped entirely when only the default remains. With
    ``args.add_attn`` set, templates lacking ``1=`` just get ``attn=1``.

    Change notes are recorded only for templates whose text actually changed,
    so skipped or untouched templates do not leak into the edit summary.

    Args:
        index: page index, used only in log messages.
        pagetitle: title of the page; used as lemma when ``head=`` is absent.
        text: wikitext of the page.

    Returns:
        ``(new_text, notes)``, or ``None`` when the page is skipped (no
        ``es-verb`` in the text, or a ``:`` in the title).
    """
    global args

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def has_override(forms):
        # 1 if any form is an explicit (non-"+...") value, else 0.
        return 1 if any(x and not x.startswith("+") for x in forms) else 0

    notes = []

    if "es-verb" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    parsed = blib.parse_text(text)

    for t in parsed.filter_templates():
        def getp(param):
            return getparam(t, param)

        tn = tname(t)
        if tn != "es-verb":
            continue

        if args.add_attn and not getp("1"):
            origt = unicode(t)
            # Only the first parameter (if any) is reported.
            for param in t.params:
                pn = pname(param)
                pv = unicode(param.value)
                pagemsg("WARNING: No 1= but saw param %s=%s: %s" % (pn, pv, unicode(t)))
                break
            t.add("attn", "1")
            if origt != unicode(t):
                notes.append("add attn=1 to verb with missing 1=")
                pagemsg("Replaced %s with %s" % (origt, unicode(t)))
            else:
                pagemsg("No changes to %s" % unicode(t))
            continue

        origt = unicode(t)
        lemma = getparam(t, "head") or pagetitle
        if " " in lemma:
            pagemsg("WARNING: Space in lemma")
        prep = getp("prep")
        shouldlemma = getp("1") + getp("2") + (
            "se" if getp("ref") == "y" else "") + (" " + blib.remove_links(prep) if prep else "")
        if shouldlemma != blib.remove_links(lemma):
            pagemsg("WARNING: lemma=%s from 1/2/ref != lemma=%s from head or pagetitle: %s" %
                (shouldlemma, blib.remove_links(lemma), unicode(t)))
            continue
        d = get_def_forms(lemma, prep, pagemsg)
        if not d:
            continue
        if getp("part2") and not getp("part"):
            pagemsg("WARNING: Saw part2= without part=: %s" % unicode(t))
            part = [d["part"], getp("part2")]
        else:
            part = blib.fetch_param_chain(t, "part")
        pres = blib.fetch_param_chain(t, "pres")
        pret = blib.fetch_param_chain(t, "pret")
        part = ["+" if x == d["part"] else x for x in part]
        pret = ["+" if x == d["pret"] else x for x in pret]
        pres = [
            "+" if x == d["pres"] else
            "+ie" if x == d["pres_ie"] else
            "+ue" if x == d["pres_ue"] else
            "+i" if x == d["pres_i"] else
            u"+í" if x == d["pres_iacc"] else
            u"+ú" if x == d["pres_uacc"] else
            x for x in pres
        ]

        # Notes for this template are held back until we know the template
        # was really rewritten.
        template_notes = ["convert {{es-verb}} to new format"]
        if pres == ["+"]:
            template_notes.append("remove redundant present from {{es-verb}}")
            pres = []
        if pret == ["+"]:
            template_notes.append("remove redundant preterite from {{es-verb}}")
            pret = []
        if part == ["+"]:
            template_notes.append("remove redundant participle from {{es-verb}}")
            part = []
        for vowel_var in ["+ie", "+ue", "+i", u"+í", u"+ú"]:
            if vowel_var in pres:
                template_notes.append("replace vowel-varying present with '%s' in {{es-verb}}" % vowel_var)
        if "+" in part:
            template_notes.append("replace default participle with '+' in {{es-verb}}")

        head = getp("head")

        must_continue = False
        for param in t.params:
            pn = pname(param)
            pv = unicode(param.value)
            if pn == "1" and pv in ["m", "mf"]:
                pagemsg("WARNING: Extraneous param %s=%s in %s, ignoring" % (pn, pv, unicode(t)))
                continue
            if pn not in ["head", "1", "2", "ref", "pres", "pret", "part", "part2", "prep"]:
                pagemsg("WARNING: Saw unrecognized param %s=%s in %s" % (pn, pv, unicode(t)))
                must_continue = True
                break
        if must_continue:
            continue

        del t.params[:]

        num_overrides = has_override(pres) + has_override(pret) + has_override(part)
        if d["post"] or (d["refl"] or d["clitic"]) and num_overrides >= 2:
            # Use the angle-bracket spec in 1= instead of separate params.
            main_verb = d["full_verb"]
            if part:
                angle_brackets = "<%s,%s,%s>" % (":".join(pres), ":".join(pret), ":".join(part))
            elif pret:
                angle_brackets = "<%s,%s>" % (":".join(pres), ":".join(pret))
            elif pres:
                angle_brackets = "<%s>" % (":".join(pres))
            else:
                angle_brackets = "<>"
            if angle_brackets == "<>":
                if head:
                    t.add("head", head)
            else:
                arg1 = "%s%s%s" % (main_verb, angle_brackets, d["post"] or "")
                t.add("1", arg1)
        else:
            if head:
                t.add("head", head)
            pres = [make_verb_form_full(x, d["clitic"], d["refl"], "", is_part=False, do_link=True)
                    for x in pres]
            pret = [make_verb_form_full(x, d["clitic"], d["refl"], "", is_part=False, do_link=True)
                    for x in pret]
            part = [make_verb_form_full(x, d["clitic"], d["refl"], "", is_part=True, do_link=True)
                    for x in part]
            blib.set_param_chain(t, pres, "pres")
            blib.set_param_chain(t, pret, "pret")
            blib.set_param_chain(t, part, "part")

        if origt != unicode(t):
            notes.extend(template_notes)
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        else:
            pagemsg("No changes to %s" % unicode(t))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Replace explicit Russian adjective comparatives with an auto spec.

    When a page has exactly one ``{{ru-adj}}`` with a comparative in ``2=``
    and exactly one ``{{ru-decl-adj}}``, a comparative spec (``+``, ``+a``,
    ``+b`` ...) is derived from the short-form accent pattern of the
    declension template. The spec is expanded through
    ``Module:ru-headword`` and, if the generated comparatives equal the
    explicit ones, the explicit parameters are replaced by the spec.

    Args:
        page: page object; its title and text are read.
        index: page index, used only in log messages.
        parsed: ignored; the page is re-parsed with ``blib.parse(page)``.

    Returns:
        ``(new_text, notes)``, or ``None`` on the paths that bail out early.
    """
    global args
    verbose = args.verbose
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    pagemsg("Processing")

    # NOTE(review): `text` is never used afterwards; kept because reading
    # page.text may have effects not visible from here -- confirm and drop.
    text = unicode(page.text)
    parsed = blib.parse(page)
    notes = []

    hascomp = False
    headword_templates = []
    decl_templates = []
    for t in parsed.filter_templates():
        if unicode(t.name) == "ru-adj":
            headword_templates.append(t)
            if getparam(t, "2"):
                hascomp = True
            elif getparam(t, "comp2") or getparam(t, "comp3") or getparam(
                    t, "comp4") or getparam(t, "comp5"):
                pagemsg("WARNING: Found compN= but no 2=: %s" % unicode(t))
        if unicode(t.name) == "ru-decl-adj":
            decl_templates.append(t)

    if hascomp:
        if len(headword_templates) > 1 or len(decl_templates) > 1:
            pagemsg("WARNING: Found comparative and multiple headword or decl templates, can't proceed")
        elif len(decl_templates) == 1 and not headword_templates:
            pagemsg("WARNING: Strange, decl template but no headword template: %s" %
                unicode(decl_templates[0]))
        elif len(headword_templates) == 1 and not decl_templates:
            pagemsg("WARNING: Strange, headword template but no decl template: %s" %
                unicode(headword_templates[0]))
        elif pagetitle.endswith(u"ся"):
            pagemsg("WARNING: Comparative with reflexive adjective, not sure what to do: %s" %
                unicode(headword_templates[0]))
        else:
            headt = headword_templates[0]
            declt = decl_templates[0]
            head = getparam(declt, "1")
            decl = getparam(declt, "2")
            if decl == "-" or decl == "?" or not decl:
                pagemsg("WARNING: Found comparative with no short decl '%s': %s" %
                    (decl, getparam(headt, "2")))
                compspec = "+"
            else:
                # Canonicalize: drop "*" and "(1)"/"(2)", cut everything
                # after ":" in each comma-separated alternative, dedupe.
                decl = re.sub(r"\*", "", decl)
                decl = re.sub(r"\([12]\)", "", decl)
                decl = set(re.sub(":.*", "", x) for x in re.split(",", decl))
                if len(decl) > 1:
                    # Both values must be passed as one tuple to "%".
                    pagemsg("WARNING: Found multiple short declensions, not sure what to do: %s (reduced to %s)" %
                        (getparam(declt, "2"), ",".join(decl)))
                    return
                decl = list(decl)[0]
                if not re.search("^[abc]'*$", decl):
                    pagemsg("WARNING: Strange canonicalized decl %s (orig %s), don't know what to do" %
                        (decl, getparam(declt, "2")))
                    return
                if (decl == "a" and not pagetitle.endswith(u"ой") or
                        decl == "b" and pagetitle.endswith(u"ой")):
                    compspec = "+"
                else:
                    compspec = "+" + decl
            comparatives = expand_text(
                "{{#invoke:ru-headword|generate_comparative|%s|%s}}" % (head, compspec))
            if not comparatives:
                # Already output warning
                return
            comparatives = [re.sub("//.*", "", x) for x in re.split(",", comparatives)]
            unique_comparatives = []
            for comp in comparatives:
                if comp not in unique_comparatives:
                    unique_comparatives.append(comp)
            origt = unicode(headt)
            # Collect explicit comparatives from 2=, comp2=, comp3=, ...
            existing_comparatives = []
            compparams = []
            i = 0
            while True:
                compparam = "2" if i == 0 else "comp" + str(i + 1)
                existing_comp = getparam(headt, compparam)
                if not existing_comp:
                    break
                existing_comparatives.append(existing_comp)
                compparams.append(compparam)
                i += 1
            if "peri" in existing_comparatives:
                if len(existing_comparatives) > 1:
                    pagemsg("WARNING: 'peri' along with other explicit comparatives, not sure what to do: %s" %
                        ",".join(existing_comparatives))
            elif any(x.startswith("+") for x in existing_comparatives):
                if len(existing_comparatives) > 1:
                    pagemsg("WARNING: auto-comparative along with other explicit comparatives, not sure what to do: %s" %
                        ",".join(existing_comparatives))
            elif existing_comparatives != unique_comparatives:
                pagemsg("WARNING: Explicit comparative(s) %s not same as auto-generated %s" %
                    (",".join(existing_comparatives), ",".join(unique_comparatives)))
            else:
                # Superlatives are removed and re-added after 2= is set.
                superlatives = blib.fetch_param_chain(headt, "3", "sup")
                blib.remove_param_chain(headt, "3", "sup")
                for compparam in compparams:
                    rmparam(headt, compparam)
                headt.add("2", compspec)
                blib.set_param_chain(headt, superlatives, "3", "sup")
                pagemsg("Replaced %s with %s" % (origt, unicode(headt)))
                notes.append("replaced explicit comparative %s with %s" %
                    (",".join(existing_comparatives), compspec))

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Switch ``{{LANG-noun}}``/``{{LANG-proper noun}}`` to decl-based params.

    ``LANG`` is ``args.lang``. Each headword template is paired with the
    next ``{{LANG-ndecl}}`` template. The declension is expanded through
    ``User:Benwing2/LANG-generate-prod-noun-props``; if the generated gender
    and forms agree with those given in the headword, the headword's
    ``1=`` chain is replaced by the declension spec and its gender, genitive,
    plural and genitive-plural chains are removed.

    Returns ``(new_text, notes)``; returns ``None`` when processing is
    aborted (headword/declension out of sequence, failed expansion, or a mix
    of plural-only and other genders).
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def agrees(old, new, stuff, nocanon=False):
        # True when the headword has no value, or its values equal the
        # generated ones (as sets); otherwise warn and return False.
        if not old:
            return True
        if not nocanon:
            if args.lang == "uk":
                strip_mono = uk.remove_monosyllabic_stress
            else:
                strip_mono = be.remove_monosyllabic_accents
            old = [strip_mono(blib.remove_links(item)) for item in old]
            new = [strip_mono(item) for item in new]
        if set(old) == set(new):
            return True
        pagemsg("WARNING: Old %ss %s disagree with new %ss %s: head=%s, decl=%s" %
            (stuff, ",".join(old), stuff, ",".join(new), unicode(headt), unicode(t)))
        return False

    def generated(props, slot):
        # Generated forms for `slot`, or ["-"] when the slot is absent.
        return props.get(slot, "-").split(",")

    notes = []

    pagemsg("Processing")

    # State carried from the most recent headword template.
    heads = None
    headt = None
    headtn = None
    gender_and_animacy = None
    genitives = None
    plurals = None

    for t in parsed.filter_templates():
        tn = tname(t)

        if tn in [args.lang + "-noun", args.lang + "-proper noun"]:
            if heads:
                pagemsg("WARNING: Encountered headword twice without declension: %s" % unicode(t))
                return
            headt, headtn = t, tn
            heads = blib.fetch_param_chain(t, "1", "head")
            gender_and_animacy = blib.fetch_param_chain(t, "2", "g")
            genitives = blib.fetch_param_chain(t, "3", "gen")
            plurals = blib.fetch_param_chain(t, "4", "pl")
            genitive_plurals = blib.fetch_param_chain(t, "5", "genpl")

        if tn == args.lang + "-ndecl":
            if not heads:
                pagemsg("WARNING: Encountered decl without headword: %s" % unicode(t))
                return
            generate_template = re.sub(
                r"^\{\{%s-ndecl\|" % args.lang,
                "{{User:Benwing2/%s-generate-prod-noun-props|" % args.lang,
                unicode(t))
            result = expand_text(generate_template)
            if not result:
                return
            new_forms = blib.split_generate_args(result)
            new_g = new_forms["g"].split(",")

            if not agrees(gender_and_animacy, new_g, "gender", nocanon=True):
                heads = None
                continue

            plural_flags = [gender.endswith("-p") for gender in new_g]
            if any(plural_flags) and not all(plural_flags):
                pagemsg("WARNING: Mixture of plural-only and non-plural-only genders, can't process: %s" %
                    unicode(t))
                return

            if any(plural_flags):
                consistent = (
                    agrees(heads, generated(new_forms, "nom_p"), "nom pl") and
                    agrees(genitives, generated(new_forms, "gen_p"), "gen pl"))
            else:
                # 'uk/be-proper noun' headwords don't have nominative plural set
                consistent = (
                    agrees(heads, generated(new_forms, "nom_s"), "nom sg") and
                    agrees(genitives, generated(new_forms, "gen_s"), "gen sg") and
                    (headtn != args.lang + "-noun" or
                        agrees(plurals, generated(new_forms, "nom_p"), "nom pl")) and
                    (headtn != args.lang + "-noun" or
                        agrees(genitive_plurals, generated(new_forms, "gen_p"), "gen pl")))
            if not consistent:
                heads = None
                continue

            decl = getparam(t, "1")
            blib.set_param_chain(headt, [decl], "1", "head")
            for first, rest in [("2", "g"), ("3", "gen"), ("4", "pl"), ("5", "genpl")]:
                blib.remove_param_chain(headt, first, rest)
            notes.append("convert {{%s}} to new style using decl %s" % (unicode(headt.name), decl))
            heads = None

    return unicode(parsed), notes
def process_page(page, index, parsed):
    """Convert old-style ``{{la-noun}}``/``{{la-proper noun}}`` parameters.

    For each old-style headword (one that has ``head2``, ``2``, ``3``, ``4``
    or ``decl``), a declension spec such as ``lemma<3>`` is built from the
    lemma, genitive and declension name. The spec is run through
    ``convert_la_headword_noun.new_generate_noun_forms`` and the generated
    genitive is checked against the headword's. On success the template is
    rebuilt as ``1=<spec>`` plus ``g=`` when the headword gender is not
    exactly the generated one, followed by the remaining original params.

    Returns a ``(new_text, notes)`` tuple.
    """
    global args
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def errandpagemsg(txt):
        errandmsg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, args.verbose)

    def third_decl_spec(lemma, genitive, described):
        # Spec for a third-declension noun, or None (after warning) if the
        # lemma/genitive combination is not understood.
        if len(genitive) == 0:
            pagemsg("WARNING: No genitives with decl 3 lemma %s, skipping: %s" %
                (lemma, described))
            return None
        if len(genitive) > 1:
            pagemsg("WARNING: Multiple genitives %s with decl 3 lemma %s, skipping: %s" %
                (",".join(genitive), lemma, described))
            return None
        gen1 = genitive[0]
        if gen1.endswith("is"):
            stem = gen1[:-2]
            if lalib.infer_3rd_decl_stem(lemma) == stem:
                return "%s<3>" % lemma
            return "%s/%s<3>" % (lemma, stem)
        if gen1.endswith("ium"):
            if lemma.endswith("ia"):
                return "%s<3.pl>" % lemma
            if lemma.endswith(u"ēs"):
                return "%s<3.I.pl>" % lemma
            pagemsg("WARNING: Unrecognized lemma %s with decl 3 genitive -ium, skipping: %s" %
                (lemma, described))
            return None
        if gen1.endswith("um"):
            if lemma.endswith(("a", u"ēs")):
                return "%s<3.pl>" % lemma
            pagemsg("WARNING: Unrecognized lemma %s with decl 3 genitive -um, skipping: %s" %
                (lemma, described))
            return None
        pagemsg("WARNING: Unrecognized genitive %s with decl 3 lemma %s, skipping: %s" %
            (gen1, lemma, described))
        return None

    # Declension name as written in the headword -> declension code.
    decl_codes = {
        "first": "1",
        "second": "2",
        "third": "3",
        "fourth": "4",
        "fifth": "5",
        "irregular": "irreg",
    }

    pagemsg("Processing")

    notes = []

    for t in parsed.filter_templates():
        if tname(t) not in ["la-noun", "la-proper noun"]:
            continue

        origt = unicode(t)
        # Description used in every log message about this template.
        described = "headword template <from> %s <to> %s <end>" % (origt, origt)

        if getparam(t, "indecl"):
            pagemsg("Skipping indeclinable noun: %s" % described)
            continue

        if not any(getparam(t, key) for key in ["head2", "2", "3", "4", "decl"]):
            pagemsg("Skipping new-style template: %s" % described)
            continue

        lemmas = blib.fetch_param_chain(t, ["1", "head", "head1"], "head") or [pagetitle]
        genitive = blib.fetch_param_chain(t, ["2", "gen", "gen1"], "gen")
        noun_gender = blib.fetch_param_chain(t, ["3", "g", "g1"], "g")
        decl_names = blib.fetch_param_chain(t, ["4", "decl", "decl1"], "decl")

        if " " in lemmas[0]:
            pagemsg("WARNING: Space in lemma %s, skipping: %s" % (lemmas[0], described))
            continue
        if len(lemmas) > 1:
            pagemsg("WARNING: Multiple lemmas %s, skipping: %s" % (",".join(lemmas), described))
            continue
        lemma = lemmas[0]

        if len(decl_names) == 0:
            pagemsg("WARNING: No declension, skipping: %s" % described)
            continue
        if len(decl_names) > 1:
            pagemsg("WARNING: Multiple decls %s, skipping: %s" % (",".join(decl_names), described))
            continue
        decl_name = decl_names[0]
        if decl_name not in decl_codes:
            pagemsg("WARNING: Unrecognized declension %s, skipping: %s" % (decl_name, described))
            continue
        decl_type = decl_codes[decl_name]

        if decl_type in ["1", "2", "4", "5"]:
            param1 = "%s<%s>" % (lemma, decl_type)
        elif decl_type == "3":
            param1 = third_decl_spec(lemma, genitive, described)
            if param1 is None:
                continue
        elif decl_type == "irreg":
            pagemsg("WARNING: Can't handle irregular nouns, skipping: %s" % described)
            continue
        else:
            pagemsg("WARNING: Something wrong, unrecognized decl_type %s, skipping: %s" %
                (decl_type, described))
            continue

        la_ndecl = "{{la-ndecl|%s}}" % param1
        noun_props = convert_la_headword_noun.new_generate_noun_forms(
            la_ndecl, errandpagemsg, expand_text, include_props=True)
        if noun_props is None:
            continue
        decl_gender = noun_props.get("g", None)

        genitive_ok = convert_la_headword_noun.compare_headword_decl_forms(
            "genitive", genitive, ["gen_sg", "gen_pl"], noun_props,
            described, pagemsg, adjust_for_missing_gen_forms=True,
            adjust_for_e_ae_gen=True, remove_headword_links=True)
        if not genitive_ok:
            continue

        # An explicit g= is needed unless the single headword gender is
        # exactly the one produced by the declension.
        need_explicit_gender = not (
            len(noun_gender) == 1 and noun_gender[0] == decl_gender)
        if need_explicit_gender:
            if len(noun_gender) > 1:
                pagemsg("WARNING: Saw multiple headword genders %s, please verify: %s" %
                    (",".join(noun_gender), described))
            elif (noun_gender and
                    noun_gender[0].startswith("n") != (decl_gender == "n")):
                pagemsg("WARNING: Headword gender %s is neuter and decl gender %s isn't, or vice-versa, need to correct, skipping: %s" %
                    (noun_gender[0], decl_gender, described))
                continue

        # Fetch remaining params from headword template
        leftover_params = []
        for param in t.params:
            raw_name = unicode(param.name)
            key = raw_name.strip()
            is_converted = (key in ["1", "2", "3", "4"] or
                re.search("^(head|gen|g|decl)[0-9]*$", key))
            if not is_converted:
                leftover_params.append((raw_name, param.value, param.showkey))

        # Erase all params, then rebuild: spec, optional gender, leftovers
        del t.params[:]
        t.add("1", param1)
        if need_explicit_gender:
            # Only the first character of each gender is kept, deduplicated
            # in order of appearance.
            explicit_genders = []
            for gender in noun_gender:
                initial = gender[0]
                if initial not in explicit_genders:
                    explicit_genders.append(initial)
            blib.set_param_chain(t, explicit_genders, "g", "g")
        for name, value, showkey in leftover_params:
            t.add(name, value, showkey=showkey, preserve_spacing=False)

        pagemsg("Replaced %s with %s" % (origt, unicode(t)))
        notes.append("convert {{la-noun}}/{{la-proper noun}} params to new style")

    return unicode(parsed), notes
def process_noun_headt(t, declt=None):
    """Rewrite a Belarusian noun headword template to the new param layout.

    ``t`` is either a generic ``{{head}}`` template (renamed to
    ``be-<pos>``) or a noun headword template using the older named
    parameters. The template is emptied and rebuilt with heads in the
    ``1``/``head`` chain, cleaned genders in ``2``/``g``, genitives in
    ``3``/``gen``, plurals in ``4``/``pl`` and the ``m``, ``f`` and
    ``collective`` chains. When ``declt`` is given, forms missing from the
    headword are taken from it and disagreements are logged.

    Relies on ``pagemsg``, ``pagetitle`` and ``notes`` from the enclosing
    scope.

    Args:
        t: headword template; modified in place.
        declt: optional declension template used to fill in/check forms.

    Returns:
        False if an unrecognized parameter caused the template to be
        skipped, True otherwise.
    """
    origt = unicode(t)
    origdeclt = unicode(declt) if declt else "None"

    def getp(param):
        return getparam(t, param)

    def has_unrecognized_param(allowed):
        # Warn about (only) the first parameter outside `allowed`.
        for param in t.params:
            pn = pname(param)
            if pn not in allowed:
                pagemsg("WARNING: Unrecognized param %s=%s, skipping: %s" % (pn, unicode(param.value), origt))
                return True
        return False

    if tname(t) == "head":
        pos = getp("2")
        head = getp("head")
        headtr = getp("tr")
        g = getp("g")
        g2 = getp("g2")
        g3 = getp("g3")
        anim = ""
        decl = ""
        gen = ""
        gentr = ""
        pl = ""
        pltr = ""
        f = ""
        ftr = ""
        m = ""
        mtr = ""
        collective = ""
        collectivetr = ""
        if has_unrecognized_param(["1", "2", "head", "tr", "g", "g2", "g3",
                # extra params to ignore
                "sc"]):
            return False
    else:
        pos = getp("pos")
        head = getp("1") or getp("head") or getp("sg")
        headtr = getp("tr")
        g = getp("2") or getp("g")
        g2 = getp("g2")
        g3 = getp("g3")
        anim = getp("a")
        decl = getp("decl")
        gen = getp("gen") or getp("3")
        gentr = getp("gentr")
        pl = getp("pl") or getp("4")
        pltr = getp("pltr")
        f = getp("f")
        ftr = getp("ftr")
        m = getp("m")
        mtr = getp("mtr")
        collective = getp("collective")
        collectivetr = getp("collectivetr")
        if has_unrecognized_param(["pos", "1", "head", "sg", "tr", "2", "g", "g2", "g3", "a",
                "decl", "gen", "gentr", "3", "pl", "pltr", "4", "f", "ftr", "m", "mtr",
                "collective", "collectivetr",
                # extra params to ignore
                "sc"]):
            return False

    def clean_gender(g):
        # Normalize a gender spec to "<gender>-<animacy>[-p]", combining it
        # with a= (animacy); unknown parts become "?".
        gparts = g.split("-")
        realg = "?"
        realan = "?"
        realpl = ""
        for part in gparts:
            if part in ["m", "f", "n"]:
                realg = part
            elif part in ["an", "in"]:
                realan = part
            elif part == "p":
                realpl = part
            elif part != "?":
                pagemsg("WARNING: Encountered unrecognized gender part '%s' in gender '%s': %s" % (
                    part, g, origt))
        an = anim
        if an in ["a", "an"]:
            an = "an"
        elif an in ["i", "in"]:
            an = "in"
        elif an:
            pagemsg("WARNING: Unrecognized animacy a=%s: %s" % (an, origt))
            an = "?"
        if realan != "?" and an and an != "?" and an != realan:
            pagemsg("WARNING: Animacy mismatch, anim %s in gender spec %s but a=%s: %s" % (
                realan, g, anim, origt))
        if realan == "?" and an:
            realan = an
        pl = ""
        if realpl:
            pl = "-%s" % realpl
        if realg == "?":
            pagemsg("WARNING: Unknown gender in gender spec %s: %s" % (g, origt))
        if realan == "?":
            pagemsg("WARNING: Unknown animacy in gender spec %s and a=%s: %s" % (g, anim, origt))
        if realg == "?" and realan == "?":
            return "?%s" % pl
        else:
            return "%s-%s%s" % (realg, realan, pl)

    if not g and not g2 and not g3:
        pagemsg("WARNING: No gender specified: %s" % origt)
        g = "?"
    genders = [clean_gender(spec) for spec in (g, g2, g3) if spec]

    if not head:
        head = pagetitle

    if decl and decl not in ["off", "no", "indeclinable"]:
        pagemsg("WARNING: Unrecognized value for decl=%s: %s" % (decl, origt))
        decl = ""
    if decl:
        # Indeclinable: the genitive is written as "-".
        if gen and gen != "-":
            pagemsg("WARNING: Indeclinable but gen=%s specified: %s" % (gen, origt))
        else:
            gen = "-"

    del t.params[:]
    if tname(t) == "head":
        blib.set_template_name(t, "be-" + pos)
    elif pos:
        t.add("pos", pos)

    def split_form(form):
        # Split a comma-separated value into forms, unlinking whole-form
        # links, adding the accent via belib.add_accent_to_o, warning about
        # remaining links/missing accents and dropping "-".
        forms = re.split(r",\s*", form.strip())
        forms = [re.sub(r"^\[\[([^\[\]]*)\]\]$", r"\1", f) for f in forms]
        forms = [belib.add_accent_to_o(f) for f in forms]
        for f in forms:
            if "[[" in f:
                pagemsg("WARNING: Link in form %s: headword=%s, decl=%s" % (f, origt, origdeclt))
            if belib.needs_accents(f):
                pagemsg("WARNING: Form %s missing accents: headword=%s, decl=%s" % (f, origt, origdeclt))
        forms = [f for f in forms if f != "-"]
        return forms

    def handle_multiform(firstparam, restparam, form, formtr, declparam=None):
        # Write `form` to the firstparam/restparam chain, falling back to
        # (or checking against) parameter `declparam` of the declension
        # template; a declparam of "-" stands for the literal form "-".
        if form:
            form = split_form(form)
        if declparam:
            if declparam == "-":
                declforms = ["-"]
            else:
                declforms = split_form(getparam(declt, declparam))
            if not form:
                form = declforms
            elif set(form) != set(declforms):
                pagemsg("WARNING: For %s=, headword form(s) %s disagree with decl form(s) %s: headword=%s, decl=%s" %
                    (restparam, ",".join(form), ",".join(declforms), origt, origdeclt))
        if form:
            blib.set_param_chain(t, form, firstparam, restparam)
        if formtr:
            trparam = ("" if restparam == "head" else restparam) + "tr"
            if not form:
                # Report the actual translit parameter name.
                pagemsg("WARNING: Saw %s=%s but no %s=: %s" % (trparam, formtr, restparam, origt))
            elif len(form) > 1:
                pagemsg("WARNING: Saw %s=%s and multiple %ss %s: %s" %
                    (trparam, formtr, restparam, ",".join(form), origt))
            t.add(trparam, formtr)

    decl_headparam = None
    decl_genparam = None
    decl_plparam = None
    if declt:
        decl_headparam = "1"
        tn = tname(declt)
        if tn == "be-decl-noun":
            decl_genparam = "3"
            decl_plparam = "2"
        elif tn == "be-decl-noun-unc":
            decl_genparam = "2"
            decl_plparam = "-"
        else:
            decl_genparam = "2"
        # Plural-only genders ("...-p") must go with be-decl-noun-pl and
        # vice versa.
        want_plural = tn == "be-decl-noun-pl"
        for g in genders:
            if g.endswith("-p") != want_plural:
                pagemsg("WARNING: Mismatch between headword gender %s and decl template %s: %s" % (
                    g, unicode(declt), origt))

    handle_multiform("1", "head", head, headtr, decl_headparam)
    blib.set_param_chain(t, genders, "2", "g")
    handle_multiform("3", "gen", gen, gentr, decl_genparam)
    if not getp("3") and pl:
        # Keep 4= positional by inserting an empty 3=.
        t.add("3", "")
    handle_multiform("4", "pl", pl, pltr, decl_plparam)
    handle_multiform("m", "m", m, mtr)
    handle_multiform("f", "f", f, ftr)
    handle_multiform("collective", "collective", collective, collectivetr)

    if origt != unicode(t):
        notes.append("fix up {{%s}} to use new param convention" % tname(t))
        pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return True
def process_page_section(index, page, section, verbose):
    """Bring the noun headword of one page section in line with its declension.

    Nothing is changed unless the section holds exactly one {{ru-noun-table}}
    and exactly one {{ru-noun+}} / {{ru-proper noun+}}. When both are present:

    * g=/m=/f= style parameters are stripped from the declension template;
    * if the headword (minus g/m/f/notrcat) differs from the declension only
      by links, the headword's parameters are copied onto the declension;
    * otherwise, if they differ, the headword is rewritten from the
      declension parameters, with its original g/m/f chains and notrcat=
      restored afterwards.

    Args:
        index: Page index; used only as a prefix in log messages.
        page: Page object; only its title() and exists() are consulted.
        section: Wikitext of the section to process.
        verbose: Enables extra logging and is forwarded to blib.expand_text().

    Returns:
        None when the section must be skipped (missing page, conflicting or
        duplicated templates, template expansion failure, or a rewrite that
        would erase links). Otherwise a 5-tuple
        (new_text, ru_noun_table_cleaned, ru_noun_table_link_copied,
        ru_noun_changed, ru_proper_noun_changed) whose last four items are
        0/1 flags.
    """
    pagetitle = unicode(page.title())

    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    def expand_text(tempcall):
        return blib.expand_text(tempcall, pagetitle, pagemsg, verbose)

    if not page.exists():
        pagemsg("WARNING: Page doesn't exist, skipping")
        return None

    parsed = blib.parse_text(section)

    # One scan is enough: no template is modified until every skip check
    # below has passed.
    templates = list(parsed.filter_templates())

    def templates_named(*wanted):
        return [t for t in templates if unicode(t.name) in wanted]

    if templates_named("ru-decl-noun-see"):
        pagemsg("Found ru-decl-noun-see, skipping")
        return None

    noun_table_templates = templates_named("ru-noun-table")
    noun_old_templates = templates_named("ru-noun-old")

    if len(noun_table_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-table templates, skipping")
        return None
    if len(noun_old_templates) > 1:
        pagemsg("WARNING: Found multiple ru-noun-old templates, skipping")
        return None
    if not noun_table_templates:
        if noun_old_templates:
            pagemsg("WARNING: No ru-noun-table templates but found ru-noun-old template(s): %s" %
                    ", ".join(unicode(x) for x in noun_old_templates))
        return unicode(parsed), 0, 0, 0, 0

    if templates_named("ru-noun", "ru-proper noun"):
        pagemsg("Found ru-noun or ru-proper noun, skipping")
        return None

    headword_templates = templates_named("ru-noun+", "ru-proper noun+")
    if len(headword_templates) > 1:
        pagemsg("WARNING: Found multiple headword templates, skipping")
        return None
    if not headword_templates:
        return unicode(parsed), 0, 0, 0, 0

    noun_table_template = noun_table_templates[0]
    noun_old_template = noun_old_templates[0] if noun_old_templates else None
    headword_template = headword_templates[0]

    if verbose:
        pagemsg("Found headword template: %s" % unicode(headword_template))
        pagemsg("Found decl template: %s" % unicode(noun_table_template))
        if noun_old_template:
            pagemsg("Found old decl template: %s" % unicode(noun_old_template))

    orig_headword_template = unicode(headword_template)
    orig_noun_table_template = unicode(noun_table_template)

    # Headword-only parameters; saved so they can be restored if the
    # headword is rebuilt from the declension parameters.
    genders = blib.fetch_param_chain(headword_template, "g", "g")
    masculines = blib.fetch_param_chain(headword_template, "m", "m")
    feminines = blib.fetch_param_chain(headword_template, "f", "f")
    notrcat = getparam(headword_template, "notrcat")

    gmf_param_re = "^[gmf][0-9]*$"

    # Copy of the headword without the headword-only parameters, always
    # named ru-noun+ so it can be compared against the declension.
    filtered_headword_template = blib.parse_text("{{ru-noun+}}").filter_templates()[0]
    for param in headword_template.params:
        name = unicode(param.name)
        if re.search(gmf_param_re, name) or name == "notrcat":
            continue
        filtered_headword_template.add(param.name, param.value)

    ru_noun_table_link_copied = 0
    ru_noun_changed = 0
    ru_proper_noun_changed = 0

    # Rebuild the declension template without g=/m=/f= parameters.
    kept_decl_params = []
    for param in noun_table_template.params:
        if re.search(gmf_param_re, unicode(param.name)):
            pagemsg("WARNING: Found g=, m= or f= in noun-table, removing: %s" %
                    unicode(noun_table_template))
        else:
            kept_decl_params.append((param.name, param.value))
    del noun_table_template.params[:]
    for name, value in kept_decl_params:
        noun_table_template.add(name, value)
    ru_noun_table_cleaned = int(orig_noun_table_template != unicode(noun_table_template))

    # Working copy of the declension that becomes the candidate headword.
    modified_noun_table_template = blib.parse_text("{{ru-noun-table}}").filter_templates()[0]
    for param in noun_table_template.params:
        modified_noun_table_template.add(param.name, param.value)

    if unicode(headword_template.name) == "ru-proper noun+":
        generate_template = re.sub(r"^\{\{ru-noun-table", "{{ru-generate-noun-args",
                                   unicode(noun_table_template))
        generate_result = expand_text(generate_template)
        if not generate_result:
            pagemsg("WARNING: Error generating noun args, skipping")
            return None
        args = blib.split_generate_args(generate_result)

        # If proper noun and n is both then we need to add n=both because
        # proper noun+ defaults to n=sg
        if args["n"] == "b" and not getparam(modified_noun_table_template, "n"):
            pagemsg("Adding n=both to headword template")
            modified_noun_table_template.add("n", "both")
        # Correspondingly, if n is sg then we can usually remove n=sg;
        # but we need to check that the number is actually sg with n=sg
        # removed because of the possibility of plurale tantum lemmas
        if args["n"] == "s":
            # Append ndef=sg before the closing braces only; a plain
            # str.replace() would also rewrite the "}}" of any template
            # nested inside the call.
            generate_template_with_ndef = re.sub(r"\}\}\Z", "|ndef=sg}}", generate_template)
            generate_template_with_ndef = re.sub(r"\|n=s[^=|{}]*", "", generate_template_with_ndef)
            generate_result = expand_text(generate_template_with_ndef)
            if not generate_result:
                pagemsg("WARNING: Error generating noun args, skipping")
                return None
            ndef_args = blib.split_generate_args(generate_result)
            if ndef_args["n"] == "s":
                existing_n = getparam(headword_template, "n")
                if existing_n and not re.search(r"^s", existing_n):
                    pagemsg("WARNING: Something wrong: Found n=%s, not singular" % existing_n)
                pagemsg("Removing n=sg from headword template")
                rmparam(modified_noun_table_template, "n")
            else:
                pagemsg("WARNING: Unable to remove n= from headword template because n=%s" %
                        ndef_args["n"])

    new_headword_template = re.sub(r"^\{\{ru-noun-table", "{{ru-noun+",
                                   unicode(modified_noun_table_template))
    existing_filtered_headword_template = unicode(filtered_headword_template)
    change_existing_headword = False
    if existing_filtered_headword_template != new_headword_template:
        if "[" in existing_filtered_headword_template and "[" not in new_headword_template:
            if blib.remove_links(existing_filtered_headword_template) == new_headword_template:
                pagemsg("Headword has links but decl doesn't and they're otherwise the same, copying headword to decl")
                del noun_table_template.params[:]
                for param in filtered_headword_template.params:
                    noun_table_template.add(param.name, param.value)
                ru_noun_table_link_copied = 1
                ru_noun_table_cleaned = 0
            else:
                pagemsg("WARNING: Existing headword template %s would be overwritten with %s but links would be erased, not doing it, check manually"
                        % (existing_filtered_headword_template, new_headword_template))
                return None
        else:
            pagemsg("WARNING: Existing headword template %s will be overwritten with %s"
                    % (existing_filtered_headword_template, new_headword_template))
            change_existing_headword = True

    if change_existing_headword:
        del headword_template.params[:]
        for param in modified_noun_table_template.params:
            headword_template.add(param.name, param.value)
        blib.set_param_chain(headword_template, genders, "g", "g")
        blib.set_param_chain(headword_template, masculines, "m", "m")
        blib.set_param_chain(headword_template, feminines, "f", "f")
        if notrcat:
            headword_template.add("notrcat", notrcat)

    final_noun_table_template = unicode(noun_table_template)
    if final_noun_table_template != orig_noun_table_template:
        pagemsg("Replacing noun table %s with %s" %
                (orig_noun_table_template, final_noun_table_template))

    final_headword_template = unicode(headword_template)
    if final_headword_template != orig_headword_template:
        pagemsg("Replacing headword %s with %s" %
                (orig_headword_template, final_headword_template))
        if unicode(headword_template.name) == "ru-noun+":
            ru_noun_changed = 1
        else:
            ru_proper_noun_changed = 1

    return (unicode(parsed), ru_noun_table_cleaned, ru_noun_table_link_copied,
            ru_noun_changed, ru_proper_noun_changed)
def process_text_on_page(index, pagetitle, text):
    """Normalize the pronunciation arguments of each {{it-IPA}} call in `text`.

    For every pronunciation (the explicit parameter chain starting at 1=, or
    the page title when none is given) this:

    * adds a stress mark when neither AC nor GR is present and the word has
      more than one vowel;
    * rewrites each run of "z" not already preceded by "d" as ts/tts or
      dz/ddz, honouring voiced= when present, and then drops voiced=;
    * normalizes the ligatures ʦ/ʣ, the -izzare ending and the stress marks
      on a/i/u.

    A pronunciation list that ends up equal to just the page title is removed
    so the template falls back to its default.

    Args:
        index: Page index; used only as a prefix in log messages.
        pagetitle: Title of the page; pages whose title contains ":" are skipped.
        text: Wikitext of the page.

    Returns:
        None when the page is skipped (no "it-IPA" in the text, or a title
        containing ":"); otherwise a tuple (new_text, notes) where notes is
        the list of change descriptions.
    """
    def pagemsg(txt):
        msg("Page %s %s: %s" % (index, pagetitle, txt))

    if "it-IPA" not in text:
        return

    if ":" in pagetitle:
        pagemsg("Skipping non-mainspace title")
        return

    pagemsg("Processing")

    notes = []
    parsed = blib.parse_text(text)

    # Loop-invariant patterns and lookup tables, built once per page.
    # NOTE(review): AC, GR, CFLEX, vowel, vowel_c, not_vowel_c and stress_c
    # are module-level names defined outside this function -- presumably
    # combining accents and regex character classes; confirm there.
    default_stress_re = ("(" + vowel_c + ")(" + not_vowel_c + "*[iyu]?" +
                         vowel_c + not_vowel_c + "*)$")
    i_glide_re = "i(" + vowel_c + ")"
    u_glide_re = "u(" + vowel_c + ")"
    initial_voiceless_re = "^[ij]" + stress_c + "?" + vowel_c
    initial_voiced_re = "^" + vowel_c + stress_c + "?" + vowel_c
    vowel_before_re = vowel_c + stress_c + "?$"
    vowel_after_re = "^" + vowel_c
    i_cflex_re = "i" + CFLEX
    z_to_voiced = {"z": "dz", "zz": "ddz"}
    z_to_voiceless = {"z": "ts", "zz": "tts"}

    for t in parsed.filter_templates():
        if tname(t) != "it-IPA":
            continue
        origt = unicode(t)
        pagemsg("Saw %s" % origt)
        if getparam(t, "voiced2"):
            pagemsg("WARNING: Can't yet handle voiced2=%s" % getparam(t, "voiced2"))
            continue
        specified_pronuns = blib.fetch_param_chain(t, "1", "")
        pronuns = specified_pronuns or [pagetitle]
        frobbed_pronuns = []
        must_continue = False
        for ipa in pronuns:
            # Work on the decomposed form so accents are separate characters.
            ipa = unicodedata.normalize("NFD", ipa)
            if AC not in ipa and GR not in ipa:
                vowel_count = len([x for x in ipa if x in vowel])
                if vowel_count == 1:
                    pagemsg("WARNING: Single-vowel word")
                if vowel_count > 1:
                    new_ipa = re.sub(
                        default_stress_re,
                        lambda m: m.group(1) + (AC if m.group(1) in u"eoɛɔ" else GR) + m.group(2),
                        ipa)
                    if new_ipa == ipa:
                        pagemsg("WARNING: Unable to add stress: %s" % ipa)
                    else:
                        notes.append(unicodedata.normalize(
                            "NFC",
                            "add stressed form %s to defaulted {{it-IPA}} pronun" % new_ipa))
                        ipa = new_ipa
            if "z" in ipa:
                # The "frobbed" copy turns i/u before a vowel into glides so
                # the context checks below see them as non-syllabic; the
                # replacements themselves are applied to the unfrobbed split.
                frobbed_ipa = re.sub(i_glide_re, r"j\1", ipa)
                frobbed_ipa = re.sub(u_glide_re, r"w\1", frobbed_ipa)
                split_frobbed_ipa = re.split("(z+)", frobbed_ipa)
                split_z = re.split("(z+)", ipa)
                voiced = getparam(t, "voiced")
                if voiced not in ["y", "yes", "1", ""]:
                    pagemsg("WARNING: Unrecognized voiced=%s" % voiced)
                    must_continue = True
                    break
                # Odd indices of the split are the runs of z.
                for i in xrange(1, len(split_z), 2):
                    if split_z[i - 1].endswith("d"):
                        continue # already converted appropriately
                    default_voiced = False
                    if voiced in ["y", "yes"] or i == 1 and voiced == "1":
                        default_voiced = True
                    elif i == 1 and split_frobbed_ipa[0] == "":
                        # Word-initial z.
                        if re.search(initial_voiceless_re, split_frobbed_ipa[2]):
                            default_voiced = False
                        elif re.search(initial_voiced_re, split_frobbed_ipa[2]):
                            default_voiced = True
                    else:
                        # Single z between vowels.
                        if (split_frobbed_ipa[i] == "z" and
                                re.search(vowel_before_re, split_frobbed_ipa[i - 1]) and
                                re.search(vowel_after_re, split_frobbed_ipa[i + 1])):
                            default_voiced = True
                        if re.search(i_cflex_re, split_frobbed_ipa[i + 1]):
                            default_voiced = False
                    z_map = z_to_voiced if default_voiced else z_to_voiceless
                    split_z[i] = z_map.get(split_z[i], split_z[i])
                new_ipa = "".join(split_z)
                if new_ipa != ipa:
                    notes.append(unicodedata.normalize(
                        "NFC",
                        "convert z to ts or dz in %s -> %s in {{it-IPA}}" % (ipa, new_ipa)))
                    ipa = new_ipa
            new_ipa = ipa.replace(u"ʦ", "ts")
            new_ipa = new_ipa.replace(u"ʣ", "dz")
            if new_ipa != ipa:
                notes.append(u"normalize ʦ/ʣ to ts/dz in {{it-IPA}} pronun")
                ipa = new_ipa
            # The remaining rewrites match precomposed accented letters.
            ipa = unicodedata.normalize("NFC", ipa)
            # module special-cases -izzare
            new_ipa = re.sub(u"iddz[àá]re", "izzare", ipa)
            if new_ipa != ipa:
                notes.append(u"normalize -iddzàre to -izzare in {{it-IPA}}")
                ipa = new_ipa
            new_ipa = ipa.replace(u"á", u"à").replace(u"í", u"ì").replace(u"ú", u"ù")
            if new_ipa != ipa:
                notes.append(unicodedata.normalize(
                    "NFC", u"normalize stress in %s in {{it-IPA}}" % ipa))
                ipa = new_ipa
            frobbed_pronuns.append(ipa)
        if must_continue:
            # Unrecognized voiced= value: leave this template untouched.
            continue
        if frobbed_pronuns == [pagetitle]:
            frobbed_pronuns = []
            if specified_pronuns:
                notes.append(
                    "remove explicitly specified pronun in {{it-IPA}} because same as page title")
        blib.set_param_chain(t, frobbed_pronuns, "1", "")
        if t.has("voiced"):
            rmparam(t, "voiced")
            notes.append("remove voiced= in {{it-IPA}}")
        if origt != unicode(t):
            pagemsg("Replaced %s with %s" % (origt, unicode(t)))

    return unicode(parsed), notes